Source code for pe.data.image.digiface1m

import pandas as pd
import os
import numpy as np
from PIL import Image
import glob
from tqdm.contrib.concurrent import process_map

from pe.data import Data
from pe.constant.data import LABEL_ID_COLUMN_NAME
from pe.constant.data import IMAGE_DATA_COLUMN_NAME


class DigiFace1M(Data):
    """The DigiFace1M dataset (https://github.com/microsoft/DigiFace1M/)."""

    def __init__(self, root_dir, res=32, num_processes=50, chunksize=1000):
        """Constructor.

        :param root_dir: The root directory of the dataset
        :type root_dir: str
        :param res: The resolution of the images, defaults to 32
        :type res: int, optional
        :param num_processes: The number of processes to use for parallel processing, defaults to 50
        :type num_processes: int, optional
        :param chunksize: The chunk size to use for parallel processing, defaults to 1000
        :type chunksize: int, optional
        :raises ValueError: If the number of images in ``root_dir`` is not 1,219,995
        """
        self._res = res
        files = glob.glob(os.path.join(root_dir, "*", "*.png"))
        if len(files) != 1219995:
            raise ValueError(
                f"Expected 1,219,995 images, but found {len(files)}. Please download the dataset from "
                "https://github.com/microsoft/DigiFace1M, unzip the images into a folder, and pass the path to the "
                "folder as ``root_dir``."
            )
        images = process_map(self._read_and_process_image, files, max_workers=num_processes, chunksize=chunksize)
        data_frame = pd.DataFrame(
            {
                IMAGE_DATA_COLUMN_NAME: images,
                LABEL_ID_COLUMN_NAME: [0] * len(images),
            }
        )
        metadata = {"label_info": [{"name": "None"}]}
        super().__init__(data_frame=data_frame, metadata=metadata)

    def _read_and_process_image(self, path):
        """Read and process an image.

        :param path: The path to the image
        :type path: str
        :return: The processed image
        :rtype: np.ndarray
        """
        image = Image.open(path)
        image = image.convert("RGB")
        image = image.resize((self._res, self._res))
        return np.array(image)
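
A minimal usage sketch (not part of the module source). It assumes the DigiFace1M archives have been downloaded and unzipped into a local folder, shown here as the placeholder path "/path/to/digiface1m", and that the resulting object exposes the loaded samples through a ``data_frame`` attribute as elsewhere in the ``pe.data`` API:

from pe.data.image.digiface1m import DigiFace1M
from pe.constant.data import IMAGE_DATA_COLUMN_NAME

# Load the dataset at 32x32 resolution, reading images with 50 worker processes.
# The constructor raises ValueError unless exactly 1,219,995 PNG files are found.
data = DigiFace1M(root_dir="/path/to/digiface1m", res=32, num_processes=50, chunksize=1000)

# Each row holds one image as a (32, 32, 3) uint8 array and a dummy label id of 0.
first_image = data.data_frame[IMAGE_DATA_COLUMN_NAME].iloc[0]
print(first_image.shape)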