Source code for pe.data.tabular.tabular_csv

from pe.data import Data
import pandas as pd
import numpy as np
import json
import requests
from pe.constant.data import LABEL_ID_COLUMN_NAME
from pe.constant.data import TABULAR_DATA_COLUMN_NAME
import math
from enum import Enum


[docs] class TabularColumnType(Enum): """The type of the tabular column.""" CATEGORICAL = "cat" INTEGER = "int" FLOAT = "float"
[docs] class TabularCSV(Data): """The tabular dataset in CSV format."""
[docs] def __init__(self, csv_path, metadata_path): """Constructor. :param csv_path: The path to the CSV file or the URL to the CSV file :type csv_path: str :param metadata_path: The path to the metadata file or the URL to the metadata file :type metadata_path: str """ raw_tab_df = pd.read_csv(csv_path) # pd.reac_csv can work with both local path and http path if "https://" in metadata_path: metadata = json.loads(requests.get(metadata_path).text) # read metadata from http path else: with open(metadata_path, "r") as file: metadata = json.load(file) # read metadata from local path cat_columns = metadata["cat_columns"] int_columns = metadata["int_columns"] float_columns = metadata["float_columns"] label_columns = metadata["label_columns"] # Create label IDs following the same logic as TextCSV labels = raw_tab_df.apply(lambda row: tuple([row[col] for col in label_columns]), axis=1).tolist() label_set = list(sorted(set(labels))) label_id_map = {label: i for i, label in enumerate(label_set)} label_ids = [label_id_map[label] for label in labels] def to_builtin(x): if pd.isna(x): return None if isinstance(x, np.generic): return x.item() return x label_info = [ { "name": " | ".join(f"{label_columns[i]}: {to_builtin(label[i])}" for i in range(len(label_columns))), "column_values": {label_columns[i]: to_builtin(label[i]) for i in range(len(label_columns))}, } for label in label_set ] # Merge tabular features into a single column feature_columns = cat_columns + int_columns + float_columns merged_features = raw_tab_df[feature_columns].apply(lambda row: row.tolist(), axis=1) data_frame = pd.DataFrame( { TABULAR_DATA_COLUMN_NAME: merged_features, LABEL_ID_COLUMN_NAME: label_ids, } ) metadata = { "label_columns": label_columns, "label_info": label_info, "cat_columns": cat_columns, "int_columns": int_columns, "float_columns": float_columns, "feature_columns": feature_columns, } super().__init__(data_frame=data_frame, metadata=metadata)
[docs] def get_tab_info(self): """Get the information of the private data. :return: The information (categories and numerical bounds) of the private data :rtype: dict """ info = {} features_columns = self.metadata["feature_columns"] features_df = pd.DataFrame(self.data_frame[TABULAR_DATA_COLUMN_NAME].tolist(), columns=features_columns) for column in features_columns: if column in self.metadata["cat_columns"]: info[column] = { "categories": list(features_df[column].unique()), "type": TabularColumnType.CATEGORICAL, } elif column in self.metadata["int_columns"]: info[column] = { "min": math.floor(features_df[column].min()), "max": math.ceil(features_df[column].max()), "type": TabularColumnType.INTEGER, } elif column in self.metadata["float_columns"]: info[column] = { "min": features_df[column].min(), "max": features_df[column].max(), "type": TabularColumnType.FLOAT, } return info