Source code for pe.embedding.tabular.tabular_embedding

import pandas as pd
import numpy as np

from pe.embedding import Embedding
from pe.logging import execution_logger
from pe.constant.data import TABULAR_DATA_COLUMN_NAME
from pe.data.tabular.tabular_csv import TabularColumnType


class TabularEmbedding(Embedding):
    """Compute the tabular embedding.

    Every numerical feature column contributes one min-max-normalized dimension scaled by ``num_weight``; every
    categorical feature column contributes a one-hot-like group of dimensions whose active entry equals
    ``cat_weight``. The per-column pieces are concatenated, numerical columns first.
    """

    def __init__(self, info, cat_weight=1 / 3, num_weight=1):
        """Constructor.

        :param info: The information (categories and numerical bounds) of the private data. It is indexed by column
            name; each entry is read for a ``"type"`` key plus ``"min"``/``"max"`` (numerical columns) or
            ``"categories"`` (categorical columns)
        :type info: dict
        :param cat_weight: The weight for the categorical columns, defaults to 1/3
        :type cat_weight: float, optional
        :param num_weight: The weight for the numerical columns, defaults to 1
        :type num_weight: float, optional
        """
        super().__init__()
        self._info = info
        self._cat_weight = cat_weight
        self._num_weight = num_weight

    def _embed_numerical_column(self, features_df, col):
        """Return the weighted min-max normalization of one numerical column as an ``(n, 1)`` array.

        :param features_df: The data frame holding one column per feature
        :type features_df: pandas.DataFrame
        :param col: The name of the numerical column to embed
        :type col: str
        :raises ValueError: If ``info`` has no integer/float entry for ``col``
        :return: The embedding block of the column, one row per sample
        :rtype: numpy.ndarray
        """
        # Validate before touching the data so a missing info entry is always reported as a ValueError.
        if col not in self._info or self._info[col]["type"] not in [
            TabularColumnType.INTEGER,
            TabularColumnType.FLOAT,
        ]:
            raise ValueError(f"Tabular Embedding: No info for numerical column {col}, cannot proceed.")

        col_values = features_df[col].values
        min_val = self._info[col]["min"]
        max_val = self._info[col]["max"]
        value_range = max_val - min_val
        if value_range == 0:
            # A constant column (min == max) would divide by zero and put NaN/inf into the embedding. Such a
            # column carries no information, so it contributes a constant zero dimension instead.
            normalized = np.zeros(len(col_values), dtype=float)
        else:
            normalized = (col_values - min_val) * self._num_weight / value_range
        return normalized.reshape(-1, 1)

    def _embed_categorical_column(self, features_df, col):
        """Return the one-hot-like encoding of one categorical column as an ``(n, num_categories)`` array.

        :param features_df: The data frame holding one column per feature
        :type features_df: pandas.DataFrame
        :param col: The name of the categorical column to embed
        :type col: str
        :raises ValueError: If ``info`` has no categorical entry for ``col``
        :return: The embedding block of the column, one row per sample
        :rtype: numpy.ndarray
        """
        if col not in self._info or self._info[col]["type"] != TabularColumnType.CATEGORICAL:
            raise ValueError(f"Tabular Embedding: No info for categorical column {col}, cannot proceed.")

        categories = self._info[col]["categories"]
        num_categories = len(categories)
        col_values = features_df[col].values
        num_samples = len(col_values)

        # Get indices for each sample using vectorized lookup.
        category_to_idx = {cat: idx for idx, cat in enumerate(categories)}
        # Values that are not among the known categories map to NaN and are then assigned to index 0, i.e. they
        # are encoded exactly like the first category.
        indices = pd.Series(col_values).map(category_to_idx).fillna(0).astype(int).values

        # Create one-hot-like vectors: the active entry holds the categorical weight rather than 1.
        one_hot = np.zeros((num_samples, num_categories))
        one_hot[np.arange(num_samples), indices] = self._cat_weight
        return one_hot

    def compute_embedding(self, data):
        """Compute the tabular embedding.

        (the embedding is computed using the features only, not the labels)
        Vectorization per column is implemented to improve the performance.

        Only the rows returned by ``filter_uncomputed_rows`` are embedded; the result is combined with ``data`` via
        ``merge_computed_rows``. A numerical column whose ``min`` equals its ``max`` contributes a zero dimension.

        :param data: The data object containing the tabular data
        :type data: :py:class:`pe.data.Data`
        :raises ValueError: If a numerical or categorical column has no matching entry in ``info``
        :return: The data object with the computed embedding
        :rtype: :py:class:`pe.data.Data`
        """
        uncomputed_data = self.filter_uncomputed_rows(data)
        if len(uncomputed_data.data_frame) == 0:
            execution_logger.info(f"Embedding: {self.column_name} already computed")
            return data
        execution_logger.info(
            f"Embedding: computing {self.column_name} for {len(uncomputed_data.data_frame)}/{len(data.data_frame)}"
            " samples"
        )

        cat_columns = data.metadata["cat_columns"]
        num_columns = data.metadata["int_columns"] + data.metadata["float_columns"]
        feature_columns = data.metadata["feature_columns"]

        # Expand the per-row feature sequences into one data frame column per feature.
        features_list = uncomputed_data.data_frame[TABULAR_DATA_COLUMN_NAME].tolist()
        features_df = pd.DataFrame(features_list, columns=feature_columns)

        # Build embedding vectors: numerical blocks first, then categorical blocks.
        embedding_vectors = []
        for col in num_columns:
            embedding_vectors.append(self._embed_numerical_column(features_df, col))
        for col in cat_columns:
            embedding_vectors.append(self._embed_categorical_column(features_df, col))

        # Concatenate all vectors
        embeddings = np.concatenate(embedding_vectors, axis=1)

        # Convert to list and store: one 1-D array per row, aligned on the original index.
        uncomputed_data.data_frame[self.column_name] = pd.Series(
            list(embeddings), index=uncomputed_data.data_frame.index
        )
        execution_logger.info(
            f"Embedding: finished computing {self.column_name} for "
            f"{len(uncomputed_data.data_frame)}/{len(data.data_frame)} samples"
        )
        return self.merge_computed_rows(data, uncomputed_data)