Source code for pe.embedding.tabular.tabular_embedding

import pandas as pd
import numpy as np

from pe.embedding import Embedding
from pe.logging import execution_logger
from pe.constant.data import TABULAR_DATA_COLUMN_NAME
from pe.data.tabular.tabular_csv import TabularColumnType



[docs]
class TabularEmbedding(Embedding):
    """Compute the tabular embedding.

    Every numerical feature column contributes one dimension: the value is min-max normalized with the bounds
    stored in ``info`` and scaled by ``num_weight``. Every categorical feature column contributes one block with
    as many dimensions as the column has categories; the entry of the sample's category is set to ``cat_weight``
    and all other entries are zero. The blocks are concatenated, numerical columns first.
    """

    def __init__(self, info, cat_weight=1 / 3, num_weight=1):
        """Constructor.

        :param info: The information (categories and numerical bounds) of the private data. For each column name
            it is read as ``info[col]["type"]`` plus ``"min"``/``"max"`` (numerical columns) or ``"categories"``
            (categorical columns)
        :type info: dict
        :param cat_weight: The weight for the categorical columns, defaults to 1/3
        :type cat_weight: float, optional
        :param num_weight: The weight for the numerical columns, defaults to 1
        :type num_weight: float, optional
        """
        super().__init__()
        self._info = info
        self._cat_weight = cat_weight
        self._num_weight = num_weight

    def _embed_numerical_column(self, col, col_values):
        """Min-max normalize one numerical column and scale it by the numerical weight.

        :param col: The name of the column
        :type col: str
        :param col_values: The values of the column, one per sample
        :type col_values: np.ndarray
        :raises ValueError: If ``info`` has no integer/float entry for the column
        :return: The embedding block of the column, with shape (number of samples, 1)
        :rtype: np.ndarray
        """
        numerical_types = (TabularColumnType.INTEGER, TabularColumnType.FLOAT)
        if col not in self._info or self._info[col]["type"] not in numerical_types:
            raise ValueError(f"Tabular Embedding: No info for numerical column {col}, cannot proceed.")

        min_val = self._info[col]["min"]
        max_val = self._info[col]["max"]
        value_range = max_val - min_val
        if value_range == 0:
            # A constant column (min == max) would otherwise produce 0/0 = NaN (or x/0 = inf) and poison every
            # distance computed on the embedding. Dividing by 1 keeps the result finite (0 for in-bound values).
            value_range = 1
        normalized = (col_values - min_val) * self._num_weight / value_range
        return normalized.reshape(-1, 1)

    def _embed_categorical_column(self, col, col_values):
        """Encode one categorical column as one-hot-like vectors scaled by the categorical weight.

        Values that are missing or not listed in the categories of the column fall back to index 0, i.e., they
        are encoded as the first category.

        :param col: The name of the column
        :type col: str
        :param col_values: The values of the column, one per sample
        :type col_values: np.ndarray
        :raises ValueError: If ``info`` has no categorical entry for the column
        :return: The embedding block of the column, with shape (number of samples, number of categories)
        :rtype: np.ndarray
        """
        if col not in self._info or self._info[col]["type"] != TabularColumnType.CATEGORICAL:
            raise ValueError(f"Tabular Embedding: No info for categorical column {col}, cannot proceed.")

        categories = self._info[col]["categories"]
        num_samples = len(col_values)

        # Get indices for each sample using vectorized lookup; unmapped values become NaN and then index 0
        category_to_idx = {cat: idx for idx, cat in enumerate(categories)}
        indices = pd.Series(col_values).map(category_to_idx).fillna(0).astype(int).values

        # Create one-hot-like vectors: a single non-zero entry (the categorical weight) per row
        one_hot = np.zeros((num_samples, len(categories)))
        one_hot[np.arange(num_samples), indices] = self._cat_weight
        return one_hot

    def compute_embedding(self, data):
        """Compute the tabular embedding. (the embedding is computed using the features only, not the labels)
        Vectorization per column is implemented to improve the performance.

        Only the rows returned by ``filter_uncomputed_rows`` are processed; the result is merged back with
        ``merge_computed_rows``. The column lists are read from ``data.metadata`` (keys ``"cat_columns"``,
        ``"int_columns"``, ``"float_columns"`` and ``"feature_columns"``).

        :param data: The data object containing the tabular data
        :type data: :py:class:`pe.data.Data`
        :raises ValueError: If a numerical or categorical column has no matching entry in ``info``, or if there
            is no numerical or categorical column to embed
        :return: The data object with the computed embedding
        :rtype: :py:class:`pe.data.Data`
        """
        uncomputed_data = self.filter_uncomputed_rows(data)
        if len(uncomputed_data.data_frame) == 0:
            execution_logger.info(f"Embedding: {self.column_name} already computed")
            return data
        execution_logger.info(
            f"Embedding: computing {self.column_name} for {len(uncomputed_data.data_frame)}/{len(data.data_frame)}"
            " samples"
        )

        cat_columns = data.metadata["cat_columns"]
        num_columns = data.metadata["int_columns"] + data.metadata["float_columns"]
        feature_columns = data.metadata["feature_columns"]

        # Each row stores its features as one sequence; expand them into one data frame column per feature
        features_list = uncomputed_data.data_frame[TABULAR_DATA_COLUMN_NAME].tolist()
        features_df = pd.DataFrame(features_list, columns=feature_columns)

        # Build embedding blocks: numerical columns first, then categorical columns
        embedding_vectors = [self._embed_numerical_column(col, features_df[col].values) for col in num_columns]
        embedding_vectors.extend(self._embed_categorical_column(col, features_df[col].values) for col in cat_columns)

        if not embedding_vectors:
            # np.concatenate would fail with an opaque "need at least one array" error
            raise ValueError("Tabular Embedding: No numerical or categorical columns to embed, cannot proceed.")

        # Concatenate all blocks along the feature axis
        embeddings = np.concatenate(embedding_vectors, axis=1)

        # Store one embedding vector per row, aligned with the index of the uncomputed rows
        uncomputed_data.data_frame[self.column_name] = pd.Series(
            list(embeddings), index=uncomputed_data.data_frame.index
        )

        execution_logger.info(
            f"Embedding: finished computing {self.column_name} for "
            f"{len(uncomputed_data.data_frame)}/{len(data.data_frame)} samples"
        )
        return self.merge_computed_rows(data, uncomputed_data)