# Source code for pe.callback.tabular.classifier

import pandas as pd
from pe.callback.callback import Callback
from pe.constant.data import TABULAR_DATA_COLUMN_NAME
from pe.constant.data import LABEL_ID_COLUMN_NAME
from pe.metric_item import FloatListMetricItem
from pe.logging import execution_logger
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score



# [docs]
class TabClassifier(Callback):
    """Evaluate tabular classification accuracy using a tabular classifier.

    The classifier is trained on the (optionally filtered) synthetic data and scored on the test data given at
    construction time. Accuracy and macro F1 are always reported; AUC is reported only when there are two classes.
    """

    # Installation hint shared by every missing-dependency error message.
    _INSTALL_HINT = '`pip install "private-evolution[tabular] @ git+https://github.com/microsoft/DPSDA.git"`.'

    def __init__(self, test_data, model_name="xgboost", filter_criterion=None):
        """Constructor.

        :param test_data: The test data
        :type test_data: :py:class:`pe.data.Data`
        :param model_name: The classifier model to use ("xgboost", "tabicl" or "tabpfn"), defaults to "xgboost"
        :type model_name: str, optional
        :param filter_criterion: Only computes the metric based on samples satisfying the criterion. None means no
            filtering. Defaults to None
        :type filter_criterion: dict, optional
        :raises ValueError: If ``model_name`` is not one of the supported classifiers
        :raises ImportError: If the package backing the selected classifier is not installed
        """
        self._test_data = test_data
        self._num_classes = len(self._test_data.metadata.label_info)
        self._model_name = model_name
        # Must run after ``_num_classes`` and ``_model_name`` are set: the model set-up reads both.
        self._model = self._get_model()
        self._filter_criterion = filter_criterion
        self._filter_criterion_str = str(filter_criterion).replace(" ", "")
        # The filter suffix is only added for a non-empty criterion so that unfiltered runs keep the short name.
        self._metric_name = f"tabular_classifier_{self._model_name}"
        if filter_criterion:
            self._metric_name += f"_filter_{self._filter_criterion_str}"

    def _get_model(self):
        """Getting the classifier model.

        The backing package is imported lazily, so only the dependency of the selected classifier is required.

        :raises ImportError: If the package backing the selected classifier cannot be imported
        :raises ValueError: If the configured model name is not supported
        :return: A newly constructed classifier
        :rtype: object
        """
        if self._model_name == "xgboost":
            try:
                import xgboost as xgb
            except ImportError as err:
                # Chain the original error so the real import failure stays visible in the traceback.
                raise ImportError(f"XGBoost is not installed. Please install it using {self._INSTALL_HINT}") from err
            if self._num_classes == 2:
                return xgb.XGBClassifier(objective="binary:logistic")
            return xgb.XGBClassifier(objective="multi:softmax", num_class=self._num_classes)

        if self._model_name == "tabicl":
            try:
                from tabicl import TabICLClassifier
            except ImportError as err:
                raise ImportError(
                    f"TabICLClassifier is not installed. Please install it using {self._INSTALL_HINT}"
                ) from err
            return TabICLClassifier()

        if self._model_name == "tabpfn":
            try:
                from tabpfn import TabPFNClassifier
            except ImportError as err:
                raise ImportError(
                    f"TabPFNClassifier is not installed. Please install it using {self._INSTALL_HINT}"
                ) from err
            return TabPFNClassifier()

        raise ValueError(f"Unsupported classifier model: {self._model_name}")

    @staticmethod
    def _to_frame(data, feature_columns):
        """Expand the packed tabular column of ``data`` into one column per feature and attach the label IDs."""
        frame = pd.DataFrame(data.data_frame[TABULAR_DATA_COLUMN_NAME].tolist(), columns=feature_columns)
        frame[LABEL_ID_COLUMN_NAME] = data.data_frame[LABEL_ID_COLUMN_NAME].tolist()
        return frame

    def _encoding(self, syn_data):
        """Encoding categorical and numerical columns.

        Categorical columns (and the label column) are label-encoded; every other column is min-max scaled. Each
        encoder/scaler is fitted on the concatenation of the synthetic and test values, so both frames share the same
        encoding and values that only appear in one of them can still be transformed.

        :param syn_data: The synthetic training data
        :type syn_data: :py:class:`pe.data.Data`
        :return: The encoded synthetic training and test data frames (features plus the label ID column)
        :rtype: tuple[:py:class:`pandas.DataFrame`, :py:class:`pandas.DataFrame`]
        """
        feature_columns = list(self._test_data.metadata["feature_columns"])
        syn_df = self._to_frame(syn_data, feature_columns)
        test_df = self._to_frame(self._test_data, feature_columns)

        # Loop-invariant: computed once instead of being rebuilt for every column.
        categorical_columns = {*syn_data.metadata["cat_columns"], LABEL_ID_COLUMN_NAME}

        for column in [*feature_columns, LABEL_ID_COLUMN_NAME]:
            merged_feature = pd.concat([syn_df[column], test_df[column]])
            if column in categorical_columns:
                encoder = LabelEncoder()
                encoder.fit(merged_feature.values)
                syn_df[column] = encoder.transform(syn_df[column].values)
                test_df[column] = encoder.transform(test_df[column].values)
            else:
                scaler = MinMaxScaler()
                scaler.fit(merged_feature.values.reshape(-1, 1))
                # The scaler works on 2-D input and returns an (n, 1) array; flatten it back to a 1-D column.
                syn_df[column] = scaler.transform(syn_df[column].values.reshape(-1, 1)).ravel()
                test_df[column] = scaler.transform(test_df[column].values.reshape(-1, 1)).ravel()

        return syn_df, test_df

    def __call__(self, syn_data):
        """Evaluate the tabular classifier on the test set.

        The classifier is fitted on the filtered synthetic data and evaluated on the test data. Test accuracy and
        macro F1 are always returned; test AUC is added only for binary classification.

        :param syn_data: The synthetic training data
        :type syn_data: :py:class:`pe.data.Data`
        :return: Classification accuracy metrics, or an empty list if no synthetic sample passes the filter
        :rtype: list[:py:class:`pe.metric_item.FloatListMetricItem`]
        """
        execution_logger.info("Evaluating tabular classifier")
        syn_data = syn_data.filter(self._filter_criterion)
        num_samples = len(syn_data.data_frame)
        execution_logger.info(f"Number of samples after filtering: {num_samples}")
        if num_samples == 0:
            execution_logger.warning(
                f"No samples satisfy the filter criterion {self._filter_criterion_str}. Skipping computation."
            )
            return []

        # Encoding the synthetic training and test data
        syn_df, test_df = self._encoding(syn_data)
        X_train, y_train = syn_df.drop(LABEL_ID_COLUMN_NAME, axis=1).values, syn_df[LABEL_ID_COLUMN_NAME].values
        X_test, y_test = test_df.drop(LABEL_ID_COLUMN_NAME, axis=1).values, test_df[LABEL_ID_COLUMN_NAME].values

        self._model.fit(X_train, y_train)
        y_pred = self._model.predict(X_test)

        test_acc = accuracy_score(y_test, y_pred) * 100
        execution_logger.info(f"Tabular classifier test accuracy: {test_acc:.2f}%")

        # AUC is only computed for binary classification; None marks "not available" for the multi-class case.
        auc = None
        if self._num_classes == 2:
            y_pred_proba = self._model.predict_proba(X_test)
            # Score with the probability the model assigns to the second class (column 1).
            auc = roc_auc_score(y_test, y_pred_proba[:, 1]) * 100
            execution_logger.info(f"Tabular classifier test AUC: {auc:.2f}")

        f1 = f1_score(y_test, y_pred, average="macro") * 100
        execution_logger.info(f"Tabular classifier test (macro) F1 score: {f1:.2f}")

        metric_items = [
            FloatListMetricItem(name=f"{self._metric_name}_test_acc", value=[float(test_acc)]),
            FloatListMetricItem(name=f"{self._metric_name}_test_f1", value=[float(f1)]),
        ]
        if auc is not None:
            metric_items.append(FloatListMetricItem(name=f"{self._metric_name}_test_auc", value=[float(auc)]))
        execution_logger.info(f"Finished evaluating tabular classifier ({self._metric_name})")

        return metric_items