Source code for vivainsights.create_odds_ratios

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This module calculates odds ratios for ordinal metrics against a specified metric, and provides a helper for converting ordinal metrics into favorability categories.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def create_odds_ratios(data: pd.DataFrame, ord_metrics: list, metric: str, return_type: str = 'table'):
    """
    Name
    ----
    create_odds_ratios

    Description
    -----------
    Calculates odds ratios for ordinal metrics against a specified metric.

    For each ordinal metric, a contingency table of `metric` (rows) by the
    ordinal metric (columns) is built and 0.5 is added to every cell so that
    empty cells do not cause a division by zero. Each row is then converted
    to proportions, and every level's proportion is divided by the proportion
    of the first level column in the same row.

    Parameters
    ----------
    data : pandas dataframe
        A Person Query dataset in the form of a pandas DataFrame.
    ord_metrics : list
        List of strings referring to the column names of the ordinal variables.
        Must contain at least one column name.
    metric : str
        Name of the variable to calculate proportional odds against the
        `ord_metrics`.
    return_type : str, optional
        Specifies what to return. Defaults to 'table'.
        - 'table': Returns a data frame with the odds ratio table.
        - 'plot': Returns a plot for visualizing the odds ratio.

    Returns
    -------
    pandas DataFrame or matplotlib Figure
        Depending on the value of `return_type`, either a table or a plot is
        returned. The table has one row per combination of `metric` value and
        ordinal level, with the columns `<metric>`, 'Level', 'Odds_Ratio',
        'Ordinal_Metric' and 'n'. 'n' is the number of distinct person
        identifiers observed for that combination (0 when the combination
        does not occur in the data). Rows are not sorted by odds ratio; they
        follow the order of `ord_metrics`, then level, then `metric` value.

    Raises
    ------
    ValueError
        If `ord_metrics` is not a list or is empty, if `metric` or any of
        `ord_metrics` is not a column of `data`, or if `return_type` is not
        'table' or 'plot'.

    Example
    -------
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_odds_ratios(data=pq_data, ord_metrics=["Engagement_Score", "Satisfaction_Score"], metric="Copilot_Usage", return_type="table")
    """
    # Validate inputs (same precedence as before for the column checks)
    if not isinstance(ord_metrics, list):
        raise ValueError("`ord_metrics` must be a list of column names.")

    if metric not in data.columns:
        raise ValueError(f"Metric '{metric}' not found in data.")

    for ord_metric in ord_metrics:
        if ord_metric not in data.columns:
            raise ValueError(f"Ordinal metric '{ord_metric}' not found in data.")

    # Fail fast: reject bad arguments before doing any computation
    if return_type not in ("table", "plot"):
        raise ValueError("Invalid `return_type`. Choose 'table' or 'plot'.")

    if not ord_metrics:
        raise ValueError("`ord_metrics` must contain at least one column name.")

    # The identifier column does not depend on the ordinal metric, so resolve
    # it once. Falls back to the first column when 'PersonId' is absent.
    person_id_col = "PersonId" if "PersonId" in data.columns else data.columns[0]

    odds_ratios = []

    for ord_metric in ord_metrics:
        # Contingency table with 0.5 added to each cell to avoid division by zero
        contingency_table = pd.crosstab(data[metric], data[ord_metric]) + 0.5

        # Row-wise proportions for each level of the ordinal metric
        proportions = contingency_table.div(contingency_table.sum(axis=1), axis=0)

        # Ratio of every level against the first level within the same row
        ratios = proportions.div(proportions.iloc[:, 0], axis=0)

        # Reshape to long format: one row per (metric value, level)
        ratios_long = ratios.reset_index().melt(
            id_vars=[metric],
            var_name="Level",
            value_name="Odds_Ratio",
        )
        ratios_long["Ordinal_Metric"] = ord_metric

        # n: count of distinct person identifiers per (metric value, level)
        n_counts = (
            data
            .assign(Level=data[ord_metric])
            .groupby([metric, "Level"])[person_id_col]
            .nunique()
            .reset_index(name="n")
        )

        ratios_long = ratios_long.merge(n_counts, how="left", on=[metric, "Level"])

        # Combinations that never occur have no group to merge from; their
        # count is zero, not missing, and the column should stay integral.
        ratios_long["n"] = ratios_long["n"].fillna(0).astype(int)

        odds_ratios.append(ratios_long)

    # Combine all results into a single DataFrame
    odds_ratios_df = pd.concat(odds_ratios, ignore_index=True)

    if return_type == "table":
        return odds_ratios_df

    # return_type == "plot": bar plot for visualizing odds ratios
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(
        data=odds_ratios_df,
        x="Odds_Ratio",
        y="Ordinal_Metric",
        hue="Level",
        ax=ax,
        palette="Blues_d",
    )
    ax.set_title("Odds Ratios for Ordinal Metrics")
    ax.set_xlabel("Odds Ratio")
    ax.set_ylabel("Ordinal Metric")
    return fig
def compute_fav(data: pd.DataFrame, ord_metrics: list, item_options: int = 5, fav_threshold: int = 70, unfav_threshold: int = 40, drop_neutral: bool = True):
    """
    Name
    ----
    compute_fav

    Description
    -----------
    Converts ordinal variables into categorical variables with favorable and
    unfavorable scores. The input DataFrame is not modified; the derived
    columns are added to a copy.

    Parameters
    ----------
    data : pandas dataframe
        A dataset containing the ordinal variables.
    ord_metrics : list
        List of strings referring to the column names of the ordinal variables.
    item_options : int, optional
        Number of options in the ordinal metrics. Must be greater than 1.
        Default is 5.
    fav_threshold : int, optional
        Threshold for favorable scores (in 100-point scale). Scores strictly
        above it are 'fav'. Default is 70.
    unfav_threshold : int, optional
        Threshold for unfavorable scores (in 100-point scale). Scores strictly
        below it are 'unfav'. Default is 40.
    drop_neutral : bool, optional
        Whether to drop neutral scores. Default is True.

    Returns
    -------
    pandas DataFrame
        The returned DataFrame includes all original columns, plus for each
        ordinal metric:
        - '<metric>_100': the metric rescaled to a 100-point scale
        - '<metric>_fav': the favorability category ('fav', 'unfav', or 'neu')
        If `drop_neutral` is True, rows with a neutral score on any of the
        ordinal metrics are removed. Missing scores compare as neither
        favorable nor unfavorable and are therefore categorised as 'neu'.

    Raises
    ------
    ValueError
        If `ord_metrics` is not a list, if any of `ord_metrics` is not a
        column of `data`, or if `item_options` is not greater than 1.

    Example
    -------
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.compute_fav(data=pq_data, ord_metrics=["eSat", "Initiative"], item_options=5, fav_threshold=70, unfav_threshold=40, drop_neutral=True)
    """
    # Validate inputs
    if not isinstance(ord_metrics, list):
        raise ValueError("`ord_metrics` must be a list of column names.")

    for ord_metric in ord_metrics:
        if ord_metric not in data.columns:
            raise ValueError(f"Ordinal metric '{ord_metric}' not found in data.")

    # A single-option scale cannot be rescaled (division by zero below)
    if item_options <= 1:
        raise ValueError("`item_options` must be greater than 1.")

    # Work on a copy so the caller's DataFrame is left untouched
    result = data.copy()
    scale = 100 / (item_options - 1)
    fav_columns = []

    # Convert ordinal metrics to a 100-point scale and categorize into favorability
    for ord_metric in ord_metrics:
        score_col = f"{ord_metric}_100"
        fav_col = f"{ord_metric}_fav"

        result[score_col] = (result[ord_metric] - 1) * scale
        scores = result[score_col]

        # First matching condition wins, so 'fav' takes precedence over
        # 'unfav' exactly as in a fav-then-unfav conditional chain.
        result[fav_col] = np.select(
            [
                (scores > fav_threshold).to_numpy(),
                (scores < unfav_threshold).to_numpy(),
            ],
            ["fav", "unfav"],
            default="neu",
        )
        fav_columns.append(fav_col)

    # Row classification is independent per metric, so filtering once at the
    # end removes the same rows as filtering after each metric would.
    if drop_neutral and fav_columns:
        keep = (result[fav_columns] != "neu").all(axis=1)
        result = result[keep]

    return result