# Source code for vivainsights.create_IV

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wilcoxon
from scipy.stats import mstats
import math
import warnings
from vivainsights.create_bar_asis import *


# Ignore warnings for cleaner output.
# NOTE(review): called with only the "ignore" action and no category/module
# filter, this suppresses every warning, and it runs at import time, so it
# affects the whole process that imports this module -- not just the functions
# defined here. Confirm this global side effect is intended.
warnings.filterwarnings("ignore")

from matplotlib.lines import Line2D

# Optional: reuse vivainsights colors if present.
# `_HIGHLIGHT` is the accent colour used as the default `color` of the header
# decoration drawn by `_add_header_decoration`.
try:
    from vivainsights.color_codes import Colors
    _HIGHLIGHT = Colors.HIGHLIGHT_NEGATIVE.value  # orange
except Exception:
    # Any failure (module missing, enum member missing, ...) falls back to a
    # hard-coded hex colour so that importing this module never fails here.
    _HIGHLIGHT = "#fe7f4f"

from matplotlib.figure import Figure  # add this import at the top
from contextlib import contextmanager

@contextmanager
def _suppress_matplotlib_show():
    """
    Context manager that turns `plt.show` into a no-op for the duration of the
    `with` block and restores the previous `plt.show` on exit, including when
    the block raises.
    """
    saved_show = plt.show

    def _do_nothing(*args, **kwargs):
        # Accept and discard whatever arguments callers pass to plt.show().
        return None

    try:
        plt.show = _do_nothing
        yield
    finally:
        # Always put the real implementation back.
        plt.show = saved_show

# Header positions (tweak here if you like).
# The first three are vertical positions passed as the y argument when drawing
# the figure-level header; `_TOP_LIMIT` is passed to `fig.subplots_adjust(top=...)`.
_TITLE_Y   = 0.955  # y of the bold title text (used by `_retitle_left`)
_SUB_Y     = 0.915  # y of the subtitle text (used by `_retitle_left`)
_RULE_Y    = 0.900  # default y of the coloured rule (used by `_add_header_decoration`)
_TOP_LIMIT = 0.84   # top of the Axes area (leave space for header above)

def _retitle_left(fig, title_text, subtitle_text=None, left=0.01):
    """
    Replace a figure's existing titles with a left-aligned header.

    Every Axes title is blanked (best effort: failures are ignored) and an
    existing suptitle is hidden. The title is then drawn at
    (`left`, `_TITLE_Y`) and, when `subtitle_text` is truthy, the subtitle at
    (`left`, `_SUB_Y`).
    """
    # Blank the per-Axes titles; an Axes that refuses is simply skipped.
    for axes in fig.get_axes():
        try:
            axes.set_title("")
        except Exception:
            pass

    # Hide (rather than remove) a suptitle if the figure has one.
    existing_suptitle = getattr(fig, "_suptitle", None)
    if existing_suptitle is not None:
        fig._suptitle.set_visible(False)

    shared_style = {"ha": "left", "alpha": .8}
    fig.text(left, _TITLE_Y, title_text, fontsize=13, weight="bold", **shared_style)
    if not subtitle_text:
        return
    fig.text(left, _SUB_Y, subtitle_text, fontsize=11, **shared_style)

def _add_header_decoration(fig, color=_HIGHLIGHT, y=_RULE_Y):
    """
    Draw the header motif: a horizontal rule at height `y` plus a small filled
    box hanging below its left end.

    Both artists are placed on a new frameless, axis-less Axes that spans the
    whole figure and has zorder 10.
    """
    overlay = fig.add_axes([0, 0, 1, 1], frameon=False, zorder=10)
    overlay.set_axis_off()

    # Horizontal rule from x=0.01 to the right edge, in overlay Axes coordinates.
    rule = Line2D(
        [0.01, 1.0],
        [y, y],
        transform=overlay.transAxes,
        color=color,
        linewidth=1.2,
    )
    overlay.add_line(rule)

    # Small box anchored at the rule's left end; the negative height makes it
    # extend downwards from `y`.
    box = plt.Rectangle(
        (0.01, y),
        0.03,
        -0.015,
        transform=overlay.transAxes,
        facecolor=color,
        linewidth=0,
    )
    overlay.add_patch(box)

def _reserve_header_space(fig, top=_TOP_LIMIT):
    """
    Lower the top edge of the subplot area to `top` so the plot does not
    overlap the figure-level header.

    If the figure reports that constrained layout is active, it is switched
    off first (best effort: any error while doing so is ignored), and then
    `fig.subplots_adjust(top=top)` is applied.
    """
    try:
        uses_constrained = (
            hasattr(fig, "get_constrained_layout")
            and fig.get_constrained_layout()
        )
        if uses_constrained:
            # Constrained layout has to be disabled before adjusting manually.
            fig.set_constrained_layout(False)
    except Exception:
        pass

    fig.subplots_adjust(top=top)


def p_test(
    data: pd.DataFrame,
    outcome: str,
    behavior: list,
    paired = False
    ):
    """
    Name
    -----
    p_test

    Description
    -----------
    Tests, for each behavior variable, whether its distribution differs between
    the rows where the outcome is 1 and the rows where the outcome is 0.

    * `paired=False` (default): Wilcoxon rank-sum (Mann-Whitney U) test on the
      two independent groups, using all available observations in each group.
    * `paired=True`: Wilcoxon signed-rank test. Both groups are truncated to the
      length of the shorter one because the test requires paired samples.

    Both tests are two-sided. Rows whose outcome is not 0 or 1 are ignored, and
    missing values in a behavior variable are dropped per group.

    Parameters
    ----------
    data : pd.DataFrame
        A Pandas DataFrame.
    outcome : str
        Name of the outcome variable. Values are compared numerically, so
        `1`, `1.0` and `True` all count as the positive class.
    behavior : list
        List of behavior variables to test.
    paired : bool, optional
        Boolean indicating if the test should be paired or not. Default is False.

    Returns
    -------
    pd.DataFrame
        A DataFrame with columns `Variable` and `pval`. `pval` is NaN for a
        variable when either outcome group has no observations.

    Examples
    --------
    >>> import vivainsights as vi
    >>> import pandas as pd
    >>> data = pd.DataFrame({
    ...     'outcome': [1, 0, 1, 0, 1],
    ...     'behavior1': [10, 20, 30, 40, 50],
    ...     'behavior2': [5, 15, 25, 35, 45]
    ... })
    >>> outcome = 'outcome'
    >>> behavior = ['behavior1', 'behavior2']
    >>> vi.p_test(data, outcome, behavior)
    """
    # Imported locally so this block does not depend on the module import list.
    from scipy.stats import mannwhitneyu

    # Keep only rows with a binary outcome.
    train = data[data[outcome].isin([0, 1])]

    # Compare numerically instead of via string conversion: a float outcome
    # would stringify to '1.0' (and a bool to 'True') and never match '1'.
    is_pos = train[outcome] == 1
    is_neg = train[outcome] == 0

    p_value_dict = {}

    for var in behavior:
        # Separate data into positive and negative outcomes
        pos = train.loc[is_pos, var].dropna().to_numpy()
        neg = train.loc[is_neg, var].dropna().to_numpy()

        if paired:
            # The signed-rank test needs samples of equal length.
            min_len = min(len(pos), len(neg))
            pos = pos[:min_len]
            neg = neg[:min_len]

        if len(pos) == 0 or len(neg) == 0:
            # Nothing to compare: report an undefined p-value instead of failing.
            p_value = np.nan
        elif paired:
            _, p_value = wilcoxon(pos, neg)
        else:
            # Rank-sum test for independent samples; uses every observation.
            _, p_value = mannwhitneyu(pos, neg, alternative='two-sided')

        p_value_dict[var] = p_value

    data_frame = pd.DataFrame(list(p_value_dict.items()), columns=['Variable', 'pval'])

    return data_frame
def calculate_IV(
    data: pd.DataFrame,
    outcome: str,
    predictor: str,
    bins: int
    ):
    """
    Name
    ----
    calculate_IV

    Description
    -----------
    Calculates Information Value (IV) between a single predictor variable and the
    outcome variable.

    The predictor is cut at its `1/bins, 2/bins, ...` quantiles (duplicate cut
    points are merged, so fewer than `bins` intervals may result). For every
    interval the Weight of Evidence is
    `ln((events_i / events_total) / (non_events_i / non_events_total))`, where an
    event is an outcome of 1 and a non-event an outcome of 0. Intervals that
    contain no events or no non-events get a WOE of 0. The `IV` column is the
    running (cumulative) sum of `WOE * (event share - non-event share)`, so the
    last row holds the predictor's total IV.

    Parameters
    ----------
    data : pd.DataFrame
        A DataFrame containing the data.
    outcome : str
        Name of the outcome variable. Must contain both 0 and 1.
    predictor : str
        Name of the predictor variable.
    bins : int
        Number of bins for binning the predictor variable.

    Returns
    -------
    pd.DataFrame
        A DataFrame with one row per interval and the columns `<predictor>`
        (interval label `"[min,max]"`, rounded to 1 decimal), `n`, `percentage`,
        `WOE` and `IV` (cumulative).

    Raises
    ------
    ValueError
        If the outcome variable has missing values in the input training data frame,
        or if it does not contain both classes 0 and 1.

    Examples
    --------
    >>> import vivainsights as vi
    >>> import pandas as pd
    >>> data = pd.DataFrame({
    ...     'outcome': [1, 0, 1, 0, 1],
    ...     'predictor': [10, 20, 30, 40, 50]
    ... })
    >>> outcome = 'outcome'
    >>> predictor = 'predictor'
    >>> bins = 5
    >>> vi.calculate_IV(data, outcome, predictor, bins)
    """
    pred_var = data[predictor]
    outc_var = data[outcome]

    # Check inputs
    if outc_var.isna().sum() > 0:
        raise ValueError(f"dependent variable {outcome} has missing values in the input training data frame")

    # Compute quantiles and de-duplicate them into cut points
    q = mstats.mquantiles(pred_var, prob=np.arange(1, bins) / bins, alphap=0, betap=0)
    cuts = np.unique(q)

    # Assign each observation to an interval index
    intervals = np.digitize(pred_var, bins=cuts, right=False)

    # Event / non-event counts per interval
    cut_table = pd.crosstab(intervals, outc_var).reset_index()
    if 0 not in cut_table.columns or 1 not in cut_table.columns:
        # Without both classes the shares below are undefined; fail with a
        # clear message instead of an opaque column lookup error.
        raise ValueError(
            f"dependent variable {outcome} must contain both classes 0 and 1"
        )

    # Per-interval min/max (for the label), size and share of observations
    binned = pd.DataFrame({
        'var': pred_var,
        'intervals': intervals
    }).groupby('intervals').agg(
        min=('var', 'min'),
        max=('var', 'max'),
        n=('var', 'size')
    ).reset_index().round({'min': 1, 'max': 1})

    binned[predictor] = binned.apply(lambda row: f"[{row['min']},{row['max']}]", axis=1)
    binned['percentage'] = binned['n'] / binned['n'].sum()
    # Explicit copy so the column assignments below never write into a view.
    result = binned[[predictor, 'intervals', 'n', 'percentage']].copy()

    events = cut_table[1].to_numpy(dtype=float)
    non_events = cut_table[0].to_numpy(dtype=float)
    total_events = events.sum()
    total_non_events = non_events.sum()

    # Compute WOE (Weight of Evidence) only where both counts are positive, so
    # the logarithm is never evaluated on 0 or on a division by 0. Other
    # intervals keep a WOE of 0.
    both_present = (events > 0) & (non_events > 0)
    woe = np.zeros(len(events))
    woe[both_present] = np.log(
        (events[both_present] * total_non_events)
        / (non_events[both_present] * total_events)
    )
    result['WOE'] = woe

    # Compute IV_weight: event share minus non-event share per interval
    result['IV_weight'] = events / total_events - non_events / total_non_events

    # Cumulative IV; the final row is the predictor's total IV
    result['IV'] = (result['WOE'] * result['IV_weight']).cumsum()

    return result[[predictor, 'n', 'percentage', 'WOE', 'IV']]
def map_IV(
    data: pd.DataFrame,
    outcome: str,
    predictors = None,
    bins: int = 5
    ):
    """
    Name
    ----
    map_IV

    Description
    -----------
    Runs `calculate_IV()` for every predictor-outcome pair and collects the
    per-predictor tables together with a summary of each predictor's IV.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing the data.
    outcome : str
        Name of the outcome variable.
    predictors : list, optional
        Predictor variables to analyse. If None, every numeric column of `data`
        other than `outcome` is used.
    bins : int, optional
        Number of bins for binning the predictor variables. Defaults to 5.

    Returns
    -------
    dict
        `{'Tables': ..., 'Summary': ...}` where `Tables` maps each predictor to
        the DataFrame returned by `calculate_IV()`, and `Summary` is a DataFrame
        with columns `Variable` and `IV` (the `IV` value in the last row of that
        predictor's table), sorted by `IV` in descending order.
    """
    if predictors is None:
        numeric_columns = data.select_dtypes(include='number').columns
        predictors = numeric_columns.difference([outcome])

    # One IV table per predictor
    Tables = {}
    for name in predictors:
        Tables[name] = calculate_IV(data, outcome, name, bins)

    def _final_iv(var):
        # The IV column is cumulative, so the last row carries the total.
        return Tables[var].iloc[-1]['IV']

    # Compile summary table, highest IV first
    Summary = pd.DataFrame({'Variable': list(Tables.keys())})
    Summary = Summary.assign(IV=Summary['Variable'].map(_final_iv))
    Summary = Summary.sort_values(by='IV', ascending=False)

    return {'Tables': Tables, 'Summary': Summary}
def plot_WOE(IV, predictor, figsize: tuple = None):
    """
    Name
    ----
    plot_WOE

    Description
    -----------
    Plots Weight of Evidence (WOE) for a predictor variable as a bar chart, one
    bar per bin, and shows the figure.

    The y-axis limits and ticks are derived from the WOE range across *all*
    predictors in `IV['Tables']`, so plots of different predictors from the same
    analysis share the same scale.

    Parameters
    ----------
    IV : dict
        Dictionary containing IV calculations for each predictor variable, as
        returned by `map_IV()` (the `'Tables'` entry is used).
    predictor : str
        Name of the predictor variable.
    figsize : tuple, optional
        The `figsize` parameter is an optional tuple that specifies the size of the figure for the WOE plot visualization.
        It should be in the format `(width, height)`, where `width` and `height` are in inches.
        If not provided, a default size of (8, 6) will be used.

    Returns
    -------
    None
        This function doesn't return a value; it plots the WOE.

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({
    ...     'outcome': [1, 0, 1, 0, 1],
    ...     'predictor': [10, 20, 30, 40, 50]
    ... })
    >>> outcome = 'outcome'
    >>> predictor = 'predictor'
    >>> bins = 5
    >>> IV = map_IV(data, outcome, [predictor], bins)
    >>> plot_WOE(IV, predictor)
    """
    # Identify right table
    plot_table = IV['Tables'][predictor]

    # Global WOE range over every predictor's table. Taking the min/max of the
    # concatenated values keeps the scale identical between plots.
    all_woe = np.concatenate(
        [np.asarray(table['WOE'], dtype=float) for table in IV['Tables'].values()]
    )
    WOE_range = all_woe.min(), all_woe.max()

    # Integer ticks spanning the same global range
    mn = math.floor(WOE_range[0])
    mx = math.ceil(WOE_range[1])
    tick_lst = list(range(mn, mx + 1))

    # Plot
    fig, ax = plt.subplots(figsize=figsize if figsize else (8, 6))
    sns.barplot(x=predictor, y='WOE', data=plot_table, color='#8BC7E0', ax=ax)

    # Value labels: red below the bar for negative WOE, green above otherwise
    for index, value in enumerate(plot_table['WOE']):
        ax.text(
            index,
            value,
            round(value, 1),
            ha='right',
            va='top' if value < 0 else 'bottom',
            color='red' if value < 0 else 'green'
        )

    # Use figure-level title to match our header motif, clear Axes title
    ax.set_title("")
    fig.text(0.12, 0.91, predictor, ha='left', fontsize=13, weight='bold', alpha=.8)
    fig.text(0.12, 0.86, "Weight of Evidence by bin", ha='left', fontsize=11, alpha=.8)

    ax.set_xlabel(predictor)
    ax.set_ylabel("Weight of Evidence (WOE)")
    ax.set_ylim(WOE_range[0] * 1.1, WOE_range[1] * 1.1)
    ax.set_yticks(tick_lst)
    ax.grid(axis='y', alpha=0.15)

    # Orange header motif + layout
    _add_header_decoration(fig)
    # NOTE(review): left=0.01 puts the Axes at the very left edge of the figure,
    # which may clip the y tick labels / y label -- confirm visually.
    fig.subplots_adjust(top=0.80, right=0.95, bottom=0.12, left=0.01)

    plt.show()
    # preserve original behavior (returns None)
def create_IV(
    data = pd.DataFrame,
    predictors = None,
    outcome:str = None,
    bins: int = 5,
    siglevel = 0.05,
    exc_sig: bool = False,
    figsize: tuple = None,
    return_type ="plot"
    ):
    """
    Name
    ----
    create_IV

    Description
    -----------
    Creates Information Value (IV) analysis for predictor variables.

    Rows with missing values in the selected columns are dropped, each predictor
    is tested against the outcome with `p_test()`, optionally filtered by
    significance, and then passed to `map_IV()`.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing the data. Must be supplied (the default value is a
        placeholder only).
    predictors : list, optional
        List of predictor variables. A single column name given as a string is
        also accepted. If None, all numeric columns of `data` other than
        `outcome` are used.
    outcome : str
        Name of the outcome variable.
    bins : int, optional
        Number of bins for binning the predictor variables. Defaults to 5.
    siglevel : float, optional
        Significance level. Defaults to 0.05.
    exc_sig : bool, optional
        Boolean indicating if non-significant predictors should be excluded.
        If True, only predictors with p-value <= siglevel are included in the analysis.
        If False, all predictors are included regardless of significance.
        Defaults to False.
    figsize : tuple, optional
        Figure size `(width, height)` in inches, applied to the plots produced
        for return types "plot" and "plot-WOE".
    return_type : str, optional
        Type of output to return ("plot", "summary", "list", "plot-WOE", "IV").
        Defaults to "plot".

    Returns
    -------
    Various
        * "plot": shows a bar chart of the IV of the top predictors (at most 12)
          and returns None.
        * "summary": DataFrame with columns `Variable`, `IV` and `pval`.
        * "list": dictionary mapping each predictor to its IV table, extended
          with `ODDS` and `PROB` columns.
        * "plot-WOE": shows one WOE plot per predictor and returns a list of
          None values.
        * "IV": tuple `(output_list, IV_summary, lnodds)` where `output_list` is
          the dictionary described under "list".

    Raises
    ------
    ValueError
        If `exc_sig` is not a bool, if `return_type` is not one of the accepted
        values, or if `exc_sig` is True and no predictor has a p-value at or
        below `siglevel`.

    Note
    ----
    * create_IV function return_type 'list' has output format as a dictionary, please use for loop to access the key and values.
    * create_IV function return_type 'IV' has output format as a tuple, tuple element 'output_list' format is dictionary hence please use for loop to access the key and values.

    Example
    -------
    >>> import numpy as np
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> pred_vars = ["Email_hours", "Meeting_hours", "Chat_hours"]
    >>> pq_data["outcome_sim"] = np.where(pq_data["Internal_network_size"] > 40, 1, 0)

    >>> # Example 1: Return IV tables for all predictors without excluding non-significant ones
    >>> vi.create_IV(pq_data, predictors=pred_vars, outcome="outcome_sim", exc_sig=False, return_type="IV")

    >>> # Example 2: Exclude non-significant predictors and return summary
    >>> vi.create_IV(pq_data, predictors=pred_vars, outcome="outcome_sim", exc_sig=True, return_type="summary")

    >>> # Example 3: Return IV for all predictors (single plot)
    >>> vi.create_IV(pq_data, predictors=pred_vars, outcome="outcome_sim", exc_sig=False, return_type="plot")

    >>> # Example 4: Return WOE plots for all predictors
    >>> vi.create_IV(pq_data, predictors=pred_vars, outcome="outcome_sim", exc_sig=False, return_type="plot-WOE")
    """
    # Validate the cheap arguments first so bad input fails before any work.
    if not isinstance(exc_sig, bool):
        raise ValueError("Invalid input to `exc_sig`")
    if return_type not in ("plot", "summary", "list", "plot-WOE", "IV"):
        raise ValueError("Please enter a valid input for `return_type`.")

    # Select training dataset
    if predictors is None:
        train = data.select_dtypes(include=np.number).dropna()
    else:
        if isinstance(predictors, str):
            # A bare column name would otherwise be split into characters.
            predictors = [predictors]
        train = data[list(predictors) + [outcome]].dropna()

    # Calculate odds of the outcome in the training data
    n_events = train[outcome].sum()
    odds = n_events / (len(train[outcome]) - n_events)
    lnodds = np.log(odds)

    # Every training column except the outcome is a candidate predictor
    candidate_vars = [col for col in train.columns if col != outcome]

    # Perform statistical test
    predictors_pval = p_test(data=train, outcome=outcome, behavior=candidate_vars)

    # Filter significant predictors only if exc_sig is True
    if exc_sig:
        predictors_pval = predictors_pval[predictors_pval["pval"] <= siglevel]
        if predictors_pval.shape[0] == 0:
            raise ValueError("No predictors where the p-value lies below the significance level.")

    predictors_to_use = predictors_pval["Variable"].tolist()
    train = train[predictors_to_use + [outcome]]

    # IV Analysis
    IV = map_IV(train, outcome, bins=bins, predictors=predictors_to_use)
    IV_names = list(IV["Tables"].keys())

    # Merge with p-values for final output (order of IV["Summary"] is kept,
    # i.e. highest IV first)
    IV_summary = pd.merge(IV["Summary"], predictors_pval, on="Variable")
    IV_summary["pval"] = IV_summary["pval"].round(10)

    if return_type == "summary":
        return IV_summary

    if return_type in ("IV", "list"):
        # Convert WOE into odds and probabilities using the overall log-odds
        output_list = {
            variable: IV["Tables"][variable].assign(
                ODDS=lambda df: np.exp(df["WOE"] + lnodds),
                PROB=lambda df: df["ODDS"] / (df["ODDS"] + 1)
            )
            for variable in IV_names
        }
        if return_type == "list":
            return output_list
        return output_list, IV_summary, lnodds

    if return_type == "plot-WOE":
        # Preserve original behavior: returns list of Nones (each plot_WOE shows a figure)
        return [plot_WOE(IV, variable, figsize=figsize) for variable in IV["Summary"]["Variable"]]

    # return_type == "plot"
    top_n = min(12, IV_summary.shape[0])
    subtitle_txt = f"Showing top {top_n} predictors"

    # Track existing figures so we can detect the new one
    before = set(plt.get_fignums())

    # Suppress any internal plt.show() inside create_bar_asis; plot only the
    # rows the subtitle claims to show.
    with _suppress_matplotlib_show():
        bar_obj = create_bar_asis(
            IV_summary.head(top_n),
            group_var="Variable",
            bar_var="IV",
            title="Information Value (IV)",
            subtitle=subtitle_txt,
            caption=None,
            ylab=None,
            xlab=None,
            percent=False,
            bar_colour="default",
            rounding=1
        )

    # Resolve the actual figure to decorate
    if hasattr(bar_obj, "figure"):
        # Axes-like (or any artist exposing its figure)
        fig = bar_obj.figure
    elif isinstance(bar_obj, Figure):
        fig = bar_obj
    else:
        # Fallback: pick the newest figure created by the call above
        new_ids = sorted(set(plt.get_fignums()) - before)
        fig = plt.figure(new_ids[-1]) if new_ids else None
    if fig is None:
        # last resort
        fig = plt.gcf()

    # Apply dynamic size + orange header motif
    if figsize:
        fig.set_size_inches(*figsize, forward=True)
    _retitle_left(fig, "Information Value (IV)", subtitle_txt, left=0.01)
    _add_header_decoration(fig)   # draws at _RULE_Y just below subtitle
    _reserve_header_space(fig)    # moves Axes down so nothing overlaps

    plt.show()
    return