# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
import math
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import mstats, ranksums, wilcoxon

from vivainsights.create_bar_asis import *
# Suppress all warnings module-wide for cleaner output.
# NOTE(review): this silences warnings for every consumer that imports this
# module — consider scoping suppression with warnings.catch_warnings() instead.
warnings.filterwarnings("ignore")
# [docs] — Sphinx viewcode artifact (commented out; a bare "[docs]" raises NameError at import)
def p_test(
    data: pd.DataFrame,
    outcome: str,
    behavior: list,
    paired = False
):
    """
    Name
    -----
    p_test

    Description
    -----------
    Performs a Wilcoxon signed-rank test (paired) or a Wilcoxon rank-sum
    test (unpaired) between the two outcome groups, for each behavior
    variable.

    Parameters
    ----------
    data : pd.DataFrame
        A Pandas DataFrame.
    outcome : str
        Name of the outcome variable. Only rows whose outcome is 0 or 1
        are used.
    behavior : list
        List of behavior variables to test.
    paired : bool, optional
        Boolean indicating if the test should be paired or not. Default is False.

    Returns
    -------
    pd.DataFrame
        A DataFrame with columns 'Variable' and 'pval'.

    Examples
    --------
    >>> import vivainsights as vi
    >>> import pandas as pd
    >>> data = pd.DataFrame({
    ...     'outcome': [1, 0, 1, 0, 1],
    ...     'behavior1': [10, 20, 30, 40, 50],
    ...     'behavior2': [5, 15, 25, 35, 45]
    ... })
    >>> vi.p_test(data, 'outcome', ['behavior1', 'behavior2'])
    """
    # Keep only rows with a binary (0/1) outcome
    train = data[data[outcome].isin([0, 1])].copy()
    # Treat the outcome as a categorical label ('0' / '1')
    train[outcome] = train[outcome].astype(str).astype('category')
    p_value_dict = {}
    for var in behavior:
        # Split the behavior variable by outcome group
        pos = train[train[outcome] == '1'][var].dropna()
        neg = train[train[outcome] == '0'][var].dropna()
        if paired:
            # The signed-rank test needs equal-length paired samples, so
            # truncate both groups to the shorter length (positional pairing).
            min_len = min(len(pos), len(neg))
            _, p_value = wilcoxon(pos[:min_len], neg[:min_len])
        else:
            # Bug fix: independent samples now use the rank-sum test on the
            # full groups; previously this branch also ran the signed-rank
            # test on arbitrarily truncated samples, discarding data.
            _, p_value = ranksums(pos, neg)
        p_value_dict[var] = p_value
    return pd.DataFrame(list(p_value_dict.items()), columns=['Variable', 'pval'])
# [docs] — Sphinx viewcode artifact (commented out; a bare "[docs]" raises NameError at import)
def calculate_IV(
    data: pd.DataFrame,
    outcome: str,
    predictor: str,
    bins: int
):
    """
    Name
    ----
    calculate_IV

    Description
    -----------
    Calculates Information Value (IV) between a single predictor variable and
    the outcome variable, by binning the predictor into quantile-based
    intervals and computing a Weight of Evidence (WOE) per bin.

    Parameters
    ----------
    data : pd.DataFrame
        A DataFrame containing the data.
    outcome : str
        Name of the outcome variable. Expected to hold the binary values
        0 and 1 (the crosstab below is indexed with those literals).
    predictor : str
        Name of the (numeric) predictor variable.
    bins : int
        Number of bins for binning the predictor variable.

    Returns
    -------
    pd.DataFrame
        One row per bin with columns: the interval label (named after the
        predictor), 'n', 'percentage', 'WOE' and 'IV'. The 'IV' column is
        cumulative, so the last row holds the predictor's total IV.

    Raises
    ------
    ValueError
        If the outcome variable has missing values in the input training data frame.

    Examples
    --------
    >>> import vivainsights as vi
    >>> import pandas as pd
    >>> data = pd.DataFrame({
    ...     'outcome': [1, 0, 1, 0, 1],
    ...     'predictor': [10, 20, 30, 40, 50]
    ... })
    >>> vi.calculate_IV(data, 'outcome', 'predictor', 5)
    """
    pred_var = data[predictor]
    outc_var = data[outcome]
    # The outcome must be fully populated; WOE/IV are undefined otherwise.
    if outc_var.isna().sum() > 0:
        raise ValueError(f"dependent variable {outcome} has missing values in the input training data frame")
    # Interior quantile cut points (bins - 1 of them); alphap=betap=0 selects
    # the same plotting-position convention for all bins.
    q = mstats.mquantiles(pred_var, prob=np.arange(1, bins) / bins, alphap=0, betap=0)
    # Duplicate quantiles collapse into one cut, so heavily tied data may
    # yield fewer bins than requested.
    cuts = np.unique(q)
    # Bin index (0 .. len(cuts)) for every observation.
    intervals = np.digitize(pred_var, bins=cuts, right=False)
    # Per-bin outcome counts; columns are the outcome's values (0 and 1).
    cut_table = pd.crosstab(intervals, outc_var).reset_index()
    # Per-bin min/max (used for the interval label), row count and share.
    cut_table_2 = pd.DataFrame({
        'var': pred_var,
        'intervals': intervals
    }).groupby('intervals').agg(
        min=('var', 'min'),
        max=('var', 'max'),
        n=('var', 'size')
    ).reset_index().round({'min': 1, 'max': 1})
    cut_table_2[predictor] = cut_table_2.apply(lambda row: f"[{row['min']},{row['max']}]", axis=1)
    cut_table_2['percentage'] = cut_table_2['n'] / cut_table_2['n'].sum()
    cut_table_2 = cut_table_2[[predictor, 'intervals', 'n', 'percentage']]
    # Cross-multiplied counts feeding the WOE ratio below.
    # NOTE(review): the names look swapped — n_non_event is built from the
    # outcome==1 counts — but the ratio simplifies to
    # (bin's share of events) / (bin's share of non-events); verify against
    # the reference implementation before renaming.
    cut_table_1 = cut_table[1].values.astype(float)
    cut_table_0 = cut_table[0].values.astype(float)
    n_non_event = cut_table_1 * np.sum(cut_table_0)
    n_yes_event = cut_table_0 * np.sum(cut_table_1)
    # WOE per bin; bins with a zero count on either side get WOE = 0 to
    # avoid log(0) / division by zero.
    cut_table_2['WOE'] = np.where((cut_table[1] > 0) & (cut_table[0] > 0), np.log(n_non_event / n_yes_event), 0)
    # IV weight per bin: the bin's share of events minus its share of
    # non-events (both Series align on the 0..k-1 reset index).
    p1 = cut_table[1] / cut_table[1].sum()
    p0 = cut_table[0] / cut_table[0].sum()
    cut_table_2['IV_weight'] = p1 - p0
    cut_table_2['IV'] = cut_table_2['WOE'] * cut_table_2['IV_weight']
    # Running (cumulative) IV across bins.
    cut_table_2['IV'] = cut_table_2['IV'].cumsum()
    return cut_table_2[[predictor, 'n', 'percentage', 'WOE', 'IV']]
# [docs] — Sphinx viewcode artifact (commented out; a bare "[docs]" raises NameError at import)
def map_IV(
    data: pd.DataFrame,
    outcome: str,
    predictors = None,
    bins: int = 5
):
    """
    Name
    ----
    map_IV

    Description
    -----------
    Runs `calculate_IV()` for every predictor-outcome pair and collects the
    results.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing the data.
    outcome : str
        Name of the outcome variable.
    predictors : list, optional
        Predictor variables to evaluate. When None, every numeric column
        except the outcome is used.
    bins : int, optional
        Number of bins used when binning each predictor. Defaults to 5.

    Returns
    -------
    dict
        {'Tables': per-predictor IV DataFrames keyed by predictor name,
         'Summary': DataFrame of total IV per predictor, sorted descending}.
    """
    if predictors is None:
        predictors = data.select_dtypes(include='number').columns.difference([outcome])

    # One IV table per predictor
    tables = {}
    for pred in predictors:
        tables[pred] = calculate_IV(data, outcome, pred, bins)

    # Total IV of each predictor is the last (cumulative) row of its table
    rows = [(name, tbl.iloc[-1]['IV']) for name, tbl in tables.items()]
    summary = pd.DataFrame(rows, columns=['Variable', 'IV'])
    summary = summary.sort_values(by='IV', ascending=False)

    return {'Tables': tables, 'Summary': summary}
# [docs] — Sphinx viewcode artifact (commented out; a bare "[docs]" raises NameError at import)
def plot_WOE(IV, predictor):
    """
    Name
    ----
    plot_WOE

    Description
    -----------
    Plots the per-bin Weight of Evidence (WOE) for a predictor variable,
    using the output of `map_IV()`.

    Parameters
    ----------
    IV : dict
        Dictionary with a 'Tables' key mapping each predictor name to its
        per-bin IV DataFrame (as returned by `map_IV()`).
    predictor : str
        Name of the predictor variable to plot.

    Returns
    -------
    None
        This function doesn't return a value; it displays the WOE bar plot.

    Examples
    --------
    >>> import pandas as pd
    >>> data = pd.DataFrame({
    ...     'outcome': [1, 0, 1, 0, 1],
    ...     'predictor': [10, 20, 30, 40, 50]
    ... })
    >>> IV = map_IV(data, 'outcome', ['predictor'], 5)
    >>> plot_WOE(IV, 'predictor')
    """
    # Identify right table
    plot_table = IV['Tables'][predictor]
    # Global WOE range across *all* predictors, so every WOE plot shares a
    # comparable y-scale. (Bug fix: the original loop overwrote the range on
    # every iteration, so only the last table's min/max was ever used.)
    all_WOE = np.concatenate([np.asarray(table['WOE']) for table in IV['Tables'].values()])
    WOE_range = np.min(all_WOE), np.max(all_WOE)
    # Integer tick marks spanning this predictor's own WOE values
    mn = math.floor(np.min(plot_table['WOE']))
    mx = math.ceil(np.max(plot_table['WOE']))
    tick_lst = list(range(mn, mx + 1))
    # Plot
    plt.figure(figsize=(12, 8))
    sns.barplot(x=predictor, y='WOE', data=plot_table, color='#8BC7E0')
    # Annotate each bar: red below zero, green at or above
    for index, value in enumerate(plot_table['WOE']):
        plt.text(index, value, round(value, 1), ha='right',
                 va='top' if value < 0 else 'bottom',
                 color='red' if value < 0 else 'green')
    plt.title(predictor)
    plt.xlabel(predictor)
    plt.ylabel("Weight of Evidence (WOE)")
    # 10% headroom beyond the observed global range
    plt.ylim(WOE_range[0] * 1.1, WOE_range[1] * 1.1)
    plt.yticks(tick_lst)
    plt.show()
# [docs] — Sphinx viewcode artifact (commented out; a bare "[docs]" raises NameError at import)
def create_IV(
    data: pd.DataFrame = None,
    predictors = None,
    outcome: str = None,
    bins: int = 5,
    siglevel = 0.05,
    exc_sig: bool = False,
    return_type = "plot"
):
    """
    Name
    ----
    create_IV

    Description
    -----------
    Creates an Information Value (IV) analysis for predictor variables
    against a binary outcome.

    Parameters
    ----------
    data : pd.DataFrame
        DataFrame containing the data.
    predictors : list, optional
        List of predictor variables. When None, all numeric columns are used.
    outcome : str
        Name of the (binary 0/1) outcome variable.
    bins : int, optional
        Number of bins for binning the predictor variables. Defaults to 5.
    siglevel : float, optional
        Significance level used when `exc_sig` is True. Defaults to 0.05.
    exc_sig : bool, optional
        Boolean indicating if non-significant predictors should be excluded.
        Defaults to False.
    return_type : str, optional
        Type of output to return ("plot", "summary", "list", "plot-WOE", "IV").
        Defaults to "plot".

    Returns
    -------
    Various
        "summary": DataFrame of IV and p-value per predictor.
        "IV": tuple (dict of per-predictor tables with ODDS/PROB, summary, lnodds).
        "list": dict of per-predictor tables with ODDS/PROB.
        "plot": displays a bar plot of IV per predictor (returns None).
        "plot-WOE": displays one WOE plot per predictor.

    Raises
    ------
    ValueError
        If `data` is not a DataFrame, `outcome` is missing, `exc_sig` is not
        a bool, no predictor is significant (when `exc_sig` is True), or
        `return_type` is not one of the recognised values.

    Example
    -------
    >>> import numpy as np
    >>> df["X"] = np.where(df["Internal_network_size"] > 40, 1, 0)
    >>> result = create_IV(df,
    ...                    predictors=["Email_hours", "Meeting_hours", "Chat_hours"],
    ...                    outcome="X", exc_sig=False, return_type="IV")
    """
    # Validate inputs early with informative errors
    # (bug fix: the `data` default was the pd.DataFrame *class*, not a value)
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Please provide a pandas DataFrame to `data`.")
    if outcome is None:
        raise ValueError("Please provide the name of the outcome variable to `outcome`.")
    if not isinstance(exc_sig, bool):
        raise ValueError("Invalid input to `exc_sig`")

    # Select training dataset
    if predictors is None:
        # No predictors given: use every numeric column (outcome included)
        train = data.select_dtypes(include=np.number).dropna()
    else:
        train = data[predictors + [outcome]].dropna()

    # Log-odds of the outcome over the full training set; used to convert
    # WOE into per-bin odds and probabilities for "IV" / "list" outputs.
    odds = train[outcome].sum() / (len(train[outcome]) - train[outcome].sum())
    lnodds = np.log(odds)

    # Candidate predictors: every training column except the outcome
    predictors = pd.DataFrame({'Variable': np.array(train.columns)})
    predictors = predictors[predictors['Variable'] != outcome].reset_index(drop=True)
    predictors['Variable'] = predictors['Variable'].astype(str)

    # p-value of each predictor vs the outcome
    predictors_pval = p_test(data=train, outcome=outcome, behavior=predictors["Variable"].tolist())
    if exc_sig:
        # Bug fix: the significance filter previously ran unconditionally;
        # it now honours `exc_sig` as documented.
        predictors_pval = predictors_pval[predictors_pval["pval"] <= siglevel]
        if predictors_pval.shape[0] == 0:
            raise ValueError("No predictors where the p-value lies below the significance level.")
    train = train[predictors_pval["Variable"].tolist() + [outcome]]

    # IV analysis
    IV = map_IV(train, outcome, bins=bins, predictors=predictors_pval["Variable"].tolist())
    IV_names = list(IV["Tables"].keys())
    IV_summary = pd.merge(IV["Summary"], predictors_pval, on="Variable")
    IV_summary["pval"] = IV_summary["pval"].round(10)

    # Output loop
    if return_type == "summary":
        return IV_summary
    elif return_type == "IV":
        # Augment each table with per-bin odds and probability
        output_list = {variable: IV["Tables"][variable].assign(
            ODDS=lambda df: np.exp(df["WOE"] + lnodds),
            PROB=lambda df: df["ODDS"] / (df["ODDS"] + 1)) for variable in IV_names}
        return output_list, IV_summary, lnodds
    elif return_type == "plot":
        top_n = min(12, IV_summary.shape[0])
        create_bar_asis(IV_summary,
                        group_var="Variable",
                        bar_var="IV",
                        title="Information Value (IV)",
                        # Bug fix: subtitle was a tuple, not a string
                        subtitle=f"Showing top {top_n} predictors",
                        caption=None,
                        ylab=None,
                        xlab=None,
                        percent=False,
                        bar_colour="default",
                        rounding=1)
    elif return_type == "plot-WOE":
        return [plot_WOE(IV, variable) for variable in IV["Summary"]["Variable"]]
    elif return_type == "list":
        output_list = {variable: IV["Tables"][variable].assign(
            ODDS=lambda df: np.exp(df["WOE"] + lnodds),
            PROB=lambda df: df["ODDS"] / (df["ODDS"] + 1)) for variable in IV_names}
        return output_list
    else:
        raise ValueError("Please enter a valid input for `return_type`.")