# Source code for vivainsights.create_inc

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This module creates an incidence analysis reflecting the proportion of the population scoring above or below a specified threshold for a metric. 
"""
import typing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.color_codes import COLOR_PALLET_ALT_2
from vivainsights.create_bar import create_bar
from vivainsights.extract_date_range import extract_date_range


# [docs]
def create_inc(data: pd.DataFrame, metric: str, hrvar: typing.Union[str, typing.List[str]], mingroup: int = 5, threshold: float = None, position: str = None, return_type: str = 'plot'):
    """
    Name
    ----
    create_inc

    Description
    -----------
    Create an incidence analysis reflecting the proportion of the population scoring above or below a threshold
    for a metric. Each value in the output reflects the proportion of the population that is above or below
    `threshold` for the specified metric.

    This function only normalises `hrvar` and dispatches:
    - one HR variable: the call is forwarded to `create_inc_bar`
    - two HR variables: the call is forwarded to `create_inc_grid`

    Parameters
    ----------
    data : pandas dataframe
        A Standard Person Query dataset in the form of a Pandas DataFrame.
    metric : str
        Name of the metric, e.g. "Collaboration_hours".
    hrvar : str or list
        Name(s) of the HR Variable(s) by which to split metrics. A single string, or a list (or tuple) of one
        or two column names.
    mingroup : int
        Privacy threshold / minimum group size. Defaults to 5.
    threshold : float
        Threshold value to split the data based on the position argument. Defaults to None.
    position : str
        One of the below valid values:
        - "above": show incidence of those equal to or above the threshold
        - "below": show incidence of those equal to or below the threshold
    return_type : str
        What to return. This must be one of the following strings:
        - "plot"
        - "table"

    Returns
    -------
    Whatever the selected helper returns for the given `return_type`:
    - "plot": Matplotlib or Seaborn plot object
    - "table": Pandas DataFrame

    Raises
    ------
    ValueError: If `hrvar` holds no variable or more than two variables.

    Example
    -------
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc(
        pq_data,
        metric = 'Collaboration_hours',
        hrvar = 'LevelDesignation',
        mingroup = 5,
        threshold = 10,
        position = 'above',
        return_type = 'plot'
        )
    """

    # Lists and tuples are taken as a collection of column names; anything else
    # (typically a single string) is treated as one HR variable.
    if isinstance(hrvar, (list, tuple)):
        hrvar = list(hrvar)
    else:
        hrvar = [hrvar]

    # Reject both the empty case and more than two variables here, so the caller
    # gets one consistent message regardless of which helper would have run.
    if not 1 <= len(hrvar) <= 2:
        raise ValueError("`hrvar` must be a string or a list of one or two column names.")

    if len(hrvar) == 1:
        return create_inc_bar(data, metric, hrvar[0], mingroup, threshold, position, return_type)

    return create_inc_grid(data, metric, hrvar, mingroup, threshold, position, return_type)

    

# [docs]
def create_inc_bar(data: pd.DataFrame, metric: str, hrvar: str, mingroup: int = 5, threshold: float = None, position: str = None, return_type: str='plot'):
    """
    Name
    -----
    create_inc_bar

    Description
    -----------
    Incidence analysis for a single `hrvar`. The metric column is replaced by a boolean flag
    (whether each row meets the threshold) and the result is handed to `create_bar` with
    `percent = True`.

    Parameters
    ----------
    data : pandas dataframe
        A Standard Person Query dataset in the form of a Pandas DataFrame. It is not modified;
        the transformation is applied to a copy.
    metric : str
        Name of the metric, e.g. "Collaboration_hours".
    hrvar : str
        Name of the HR Variable by which to split metrics.
    mingroup : int
        Privacy threshold / minimum group size. Defaults to 5.
    threshold : float
        Threshold value to split the data based on the position argument. Must be supplied;
        the default of None is rejected.
    position : str
        One of the below valid values:
        - "above": show incidence of those equal to or above the threshold
        - "below": show incidence of those equal to or below the threshold
    return_type : str
        What to return. This must be one of the following strings:
        - "plot"
        - "table"
        - "data": return the transformed copy of `data` without calling `create_bar`

    Returns
    -------
    Output is returned depending on the value passed to the return_type argument:
    - "data": Pandas DataFrame, a copy of `data` with `metric` replaced by the boolean flag
    - any other value: the result of `create_bar` called with that `return_type`

    Raises
    ------
    ValueError: If `position` is not "above" or "below", or if `threshold` is None.

    Example
    -------
    >>> create_inc_bar(data = pq_data, metric = "Collaboration_hours", hrvar = "LevelDesignation", threshold = 20, position = "below", return_type = "plot")
    """

    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")

    # Comparing a pandas Series against None does not raise: it yields all-False,
    # which would be reported as a 0% incidence for every group. Fail loudly instead.
    if threshold is None:
        raise ValueError("Please provide a numeric `threshold`.")

    # Transform data so that the metric becomes a boolean "meets threshold" flag
    data_t = data.copy()
    if position == "above":
        data_t[metric] = data_t[metric] >= threshold
    else:
        data_t[metric] = data_t[metric] <= threshold

    if return_type == 'data':
        return data_t

    title_text = f"Incidence of {metric} {position} {threshold}" # Set title text
    subtitle_text = f"Percentage and number of employees by {hrvar}" # Set subtitle text

    return create_bar(
        data_t,
        metric,
        hrvar,
        mingroup,
        percent = True,
        plot_title = title_text,
        plot_subtitle = subtitle_text,
        return_type = return_type
        )



# [docs]
def create_inc_grid(data: pd.DataFrame, metric: str, hrvar: typing.List, mingroup: int=5, threshold: float=None, position: str=None, return_type: str='plot'):
    """
    Name
    -----
    create_inc_grid

    Description
    -----------
    Incidence analysis for two `hrvar` values. Each row of `data` is flagged 1/0 depending on
    whether `metric` meets the threshold; flags are averaged per person within each
    `hrvar` combination, then averaged across persons. Combinations with fewer than
    `mingroup` distinct `PersonId` values are dropped. The result is returned as a table
    or drawn as a heatmap.

    Parameters
    ----------
    data : pandas dataframe
        A Standard Person Query dataset in the form of a Pandas DataFrame. Must contain a
        `PersonId` column as well as the `metric` and `hrvar` columns.
    metric : str
         Name of the metric, e.g. "Collaboration_hours".
    hrvar : list
         Names of the two HR Variables by which to split metrics.
    mingroup : int
         Privacy threshold / minimum group size. Defaults to 5.
    threshold : float
         Threshold value to split the data based on the position argument. Must be supplied;
         the default of None is rejected.
    position : str
        One of the below valid values:
        - "above": show incidence of those equal to or above the threshold
        - "below": show incidence of those equal to or below the threshold
    return_type : str
        What to return. This must be one of the following strings:
        - "plot"
        - "table"

    Returns
    -------
    Output is returned depending on the value passed to the return_type argument:
    - "plot": Matplotlib figure containing the heatmap
    - "table": Pandas DataFrame with the two `hrvar` columns, `incidence` and `count`,
      sorted by `incidence` in descending order

    Raises
    ------
    ValueError: If hrvar is not a list of length 2, if `position` is not "above" or "below",
    if `threshold` is None, if `return_type` is not "table" or "plot", or if a plot is
    requested but no group meets `mingroup`.

    Example
    -------
    >>> create_inc_grid(data = pq_data, metric = "Collaboration_hours", hrvar = ["LevelDesignation", "Organization"], threshold = 20, position = "below", return_type = "table")
    """

    # Validate everything up front so a bad argument never produces a silently wrong table.
    if not isinstance(hrvar, list) or len(hrvar) != 2:
        raise ValueError("`hrvar` must be a list of length 2.")
    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")
    # Comparing a pandas Series against None yields all-False rather than an error,
    # which would show up as a 0% incidence everywhere.
    if threshold is None:
        raise ValueError("Please provide a numeric `threshold`.")
    if return_type not in ("table", "plot"):
        raise ValueError("Please enter a valid input for `return_type`: Either `table` or `plot`.")

    if position == "above":
        meets_threshold = data[metric] >= threshold
    else:
        meets_threshold = data[metric] <= threshold

    inc_table: pd.DataFrame = (
        data
        .assign(metric_inc=np.where(meets_threshold, 1, 0))
        # First average within each person so every person carries equal weight ...
        .groupby(hrvar + ['PersonId'], as_index=False)
        .agg({'metric_inc': 'mean'})
        # ... then average across persons and count distinct persons per group.
        .groupby(hrvar, as_index=False)
        .agg({'metric_inc': 'mean', 'PersonId': 'nunique'})
        .rename(columns={'metric_inc': 'incidence', 'PersonId': 'count'})
        .query('count >= @mingroup')
        .sort_values('incidence', ascending=False)
        )

    if return_type == "table":
        return inc_table

    return _plot_inc_grid(inc_table, data, metric, hrvar, threshold, position)


def _plot_inc_grid(inc_table: pd.DataFrame, data: pd.DataFrame, metric: str, hrvar: typing.List, threshold: float, position: str):
    """Draw the two-variable incidence table as an annotated heatmap and return the figure."""

    if inc_table.empty:
        raise ValueError("No groups meet the `mingroup` threshold; there is nothing to plot.")

    # Set title text
    title_text = f"Incidence of {metric.replace('_', ' ').capitalize()} {position} {threshold}"

    # Set subtitle text (uses the caller's variable order, before any axis swap below)
    subtitle_text = f"Percentage and number of employees by {hrvar[0]} and {hrvar[1]}"
    cap_str = extract_date_range(data, return_type = 'text')

    # Cell labels such as "75.0% (12)". Built column-wise so the count keeps its
    # integer formatting regardless of the dtypes of the HR columns.
    inc_table = inc_table.assign(
        metric_text=[
            f"{incidence*100:.1f}% ({count})"
            for incidence, count in zip(inc_table['incidence'], inc_table['count'])
        ]
    )

    # Put the variable with more distinct values on the columns to fit a landscape plot
    row_var, col_var = hrvar
    if inc_table[row_var].nunique() > inc_table[col_var].nunique():
        row_var, col_var = col_var, row_var

    # Annotation to pass to heatmap
    annot_df = inc_table.pivot(index=row_var, columns=col_var, values='metric_text')

    # Setup plot size.
    fig, ax = plt.subplots(figsize=(7, 4))

    ax.grid(False)

    # Remove tick marks
    ax.tick_params(
        which='both',      # Both major and minor ticks are affected
        top=False,         # Remove ticks from the top
        bottom=False,      # Remove ticks from the bottom
        left=False,        # Remove ticks from the left
        right=False        # Remove ticks from the right
    )

    # NOTE: set_theme changes the seaborn/matplotlib defaults globally, not only for this figure.
    sns.set_theme(font_scale=0.7)
    # plot heatmap
    sns.heatmap(
        inc_table.pivot(index=row_var, columns=col_var, values='incidence'),
        annot = annot_df,
        fmt='',
        cmap=COLOR_PALLET_ALT_2,
        center=0.5,
        square=True,
        ax=ax
        )

    # Add in line and tag
    ax.plot(
        [0, .9], # Set width of line
        [0.9, 0.9], # Set height of line
        transform = fig.transFigure, # Set location relative to plot
        clip_on = False,
        color = '#fe7f4f',
        linewidth = .6
    )

    ax.add_patch(
        plt.Rectangle(
            (0, 0.9), # Set location of rectangle by lower left corner
            0.05, # Width of rectangle
            -0.025, # Height of rectangle
            facecolor = '#fe7f4f',
            transform = fig.transFigure,
            clip_on = False,
            linewidth = 0
            )
    )

    # Set title
    ax.text(
        x = 0, y = 1.00,
        s = title_text,
        transform = fig.transFigure,
        ha = 'left',
        fontsize = 13,
        weight = 'bold',
        alpha = .8
    )

    # Set subtitle
    ax.text(
        x = 0, y = 0.95,
        s = subtitle_text,
        transform = fig.transFigure,
        ha = 'left',
        fontsize = 11,
        alpha = .8
    )

    # Set caption
    ax.text(x=0, y=0.02, s=cap_str, transform=fig.transFigure, ha='left', fontsize=9, alpha=.7)

    # return the plot object
    return fig