Source code for vivainsights.create_boxplot

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
The function `create_boxplot` creates a boxplot visualization and summary table for a given metric
and grouping variable in a dataset.
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.extract_date_range import extract_date_range
from vivainsights.color_codes import *
from vivainsights.totals_col import *


[docs]
def create_boxplot_calc(data: pd.DataFrame, metric, hrvar, mingroup):
    """
    Build the person-level dataset that underlies the boxplot.

    The `hrvar` column is renamed to "group", `metric` is averaged for each
    (PersonId, group) pair, and every row is tagged with the number of
    distinct PersonId values found in its group ("Employee_Count"). Groups
    with fewer than `mingroup` distinct PersonId values are dropped.

    Parameters
    ----------
    data : pandas dataframe
        Input data; the "PersonId", `metric` and `hrvar` columns are used.
    metric : str
        Name of the column to average.
    hrvar : str
        Name of the grouping column.
    mingroup : int
        Minimum number of distinct PersonId values a group needs to be kept.

    Returns
    -------
    pandas dataframe
        One row per (PersonId, group) pair with the columns "PersonId",
        "group", `metric` and "Employee_Count".
    """
    # Rename once and reuse the result for both aggregations.
    grouped = data.rename(columns={hrvar: "group"})

    # Average of the metric for each person within each group.
    person_means = grouped.groupby(["PersonId", "group"], as_index=False)[metric].mean()

    # Number of distinct persons in each group.
    group_sizes = (
        grouped.groupby("group", as_index=False)["PersonId"]
        .nunique()
        .rename(columns={"PersonId": "Employee_Count"})
    )

    plot_data = person_means.merge(group_sizes, on="group")

    # Keep only the groups that meet the minimum group size.
    return plot_data[plot_data["Employee_Count"] >= mingroup]




[docs]
def create_boxplot_summary(data: pd.DataFrame, metric, hrvar, mingroup):
    """
    Summarise `metric` for each group of `hrvar`.

    The person-level data produced by `create_boxplot_calc` is grouped by
    "group" and the mean, median, standard deviation, minimum, maximum and
    count of `metric` are computed. The "std" and "count" columns are
    relabelled "sd" and "n".

    Parameters
    ----------
    data : pandas dataframe
        Input data passed through to `create_boxplot_calc`.
    metric : str
        Name of the column to summarise.
    hrvar : str
        Name of the grouping column.
    mingroup : int
        Minimum group size passed through to `create_boxplot_calc`.

    Returns
    -------
    pandas dataframe
        The per-group summary statistics.
    """
    # Person-level averages, already filtered by minimum group size
    person_level = create_boxplot_calc(data, metric, hrvar, mingroup)

    # Statistics to compute and the labels they end up with
    stat_names = ["mean", "median", "std", "min", "max", "count"]
    column_labels = {
        "mean": "mean",
        "median": "median",
        "std": "sd",
        "min": "min",
        "max": "max",
        "count": "n",
    }

    per_group = person_level.groupby("group", as_index=False)[metric]
    return per_group.agg(stat_names).rename(columns=column_labels)




[docs]
def create_boxplot_viz(data: pd.DataFrame, metric, hrvar, mingroup):
    """
    Draw a boxplot of `metric` by `hrvar` and return the matplotlib figure.

    The plot is drawn from the output of `create_boxplot_calc`, i.e. from
    per-person averages with groups below `mingroup` removed, so the chart
    is based on the same data as the summary table and the "data" output.

    Parameters
    ----------
    data : pandas dataframe
        Input data; also passed to `extract_date_range` for the caption.
    metric : str
        Name of the column plotted on the y axis.
    hrvar : str
        Name of the grouping column shown on the x axis.
    mingroup : int
        Minimum group size; smaller groups are left out of the plot.

    Returns
    -------
    matplotlib figure
        The figure holding the boxplot.
    """
    # Person-level averages, restricted to groups that meet `mingroup`
    plot_data = create_boxplot_calc(data, metric, hrvar, mingroup)

    # Clean labels for plotting
    clean_nm = metric.replace("_", " ")
    cap_str = extract_date_range(data, return_type='text')

    # Colour of the decorative line and tag above the title area
    col_highlight = Colors.HIGHLIGHT_NEGATIVE.value

    # Setup plot size.
    fig, ax = plt.subplots(figsize=(7, 4))

    # No grid behind the boxes
    ax.grid(False)

    # Remove spines. Can be done one at a time or can slice with a list.
    ax.spines[['top', 'right', 'left']].set_visible(False)

    # Generate boxplot; `create_boxplot_calc` renames `hrvar` to "group",
    # so restore the original variable name as the axis label.
    sns.boxplot(x="group", y=metric, data=plot_data, ax=ax)
    ax.set_xlabel(hrvar)

    # Add in line and tag (coordinates are relative to the figure)
    ax.plot(
        [0, .9],  # Horizontal extent of the line
        [0.9, 0.9],  # Height of the line
        transform=fig.transFigure,
        clip_on=False,
        color=col_highlight,
        linewidth=.6
    )
    ax.add_patch(
        plt.Rectangle(
            (0, 0.9),  # Anchor corner of the rectangle
            0.05,  # Width of rectangle
            -0.025,  # Height of rectangle (negative: extends downwards)
            facecolor=col_highlight,
            transform=fig.transFigure,
            clip_on=False,
            linewidth=0
        )
    )

    # Set title
    ax.text(
        x=0, y=1.00,
        s=(f"Distribution of {clean_nm.lower()}"),
        transform=fig.transFigure,
        ha='left',
        fontsize=13,
        weight='bold',
        alpha=.8
    )

    # Set subtitle
    ax.text(
        x=0, y=0.95,
        s=f'By {hrvar}',
        transform=fig.transFigure,
        ha='left',
        fontsize=11,
        alpha=.8
    )

    # Set caption
    ax.text(x=0, y=-0.08, s=cap_str, transform=fig.transFigure, ha='left', fontsize=9, alpha=.7)

    # Return the figure object; the caller decides whether to show it
    return fig

    

[docs]
def create_boxplot(data: pd.DataFrame, metric: str, hrvar: str ="Organization", mingroup=5, return_type: str = "plot"):
    """
    Name
    -----
    create_boxplot

    Description
    -----------
    This function creates a boxplot visualization and summary table for a given metric and HR variable
    in a pandas DataFrame.

    Parameters
    ----------
    data : pandas dataframe
        A pandas DataFrame containing the data for analysis. It must contain the columns "MetricDate", "PersonId" and the column named by `metric`.
    metric : str
        The `metric` parameter is a string that represents the variable or metric for which you want to create the boxplot visualization and summary table. This variable should be present in the input `data` DataFrame.
    hrvar : str, optional
        The `hrvar` parameter is the HR variable that you want to use for grouping the data. By default, it is set to "Organization", but you can pass a different HR variable if needed. If `None` is passed, `totals_col` is applied to the data and the grouping variable is set to "Total".
    mingroup: int, optional
        The `mingroup` parameter is an optional parameter that specifies the minimum group size. Groups with fewer distinct `PersonId` values than `mingroup` are excluded. The default value is 5.
    return_type : str, optional
        The `return_type` parameter determines the type of output that the function will return. It can take one of three values:
        - "plot" (default): the matplotlib figure produced by `create_boxplot_viz`.
        - "table": the summary table produced by `create_boxplot_summary`, with its index reset.
        - "data": the person-level data produced by `create_boxplot_calc`, with "group" converted to a categorical whose categories follow ascending group mean, sorted by "group" in descending order.

    Returns
    -------
    The function `create_boxplot` returns different outputs based on the value of the `return_type` parameter
    (see above).

    Raises
    ------
    AssertionError
        If "MetricDate", "PersonId" or `metric` is missing from `data`.
    KeyError
        If the grouping variable is missing from `data`.
    ValueError
        If `return_type` is not one of "plot", "table" or "data".

    Example
    -------
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_boxplot(pq_data, metric = "Collaboration_hours", hrvar = "Organization", return_type = "plot")

    """
    # Check inputs
    required_variables = ["MetricDate", metric, "PersonId"]

    # Raise explicitly rather than with `assert`, which is stripped when
    # Python runs with -O. AssertionError is kept so existing callers that
    # catch it keep working.
    missing = set(required_variables) - set(data.columns)
    if missing:
        raise AssertionError(f"Missing required variable(s): {missing}")

    # Handling NULL values passed to hrvar
    if hrvar is None:
        data = totals_col(data)
        hrvar = "Total"

    # A missing grouping column would otherwise only surface later as an
    # unrelated-looking KeyError on the renamed "group" column.
    if hrvar not in data.columns:
        raise KeyError(f"Grouping variable `{hrvar}` is not a column in `data`.")

    # Reject unknown return types before doing any computation
    if return_type not in ("plot", "table", "data"):
        raise ValueError("Please enter a valid input for `return_type`: 'plot', 'table' or 'data'.")

    # Main output
    if return_type == "plot":
        # Boxplot visualization
        return create_boxplot_viz(data, metric, hrvar, mingroup)

    # Summary table (needed for both the "table" and the "data" outputs)
    summary_table = create_boxplot_summary(data, metric, hrvar, mingroup)

    if return_type == "table":
        return pd.DataFrame(summary_table).reset_index()

    # return_type == "data": order groups by ascending mean of the metric
    group_ord = summary_table.sort_values(by="mean", ascending=True)["group"].tolist()
    plot_data = create_boxplot_calc(data, metric, hrvar, mingroup)
    return plot_data.assign(group=pd.Categorical(plot_data.group, categories=group_ord)).sort_values(by="group", ascending=False)