Source code for vivainsights.create_boxplot

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
The function `create_boxplot` creates a boxplot visualization and summary table for a given metric
and grouping variable in a dataset.
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.extract_date_range import extract_date_range
from vivainsights.color_codes import *
from vivainsights.totals_col import *

def create_boxplot_calc(data: pd.DataFrame, metric, hrvar, mingroup):
    """
    Compute the person-level data that backs the boxplot outputs.

    Parameters
    ----------
    data : pandas dataframe
        Input data. Must contain a `PersonId` column, the `metric` column and
        the `hrvar` column.
    metric : str
        Name of the column to average for each person.
    hrvar : str
        Name of the grouping column. It is renamed to `group` in the output.
    mingroup : int
        Minimum number of distinct `PersonId` values a group must have to be
        kept.

    Returns
    -------
    pandas dataframe
        One row per (`PersonId`, `group`) pair with the columns `PersonId`,
        `group`, `metric` (the person's mean of the metric within the group)
        and `Employee_Count` (distinct people in the group). Groups with
        fewer than `mingroup` people are dropped. The input frame is not
        modified.
    """
    # Rename once and reuse the result for both aggregations below.
    renamed = data.rename(columns={hrvar: "group"})

    # One row per person and group: the person's mean of the metric.
    person_means = (
        renamed.groupby(["PersonId", "group"], as_index=False)[metric]
        .mean()
    )

    # Number of distinct people in each group.
    group_sizes = (
        renamed.groupby("group", as_index=False)["PersonId"]
        .nunique()
        .rename(columns={"PersonId": "Employee_Count"})
    )

    plot_data = person_means.merge(group_sizes, on="group")

    # Keep only groups that meet the minimum size threshold.
    return plot_data[plot_data["Employee_Count"] >= mingroup]
def create_boxplot_summary(data: pd.DataFrame, metric, hrvar, mingroup):
    """
    Build the per-group summary statistics table for the boxplot.

    Parameters
    ----------
    data : pandas dataframe
        Input data; passed unchanged to `create_boxplot_calc`.
    metric : str
        Name of the metric column to summarise.
    hrvar : str
        Name of the grouping column.
    mingroup : int
        Minimum group size; passed unchanged to `create_boxplot_calc`.

    Returns
    -------
    pandas dataframe
        One row per group with the columns `group`, `mean`, `median`, `sd`,
        `min`, `max` and `n`. The statistics are computed over the
        person-level values returned by `create_boxplot_calc`, so `n` counts
        the non-null person-level values in the group.
    """
    # Person-level averages, already filtered by the minimum group size.
    plot_data = create_boxplot_calc(data, metric, hrvar, mingroup)

    # Aggregate with the group as the index and move it to a column
    # explicitly, instead of relying on `as_index=False` being honoured for a
    # list of aggregations. Callers index the result by the "group" column.
    summary_table = (
        plot_data.groupby("group")[metric]
        .agg(["mean", "median", "std", "min", "max", "count"])
        .rename(columns={"std": "sd", "count": "n"})
        .reset_index()
    )

    return summary_table
def create_boxplot_viz(data: pd.DataFrame, metric, hrvar, mingroup):
    """
    Draw the boxplot of a metric by group and return the matplotlib figure.

    The plotted values are the person-level averages produced by
    `create_boxplot_calc`, so groups with fewer than `mingroup` distinct
    people are excluded from the plot, as they are from the table and data
    outputs.

    Parameters
    ----------
    data : pandas dataframe
        Input data. Must contain `PersonId`, the `metric` column and the
        `hrvar` column. It is also passed to `extract_date_range` to build
        the caption text.
    metric : str
        Name of the metric column plotted on the y-axis.
    hrvar : str
        Name of the grouping column plotted on the x-axis.
    mingroup : int
        Minimum number of distinct people a group needs to be plotted.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the boxplot. The figure is not shown here.
    """
    # Same person-level, threshold-filtered data used by the summary table.
    plot_data = create_boxplot_calc(data, metric, hrvar, mingroup)

    # Clean labels for plotting; the caption is built from the full input.
    clean_nm = metric.replace("_", " ")
    cap_str = extract_date_range(data, return_type='text')

    col_highlight = Colors.HIGHLIGHT_NEGATIVE.value

    # Setup plot size.
    fig, ax = plt.subplots(figsize=(7, 4))

    # No grid, and only the bottom spine is kept.
    ax.grid(False)
    ax.spines[['top', 'right', 'left']].set_visible(False)

    # Generate boxplot. `create_boxplot_calc` renames `hrvar` to "group", so
    # the axis label is set back to the original variable name.
    sns.boxplot(x="group", y=metric, data=plot_data, ax=ax)
    ax.set_xlabel(hrvar)

    # Accent line across the top of the figure (figure coordinates).
    ax.plot(
        [0, .9],      # Horizontal extent of the line
        [0.9, 0.9],   # Height of the line
        transform=fig.transFigure,
        clip_on=False,
        color=col_highlight,
        linewidth=.6
    )

    # Small accent tag hanging from the left end of the line.
    ax.add_patch(
        plt.Rectangle(
            (0, 0.9),   # Anchor corner of the rectangle
            0.05,       # Width of rectangle
            -0.025,     # Height of rectangle (negative: extends downwards)
            facecolor=col_highlight,
            transform=fig.transFigure,
            clip_on=False,
            linewidth=0
        )
    )

    # Set title
    ax.text(
        x=0, y=1.00,
        s=(f"Distribution of {clean_nm.lower()}"),
        transform=fig.transFigure,
        ha='left',
        fontsize=13,
        weight='bold',
        alpha=.8
    )

    # Set subtitle
    ax.text(
        x=0, y=0.95,
        s=f'By {hrvar}',
        transform=fig.transFigure,
        ha='left',
        fontsize=11,
        alpha=.8
    )

    # Set caption
    ax.text(x=0, y=-0.08, s=cap_str, transform=fig.transFigure, ha='left', fontsize=9, alpha=.7)

    # Return the figure; the caller decides whether to show or save it.
    return fig
def create_boxplot(data: pd.DataFrame, metric: str, hrvar: str = "Organization", mingroup=5, return_type: str = "plot"):
    """
    Name
    -----
    create_boxplot

    Description
    -----------
    Create a boxplot visualization, a summary table, or the underlying
    person-level data for a given metric and HR variable in a pandas
    DataFrame.

    Parameters
    ----------
    data : pandas dataframe
        Data for analysis. Must contain the columns `MetricDate`, `PersonId`,
        the `metric` column and the `hrvar` column.
    metric : str
        Name of the metric column to analyse.
    hrvar : str, optional
        Name of the HR variable used to group the data. Defaults to
        "Organization". If `None` is passed, `totals_col` is applied to
        `data` and the column "Total" is used as the grouping variable.
    mingroup : int, optional
        Minimum number of distinct `PersonId` values a group needs in order
        to be kept in the "table" and "data" outputs. The value is also
        passed on to `create_boxplot_viz` for the "plot" output. Defaults
        to 5.
    return_type : str, optional
        Type of output to return. One of:

        - "plot" (default): the figure returned by `create_boxplot_viz`.
        - "table": the per-group summary from `create_boxplot_summary`
          (mean, median, sd, min, max, n), after `reset_index()`.
        - "data": the person-level data from `create_boxplot_calc`, with
          `group` converted to a categorical whose categories are ordered by
          ascending group mean, and rows sorted by that order, descending.

    Returns
    -------
    A matplotlib figure when `return_type` is "plot", otherwise a pandas
    DataFrame as described above.

    Raises
    ------
    AssertionError
        If any required column (`MetricDate`, `PersonId`, `metric`, `hrvar`)
        is missing from `data`.
    ValueError
        If `return_type` is not one of "plot", "table" or "data".

    Example
    -------
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> create_boxplot(pq_data, metric = "Collaboration_hours", hrvar = "Organization", return_type = "plot")
    """
    # Check inputs. Raised explicitly (not via `assert`) so the validation
    # still runs under `python -O`; the exception type is kept for
    # backward compatibility.
    required_variables = ["MetricDate", metric, "PersonId"]
    missing = set(required_variables) - set(data.columns)
    if missing:
        raise AssertionError(f"Missing required variable(s): {missing}")

    # Handling NULL values passed to hrvar
    if hrvar is None:
        data = totals_col(data)
        hrvar = "Total"

    # Fail early with a clear message rather than a KeyError on "group"
    # from deep inside the calculation helpers.
    if hrvar not in data.columns:
        missing_hrvar = {hrvar}
        raise AssertionError(f"Missing required variable(s): {missing_hrvar}")

    if return_type not in ("plot", "table", "data"):
        raise ValueError(
            "Please enter a valid input for `return_type`: 'plot', 'table' or 'data'."
        )

    # Main output: each branch computes only what it needs.
    if return_type == "plot":
        return create_boxplot_viz(data, metric, hrvar, mingroup)

    if return_type == "table":
        summary_table = create_boxplot_summary(data, metric, hrvar, mingroup)
        return pd.DataFrame(summary_table).reset_index()

    # return_type == "data"
    plot_data = create_boxplot_calc(data, metric, hrvar, mingroup)

    # Groups ordered by ascending mean of the person-level values; this is
    # the same "mean" that the summary table reports.
    group_ord = (
        plot_data.groupby("group")[metric]
        .mean()
        .sort_values(ascending=True)
        .index.tolist()
    )

    return (
        plot_data
        .assign(group=pd.Categorical(plot_data["group"], categories=group_ord))
        .sort_values(by="group", ascending=False)
    )