# Source code for vivainsights.create_inc

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Analyze the proportion of a population above or below a metric threshold.
"""

# Public names exported by ``from vivainsights.create_inc import *``.
__all__ = ['create_inc', 'create_inc_bar', 'create_inc_grid']

import typing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.color_codes import COLOR_PALLET_ALT_2
from vivainsights.create_bar import create_bar
from vivainsights.extract_date_range import extract_date_range

def create_inc(
    data: pd.DataFrame,
    metric: str,
    hrvar: typing.Union[typing.List, str],
    mingroup: int = 5,
    threshold: typing.Optional[float] = None,
    position: typing.Optional[str] = None,
    return_type: str = 'plot',
    figsize: typing.Optional[tuple] = None,
):
    """
    Create an incidence analysis showing the proportion of employees above
    or below a metric threshold.

    This function is a dispatcher: with a single ``hrvar`` the work is
    delegated to ``create_inc_bar``; with two ``hrvar`` values it is
    delegated to ``create_inc_grid``.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data.
    metric : str
        Metric column name, e.g. ``"Collaboration_hours"``.
    hrvar : str or list of str
        HR variable(s) for grouping. Either a single column name or a
        list/tuple holding one or two column names.
    mingroup : int
        Minimum group size. Defaults to 5.
    threshold : float, optional
        Value to split the population.
    position : str, optional
        ``"above"`` or ``"below"``.
    return_type : str
        ``"plot"`` (default) or ``"table"``. The value is forwarded
        unchanged to the delegate function.
    figsize : tuple, optional
        Figure size as ``(width, height)`` in inches. Forwarded unchanged
        to the delegate function.

    Returns
    -------
    matplotlib.figure.Figure or pandas.DataFrame
        Plot or table depending on ``return_type``.

    Raises
    ------
    ValueError
        If ``hrvar`` is empty or has more than two elements.

    Examples
    --------
    Bar chart showing incidence above a threshold (single HR variable):

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="LevelDesignation",
    ...     threshold=10,
    ...     position="above",
    ... )

    Heatmap showing incidence with two HR variables:

    >>> vi.create_inc(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar=["LevelDesignation", "Organization"],
    ...     threshold=15,
    ...     position="below",
    ... )

    Return a summary table instead of a plot:

    >>> vi.create_inc(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="Organization",
    ...     threshold=10,
    ...     position="above",
    ...     return_type="table",
    ... )
    """
    # Normalise `hrvar` to a list. Tuples are unpacked rather than wrapped,
    # so ("A", "B") means two variables, not one tuple-valued column name.
    if isinstance(hrvar, (list, tuple)):
        hrvar = list(hrvar)
    else:
        hrvar = [hrvar]

    # Reject both the empty case and more than two variables here, so the
    # error names this function's own contract.
    if not 1 <= len(hrvar) <= 2:
        raise ValueError(
            "`hrvar` must be a single variable name or a list of one or two "
            "variable names."
        )

    if len(hrvar) == 1:
        return create_inc_bar(
            data=data,
            metric=metric,
            hrvar=hrvar[0],
            mingroup=mingroup,
            threshold=threshold,
            position=position,
            return_type=return_type,
            figsize=figsize,
        )

    return create_inc_grid(
        data=data,
        metric=metric,
        hrvar=hrvar,
        mingroup=mingroup,
        threshold=threshold,
        position=position,
        return_type=return_type,
        figsize=figsize,
    )
def create_inc_bar(
    data: pd.DataFrame,
    metric: str,
    hrvar: str,
    mingroup: int = 5,
    threshold: typing.Optional[float] = None,
    position: typing.Optional[str] = None,
    return_type: str = 'plot',
    figsize: typing.Optional[tuple] = None,
):
    """
    Run incidence analysis with a single HR variable, returning a bar chart.

    The metric column is replaced by a boolean flag (``>= threshold`` for
    ``"above"``, ``<= threshold`` for ``"below"``) on a copy of ``data``,
    and the result is handed to ``create_bar`` with ``percent=True``.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. The input is not modified.
    metric : str
        Metric column name.
    hrvar : str
        HR variable for grouping.
    mingroup : int
        Minimum group size. Defaults to 5.
    threshold : float, optional
        Split threshold. Must be supplied.
    position : str, optional
        ``"above"`` or ``"below"``. Must be supplied.
    return_type : str
        ``"plot"`` (default) or ``"table"``; these are forwarded to
        ``create_bar``. ``"data"`` returns the transformed copy of ``data``
        without calling ``create_bar``.
    figsize : tuple, optional
        Figure size as ``(width, height)`` in inches. Forwarded unchanged
        to ``create_bar``.

    Returns
    -------
    matplotlib.figure.Figure or pandas.DataFrame
        Bar chart or summary table as produced by ``create_bar``, or the
        transformed data when ``return_type="data"``.

    Raises
    ------
    ValueError
        If ``position`` is not ``"above"`` or ``"below"``, or if
        ``threshold`` is ``None``.

    Examples
    --------
    Bar chart of incidence below a threshold:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc_bar(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="LevelDesignation",
    ...     threshold=20,
    ...     position="below",
    ... )

    Return a summary table:

    >>> vi.create_inc_bar(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="Organization",
    ...     threshold=10,
    ...     position="above",
    ...     return_type="table",
    ... )

    Customize figure size:

    >>> vi.create_inc_bar(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="LevelDesignation",
    ...     threshold=15,
    ...     position="above",
    ...     figsize=(10, 5),
    ... )
    """
    # Validate arguments before copying the (potentially large) frame.
    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")
    if threshold is None:
        # A missing threshold cannot yield a meaningful incidence flag.
        raise ValueError("Please enter a valid input for `threshold`.")

    # Transform data so that the metric becomes a boolean incidence flag;
    # work on a copy so the caller's frame is left untouched.
    data_t = data.copy()
    if position == "above":
        data_t[metric] = data_t[metric] >= threshold
    else:
        data_t[metric] = data_t[metric] <= threshold

    if return_type == 'data':
        return data_t

    title_text = f"Incidence of {metric} {position} {threshold}"
    subtitle_text = f"Percentage and number of employees by {hrvar}"

    return create_bar(
        data_t,
        metric,
        hrvar,
        mingroup,
        percent=True,
        plot_title=title_text,
        plot_subtitle=subtitle_text,
        return_type=return_type,
        figsize=figsize,
    )
def create_inc_grid(
    data: pd.DataFrame,
    metric: str,
    hrvar: typing.List,
    mingroup: int = 5,
    threshold: typing.Optional[float] = None,
    position: typing.Optional[str] = None,
    return_type: str = 'plot',
    figsize: typing.Optional[tuple] = None,
):
    """
    Run incidence analysis with two HR variables, returning a heatmap.

    Each row of ``data`` is flagged 1/0 depending on whether ``metric`` is
    ``>= threshold`` (``"above"``) or ``<= threshold`` (``"below"``). The
    flag is averaged per ``PersonId`` within each ``hrvar`` combination,
    then averaged across persons to give the group incidence. Groups with
    fewer than ``mingroup`` distinct persons are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Must contain ``PersonId``, ``metric`` and both
        ``hrvar`` columns.
    metric : str
        Metric column name.
    hrvar : list of str
        Two HR variables for the heatmap axes.
    mingroup : int
        Minimum group size. Defaults to 5.
    threshold : float, optional
        Split threshold. Must be supplied.
    position : str, optional
        ``"above"`` or ``"below"``. Must be supplied.
    return_type : str
        ``"plot"`` (default) or ``"table"``.
    figsize : tuple, optional
        Figure size as ``(width, height)`` in inches. Defaults to
        ``(8, 6)``.

    Returns
    -------
    matplotlib.figure.Figure or pandas.DataFrame
        Heatmap or summary table. The table has the two ``hrvar`` columns
        plus ``incidence`` and ``count``, sorted by descending incidence.

    Raises
    ------
    ValueError
        If ``hrvar`` is not a list of length 2, if ``position``,
        ``threshold`` or ``return_type`` is invalid, or if a plot is
        requested but no group meets ``mingroup``.

    Examples
    --------
    Generate a heatmap of incidence across two HR variables:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc_grid(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar=["LevelDesignation", "Organization"],
    ...     threshold=15,
    ...     position="above",
    ... )

    Return a summary table instead:

    >>> vi.create_inc_grid(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar=["LevelDesignation", "Organization"],
    ...     threshold=10,
    ...     position="below",
    ...     return_type="table",
    ... )
    """
    # Validate every argument up front so bad input fails before any
    # aggregation work is done.
    if not isinstance(hrvar, list) or len(hrvar) != 2:
        raise ValueError("`hrvar` must be a list of length 2.")
    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")
    if threshold is None:
        raise ValueError("Please enter a valid input for `threshold`.")
    if return_type not in ("plot", "table"):
        raise ValueError(
            "Please enter a valid input for `return_type`: Either `table` or `plot`."
        )

    # Row-level 1/0 indicator of whether the metric meets the threshold.
    if position == "above":
        meets_threshold = data[metric] >= threshold
    else:
        meets_threshold = data[metric] <= threshold
    metric_inc = np.where(meets_threshold, 1, 0)

    # Two-stage aggregation: person-level mean first, so that every person
    # carries equal weight in the group-level incidence.
    table: pd.DataFrame = (
        data
        .assign(metric_inc=metric_inc)
        .groupby(hrvar + ['PersonId'], as_index=False)
        .agg({'metric_inc': 'mean'})
        .groupby(hrvar, as_index=False)
        .agg({'metric_inc': 'mean', 'PersonId': 'nunique'})
        .rename(columns={'metric_inc': 'incidence', 'PersonId': 'count'})
        .loc[lambda grouped: grouped['count'] >= mingroup]
        .sort_values('incidence', ascending=False)
    )

    if return_type == "table":
        return table

    # A heatmap cannot be drawn from an empty table; fail with a clear message.
    if table.empty:
        raise ValueError(
            "No groups meet the `mingroup` threshold; there is nothing to plot."
        )

    title_text = f"Incidence of {metric.replace('_', ' ').capitalize()} {position} {threshold}"
    subtitle_text = f"Percentage and number of employees by {hrvar[0]} and {hrvar[1]}"
    cap_str = extract_date_range(data, return_type='text')

    # Cell annotation, e.g. "75.0% (12)". Built column-wise so the count is
    # always rendered as an integer regardless of the hrvar column dtypes.
    table = table.assign(
        metric_text=[
            f"{incidence * 100:.1f}% ({count})"
            for incidence, count in zip(table['incidence'], table['count'])
        ]
    )

    # Put the variable with more distinct values on the columns so the
    # grid suits a landscape figure.
    row_var, col_var = hrvar
    if table[row_var].nunique() > table[col_var].nunique():
        row_var, col_var = col_var, row_var

    incidence_df = table.pivot(index=row_var, columns=col_var, values='incidence')
    annot_df = table.pivot(index=row_var, columns=col_var, values='metric_text')

    fig, ax = plt.subplots(figsize=figsize if figsize else (8, 6))

    ax.grid(False)

    # Remove tick marks on all four sides, for major and minor ticks.
    ax.tick_params(
        which='both',
        top=False,
        bottom=False,
        left=False,
        right=False,
    )

    # NOTE(review): `sns.set_theme` changes global plotting defaults, so it
    # also affects figures created after this call. Kept for compatibility.
    sns.set_theme(font_scale=0.7)

    sns.heatmap(
        incidence_df,
        annot=annot_df,
        fmt='',  # annotations are pre-formatted strings
        cmap=COLOR_PALLET_ALT_2,
        center=0.5,
        square=True,
        ax=ax,
    )

    # Accent line and tag, positioned in figure coordinates.
    ax.plot(
        [0, .9],
        [0.9, 0.9],
        transform=fig.transFigure,
        clip_on=False,
        color='#fe7f4f',
        linewidth=.6,
    )
    ax.add_patch(
        plt.Rectangle(
            (0, 0.9),  # anchor corner of the rectangle
            0.05,      # width
            -0.025,    # negative height: extends downwards from the line
            facecolor='#fe7f4f',
            transform=fig.transFigure,
            clip_on=False,
            linewidth=0,
        )
    )

    # Title
    ax.text(
        x=0,
        y=1.00,
        s=title_text,
        transform=fig.transFigure,
        ha='left',
        fontsize=13,
        weight='bold',
        alpha=.8,
    )

    # Subtitle
    ax.text(
        x=0,
        y=0.95,
        s=subtitle_text,
        transform=fig.transFigure,
        ha='left',
        fontsize=11,
        alpha=.8,
    )

    # Caption (date range of the data)
    ax.text(
        x=0,
        y=0.02,
        s=cap_str,
        transform=fig.transFigure,
        ha='left',
        fontsize=9,
        alpha=.7,
    )

    return fig