# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Analyze the proportion of a population above or below a metric threshold.
"""
__all__ = ['create_inc', 'create_inc_bar', 'create_inc_grid']
import typing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.color_codes import COLOR_PALLET_ALT_2
from vivainsights.create_bar import create_bar
from vivainsights.extract_date_range import extract_date_range
[docs]
def create_inc(
    data: pd.DataFrame,
    metric: str,
    hrvar: typing.Union[typing.List, str],
    mingroup: int = 5,
    threshold: float = None,
    position: str = None,
    return_type: str = 'plot',
    figsize: tuple = None,
):
    """
    Create an incidence analysis showing the proportion of employees above
    or below a metric threshold.

    This is a dispatcher: with a single ``hrvar`` it delegates to
    ``create_inc_bar``; with two ``hrvar`` values it delegates to
    ``create_inc_grid``.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data.
    metric : str
        Metric column name, e.g. ``"Collaboration_hours"``.
    hrvar : str or list of str
        HR variable(s) for grouping. A single name, or a list/tuple holding
        one or two names.
    mingroup : int
        Minimum group size. Defaults to 5.
    threshold : float, optional
        Value to split the population.
    position : str, optional
        ``"above"`` or ``"below"``.
    return_type : str
        ``"plot"`` (default) or ``"table"``.
    figsize : tuple, optional
        Figure size as ``(width, height)`` in inches, forwarded unchanged to
        the delegate function.

    Returns
    -------
    matplotlib.figure.Figure or pandas.DataFrame
        Plot or table depending on ``return_type``.

    Raises
    ------
    ValueError
        If ``hrvar`` is empty or has more than two elements.

    Examples
    --------
    Bar chart showing incidence above a threshold (single HR variable):

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="LevelDesignation",
    ...     threshold=10,
    ...     position="above",
    ... )

    Heatmap showing incidence with two HR variables:

    >>> vi.create_inc(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar=["LevelDesignation", "Organization"],
    ...     threshold=15,
    ...     position="below",
    ... )

    Return a summary table instead of a plot:

    >>> vi.create_inc(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="Organization",
    ...     threshold=10,
    ...     position="above",
    ...     return_type="table",
    ... )
    """
    # Normalise to a real list. Tuples are unpacked rather than wrapped so
    # that ("a", "b") means two variables, not one variable named by a tuple.
    if isinstance(hrvar, (list, tuple)):
        hrvar = list(hrvar)
    else:
        hrvar = [hrvar]

    # Reject an empty selection here; otherwise it would fall through to the
    # two-variable path and report a misleading "length 2" error.
    if not hrvar:
        raise ValueError("`hrvar` must contain one or two HR variables.")
    if len(hrvar) > 2:
        raise ValueError("`hrvar` can only accept a list of length 2.")

    if len(hrvar) == 1:
        return create_inc_bar(
            data, metric, hrvar[0], mingroup, threshold, position, return_type,
            figsize=figsize,
        )
    return create_inc_grid(
        data, metric, hrvar, mingroup, threshold, position, return_type,
        figsize=figsize,
    )
[docs]
def create_inc_bar(
    data: pd.DataFrame,
    metric: str,
    hrvar: str,
    mingroup: int = 5,
    threshold: float = None,
    position: str = None,
    return_type: str = 'plot',
    figsize: tuple = None,
):
    """
    Run incidence analysis with a single HR variable, returning a bar chart.

    The metric column is replaced by a boolean flag (``metric >= threshold``
    for ``"above"``, ``metric <= threshold`` for ``"below"``) on a copy of
    ``data``, and the result is handed to ``create_bar`` with
    ``percent=True``.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Not modified; a copy is transformed.
    metric : str
        Metric column name.
    hrvar : str
        HR variable for grouping.
    mingroup : int
        Minimum group size. Defaults to 5.
    threshold : float, optional
        Split threshold. Must be supplied; ``None`` is rejected.
    position : str, optional
        ``"above"`` or ``"below"``. Must be supplied.
    return_type : str
        ``"plot"`` (default) or ``"table"``; both are forwarded to
        ``create_bar``. ``"data"`` returns the transformed copy of ``data``
        (metric column converted to booleans) without calling ``create_bar``.
    figsize : tuple, optional
        Figure size as ``(width, height)`` in inches, forwarded to
        ``create_bar``.

    Returns
    -------
    matplotlib.figure.Figure or pandas.DataFrame
        Bar chart or summary table (or the transformed data for
        ``return_type="data"``).

    Raises
    ------
    ValueError
        If ``position`` is not ``"above"``/``"below"`` or ``threshold`` is
        ``None``.

    Examples
    --------
    Bar chart of incidence below a threshold:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc_bar(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="LevelDesignation",
    ...     threshold=20,
    ...     position="below",
    ... )

    Return a summary table:

    >>> vi.create_inc_bar(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="Organization",
    ...     threshold=10,
    ...     position="above",
    ...     return_type="table",
    ... )

    Customize figure size:

    >>> vi.create_inc_bar(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar="LevelDesignation",
    ...     threshold=15,
    ...     position="above",
    ...     figsize=(10, 5),
    ... )
    """
    # Validate up front so bad arguments fail before the DataFrame is copied
    # and with a clear message instead of a pandas comparison error.
    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")
    if threshold is None:
        raise ValueError("Please enter a valid input for `threshold`.")

    # Transform data so that the metric becomes a boolean incidence flag;
    # work on a copy so the caller's frame is left untouched.
    data_t = data.copy()
    if position == "above":
        data_t[metric] = data_t[metric] >= threshold
    else:
        data_t[metric] = data_t[metric] <= threshold

    if return_type == 'data':
        return data_t

    title_text = f"Incidence of {metric} {position} {threshold}"
    subtitle_text = f"Percentage and number of employees by {hrvar}"

    return create_bar(
        data_t,
        metric,
        hrvar,
        mingroup,
        percent=True,
        plot_title=title_text,
        plot_subtitle=subtitle_text,
        return_type=return_type,
        figsize=figsize,
    )
[docs]
def create_inc_grid(
    data: pd.DataFrame,
    metric: str,
    hrvar: typing.List,
    mingroup: int = 5,
    threshold: float = None,
    position: str = None,
    return_type: str = 'plot',
    figsize: tuple = None,
):
    """
    Run incidence analysis with two HR variables, returning a heatmap.

    Each row is flagged 1/0 depending on whether ``metric`` is at or
    above / at or below ``threshold``. The flag is averaged per person within
    each ``hrvar`` combination, then averaged across people to give the
    group ``incidence``; ``count`` is the number of distinct ``PersonId``
    values in the group. Groups with ``count < mingroup`` are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Must contain ``metric``, both ``hrvar`` columns
        and a ``PersonId`` column.
    metric : str
        Metric column name.
    hrvar : list of str
        Two HR variables for the heatmap axes.
    mingroup : int
        Minimum group size. Defaults to 5.
    threshold : float, optional
        Split threshold. Must be supplied; ``None`` is rejected.
    position : str, optional
        ``"above"`` or ``"below"``. Must be supplied.
    return_type : str
        ``"plot"`` (default) or ``"table"``.
    figsize : tuple, optional
        Figure size as ``(width, height)`` in inches. Defaults to ``(8, 6)``.

    Returns
    -------
    matplotlib.figure.Figure or pandas.DataFrame
        Heatmap, or a summary table with the two ``hrvar`` columns plus
        ``incidence`` and ``count``, sorted by ``incidence`` descending.

    Raises
    ------
    ValueError
        If ``hrvar`` is not a list of length 2, if ``position``,
        ``threshold`` or ``return_type`` is invalid, or if a plot is
        requested but no group meets ``mingroup``.

    Examples
    --------
    Generate a heatmap of incidence across two HR variables:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc_grid(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar=["LevelDesignation", "Organization"],
    ...     threshold=15,
    ...     position="above",
    ... )

    Return a summary table instead:

    >>> vi.create_inc_grid(
    ...     pq_data,
    ...     metric="Collaboration_hours",
    ...     hrvar=["LevelDesignation", "Organization"],
    ...     threshold=10,
    ...     position="below",
    ...     return_type="table",
    ... )
    """
    # Validate every argument before doing any aggregation work, so callers
    # get a direct message rather than a failure from inside pandas.
    if not isinstance(hrvar, list) or len(hrvar) != 2:
        raise ValueError("`hrvar` must be a list of length 2.")
    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")
    if threshold is None:
        raise ValueError("Please enter a valid input for `threshold`.")
    if return_type not in ("table", "plot"):
        raise ValueError("Please enter a valid input for `return_type`: Either `table` or `plot`.")

    if position == "above":
        meets_threshold = data[metric] >= threshold
    else:
        meets_threshold = data[metric] <= threshold

    inc_table: pd.DataFrame = (
        data
        .assign(metric_inc=np.where(meets_threshold, 1, 0))
        # Person-level average first, so every person carries equal weight
        # in the group-level mean regardless of how many rows they have.
        .groupby(hrvar + ['PersonId'], as_index=False)
        .agg({'metric_inc': 'mean'})
        .groupby(hrvar, as_index=False)
        .agg({'metric_inc': 'mean', 'PersonId': 'nunique'})
        .rename(columns={'metric_inc': 'incidence', 'PersonId': 'count'})
    )
    # Apply the minimum group size, then rank groups by incidence.
    inc_table = (
        inc_table[inc_table['count'] >= mingroup]
        .sort_values('incidence', ascending=False)
    )

    if return_type == "table":
        return inc_table
    return _plot_inc_grid(data, inc_table, metric, hrvar, threshold, position, figsize)


def _plot_inc_grid(data, inc_table, metric, hrvar, threshold, position, figsize):
    """Render the incidence summary table as an annotated seaborn heatmap and return the figure."""
    if inc_table.empty:
        # An empty pivot cannot be drawn; fail here with an actionable message.
        raise ValueError("No groups meet the `mingroup` threshold; nothing to plot.")

    title_text = f"Incidence of {metric.replace('_', ' ').capitalize()} {position} {threshold}"
    subtitle_text = f"Percentage and number of employees by {hrvar[0]} and {hrvar[1]}"
    cap_str = extract_date_range(data, return_type='text')

    # Cell labels such as "42.0% (17)". Built column-wise so the count keeps
    # its integer formatting whatever the dtypes of the HR columns are.
    inc_table = inc_table.assign(
        metric_text=[
            f"{inc * 100:.1f}% ({cnt})"
            for inc, cnt in zip(inc_table['incidence'], inc_table['count'])
        ]
    )

    # Put the variable with more distinct values on the columns so the grid
    # fits a landscape figure.
    row_var, col_var = hrvar
    if inc_table[row_var].nunique() > inc_table[col_var].nunique():
        row_var, col_var = col_var, row_var

    annot_df = inc_table.pivot(index=row_var, columns=col_var, values='metric_text')
    value_df = inc_table.pivot(index=row_var, columns=col_var, values='incidence')

    fig, ax = plt.subplots(figsize=figsize if figsize else (8, 6))

    # No background grid and no tick marks on any side.
    ax.grid(False)
    ax.tick_params(
        which='both',
        top=False,
        bottom=False,
        left=False,
        right=False,
    )

    sns.set_theme(font_scale=0.7)

    sns.heatmap(
        value_df,
        annot=annot_df,
        fmt='',  # annotations are already formatted strings
        cmap=COLOR_PALLET_ALT_2,
        center=0.5,
        square=True,
        ax=ax,
    )

    # Accent line and tag, positioned in figure coordinates.
    ax.plot(
        [0, .9],
        [0.9, 0.9],
        transform=fig.transFigure,
        clip_on=False,
        color='#fe7f4f',
        linewidth=.6,
    )
    ax.add_patch(
        plt.Rectangle(
            (0, 0.9),  # anchor corner of the rectangle
            0.05,      # width
            -0.025,    # negative height: the tag hangs below the line
            facecolor='#fe7f4f',
            transform=fig.transFigure,
            clip_on=False,
            linewidth=0,
        )
    )

    # Title, subtitle and caption, also in figure coordinates.
    ax.text(
        x=0, y=1.00,
        s=title_text,
        transform=fig.transFigure,
        ha='left',
        fontsize=13,
        weight='bold',
        alpha=.8,
    )
    ax.text(
        x=0, y=0.95,
        s=subtitle_text,
        transform=fig.transFigure,
        ha='left',
        fontsize=11,
        alpha=.8,
    )
    ax.text(x=0, y=0.02, s=cap_str, transform=fig.transFigure, ha='left', fontsize=9, alpha=.7)

    return fig