# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This module creates an incidence analysis reflecting the proportion of the population scoring above or below a specified threshold for a metric.
"""
import typing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vivainsights.color_codes import COLOR_PALLET_ALT_2
from vivainsights.create_bar import create_bar
from vivainsights.extract_date_range import extract_date_range
[docs]
def create_inc(data: pd.DataFrame, metric: str, hrvar: typing.Union[str, typing.List[str]], mingroup: int = 5, threshold: float = None, position: str = None, return_type: str = 'plot'):
    """
    Name
    ----
    create_inc

    Description
    -----------
    Create an incidence analysis reflecting the proportion of the population
    scoring above or below a threshold for a metric.

    This function only normalises and validates `hrvar` and then dispatches:
    a single HR variable is forwarded to `create_inc_bar`, two HR variables
    are forwarded to `create_inc_grid`. All other arguments are passed
    through unchanged.

    Parameters
    ----------
    data : pandas dataframe
        A Standard Person Query dataset in the form of a Pandas DataFrame.
    metric : str
        Name of the metric, e.g. "Collaboration_hours".
    hrvar : str or list
        Name(s) of the HR Variable(s) by which to split metrics. A string is
        treated as a one-element list; a list must hold one or two names.
    mingroup : int
        Privacy threshold / minimum group size. Defaults to 5.
    threshold : float
        Threshold value to split the data based on the position argument.
        Defaults to None.
    position : str
        One of the below valid values:
        - "above": show incidence of those equal to or above the threshold
        - "below": show incidence of those equal to or below the threshold
    return_type : str
        What to return. This must be one of the following strings:
        - "plot"
        - "table"

    Returns
    -------
    Whatever the selected helper returns for the given `return_type`:
    - "plot": Matplotlib or Seaborn plot object
    - "table": Pandas DataFrame

    Raises
    ------
    ValueError: If `hrvar` is an empty list or a list with more than two
    entries.

    Example
    -------
    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.create_inc(
    ...     pq_data,
    ...     metric = 'Collaboration_hours',
    ...     hrvar = 'LevelDesignation',
    ...     mingroup = 5,
    ...     threshold = 10,
    ...     position = 'above',
    ...     return_type = 'plot'
    ... )
    """
    # A bare column name is shorthand for a one-element list.
    if not isinstance(hrvar, list):
        hrvar = [hrvar]

    # Reject both empty and over-long lists here so the caller gets one
    # consistent message instead of an error raised from a helper.
    if not 1 <= len(hrvar) <= 2:
        raise ValueError("`hrvar` must be a string or a list of one or two column names.")

    if len(hrvar) == 1:
        return create_inc_bar(data, metric, hrvar[0], mingroup, threshold, position, return_type)
    return create_inc_grid(data, metric, hrvar, mingroup, threshold, position, return_type)
[docs]
def create_inc_bar(data: pd.DataFrame, metric: str, hrvar: str, mingroup: int = 5, threshold: float = None, position: str = None, return_type: str='plot'):
    """
    Name
    -----
    create_inc_bar

    Description
    -----------
    Run `create_inc` with only a single `hrvar`, returning a bar chart.

    The metric column of a copy of `data` is replaced by a boolean flag
    (whether each row meets the threshold) and the result is handed to
    `create_bar` with `percent=True`. The input DataFrame is not modified.

    Parameters
    ----------
    data : pandas dataframe
        A Standard Person Query dataset in the form of a Pandas DataFrame.
    metric : str
        Name of the metric, e.g. "Collaboration_hours".
    hrvar : str
        Name of the HR Variable by which to split metrics.
    mingroup : int
        Privacy threshold / minimum group size. Defaults to 5.
    threshold : float
        Threshold value to split the data based on the position argument.
        Must be supplied; the default of None is rejected.
    position : str
        One of the below valid values:
        - "above": show incidence of those equal to or above the threshold
        - "below": show incidence of those equal to or below the threshold
    return_type : str
        What to return:
        - "data": the copied DataFrame with `metric` converted to booleans
        - any other value (e.g. "plot", "table") is forwarded to `create_bar`

    Returns
    -------
    - "data": Pandas DataFrame with the boolean-transformed metric column
    - otherwise: whatever `create_bar` returns for the given `return_type`

    Raises
    ------
    ValueError: If `position` is not "above" or "below", or if `threshold`
    is None.

    Example
    -------
    >>> create_inc_bar(data = pq_data, metric = "Collaboration_hours", hrvar = "LevelDesignation", threshold = 20, position = "below", return_type = "plot")
    """
    # Validate up-front so bad input fails fast, before the DataFrame is
    # copied and before pandas raises an opaque comparison TypeError.
    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")
    if threshold is None:
        raise ValueError("Please enter a valid input for `threshold`.")

    # Transform data so that metrics become proportions: work on a copy so the
    # caller's DataFrame keeps its original metric values.
    data_t = data.copy()
    if position == "above":
        data_t[metric] = data_t[metric] >= threshold
    else:
        data_t[metric] = data_t[metric] <= threshold

    if return_type == 'data':
        return data_t

    title_text = f"Incidence of {metric} {position} {threshold}"
    subtitle_text = f"Percentage and number of employees by {hrvar}"

    return create_bar(
        data_t,
        metric,
        hrvar,
        mingroup,
        percent = True,
        plot_title = title_text,
        plot_subtitle = subtitle_text,
        return_type = return_type
    )
[docs]
def create_inc_grid(data: pd.DataFrame, metric: str, hrvar: typing.List, mingroup: int=5, threshold: float=None, position: str=None, return_type: str='plot'):
    """
    Name
    -----
    create_inc_grid

    Description
    -----------
    Run `create_inc` with two `hrvar`, returning a heatmap.

    Each row of `data` is flagged 1/0 depending on whether `metric` meets the
    threshold. The flag is averaged per person within each combination of the
    two HR variables, then averaged across persons to give the incidence of
    that combination. Combinations with fewer than `mingroup` distinct
    `PersonId` values are dropped.

    Parameters
    ----------
    data : pandas dataframe
        A Standard Person Query dataset in the form of a Pandas DataFrame.
        Must contain a `PersonId` column in addition to `metric` and the two
        `hrvar` columns.
    metric : str
        Name of the metric, e.g. "Collaboration_hours".
    hrvar : list
        Names of the two HR Variables by which to split metrics.
    mingroup : int
        Privacy threshold / minimum group size. Defaults to 5.
    threshold : float
        Threshold value to split the data based on the position argument.
        Must be supplied; the default of None is rejected.
    position : str
        One of the below valid values:
        - "above": show incidence of those equal to or above the threshold
        - "below": show incidence of those equal to or below the threshold
    return_type : str
        What to return. This must be one of the following strings:
        - "plot"
        - "table"

    Returns
    -------
    Output is returned depending on the value passed to the return_type argument:
    - "plot": Matplotlib figure containing the annotated heatmap
    - "table": Pandas DataFrame with the two `hrvar` columns, `incidence`
      and `count`, sorted by descending incidence

    Raises
    ------
    ValueError: If `hrvar` is not a list of length 2, if `position` is not
    "above" or "below", if `threshold` is None, or if `return_type` is not
    "table" or "plot".
    """
    # Validate everything before doing any work so that bad arguments produce
    # a clear error instead of a silently wrong table or a pandas traceback.
    if not isinstance(hrvar, list) or len(hrvar) != 2:
        raise ValueError("`hrvar` must be a list of length 2.")
    if position not in ("above", "below"):
        raise ValueError("Please enter a valid input for `position`.")
    if threshold is None:
        raise ValueError("Please enter a valid input for `threshold`.")
    if return_type not in ("table", "plot"):
        raise ValueError("Please enter a valid input for `return_type`: Either `table` or `plot`.")

    if position == "above":
        meets_threshold = data[metric] >= threshold
    else:
        meets_threshold = data[metric] <= threshold

    incidence_table: pd.DataFrame = (
        data
        .assign(metric_inc=np.where(meets_threshold, 1, 0))
        # First collapse to one value per person within each group ...
        .groupby(hrvar + ['PersonId'], as_index=False)
        .agg({'metric_inc': 'mean'})
        # ... then average across persons and count distinct persons.
        .groupby(hrvar, as_index=False)
        .agg({'metric_inc': 'mean', 'PersonId': 'nunique'})
        .rename(columns={'metric_inc': 'incidence', 'PersonId': 'count'})
        # Enforce the privacy threshold on group size.
        .loc[lambda tbl: tbl['count'] >= mingroup]
        .sort_values('incidence', ascending=False)
    )

    if return_type == "table":
        return incidence_table

    # Set title text
    title_text = f"Incidence of {metric.replace('_', ' ').capitalize()} {position} {threshold}"
    # Set subtitle text
    subtitle_text = f"Percentage and number of employees by {hrvar[0]} and {hrvar[1]}"
    cap_str = extract_date_range(data, return_type = 'text')

    # Cell labels of the form "12.3% (45)". Built column-wise so the count
    # keeps its integer formatting regardless of the other column dtypes.
    incidence_table['metric_text'] = [
        f"{inc * 100:.1f}% ({n})"
        for inc, n in zip(incidence_table['incidence'], incidence_table['count'])
    ]

    # Put the variable with more distinct values on the columns so the grid
    # fits a landscape plot.
    row_var, col_var = hrvar
    if incidence_table[row_var].nunique() > incidence_table[col_var].nunique():
        row_var, col_var = col_var, row_var

    # Annotation to pass to heatmap
    annot_df = incidence_table.pivot(index=row_var, columns=col_var, values='metric_text')
    value_df = incidence_table.pivot(index=row_var, columns=col_var, values='incidence')

    # Setup plot size.
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.grid(False)

    # Remove tick marks
    ax.tick_params(
        which='both',  # Both major and minor ticks are affected
        top=False,     # Remove ticks from the top
        bottom=False,  # Remove ticks from the bottom
        left=False,    # Remove ticks from the left
        right=False    # Remove ticks from the right
    )

    sns.set_theme(font_scale=0.7)

    # Plot heatmap; fmt='' because the annotations are pre-formatted strings.
    sns.heatmap(
        value_df,
        annot = annot_df,
        fmt='',
        cmap=COLOR_PALLET_ALT_2,
        center=0.5,
        square=True,
        ax=ax
    )

    # Add in line and tag (positions are in figure coordinates)
    ax.plot(
        [0, .9],     # Set width of line
        [0.9, 0.9],  # Set height of line
        transform = fig.transFigure,  # Set location relative to plot
        clip_on = False,
        color = '#fe7f4f',
        linewidth = .6
    )
    ax.add_patch(
        plt.Rectangle(
            (0, 0.9),  # Set location of rectangle by lower left corner
            0.05,      # Width of rectangle
            -0.025,    # Height of rectangle
            facecolor = '#fe7f4f',
            transform = fig.transFigure,
            clip_on = False,
            linewidth = 0
        )
    )

    # Set title
    ax.text(
        x = 0, y = 1.00,
        s = title_text,
        transform = fig.transFigure,
        ha = 'left',
        fontsize = 13,
        weight = 'bold',
        alpha = .8
    )

    # Set subtitle
    ax.text(
        x = 0, y = 0.95,
        s = subtitle_text,
        transform = fig.transFigure,
        ha = 'left',
        fontsize = 11,
        alpha = .8
    )

    # Set caption
    ax.text(x=0, y=0.02, s=cap_str, transform=fig.transFigure, ha='left', fontsize=9, alpha=.7)

    # return the plot object
    return fig