# Source code for vivainsights.identify_tenure
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Calculate and summarize employee tenure based on hire and metric dates.
The `identify_tenure` function provides various options for returning the results.
"""
# Names exported by `from <this module> import *`.
__all__ = ['identify_tenure']
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from vivainsights.check_inputs import *
# [docs] -- Sphinx "view source" link residue; not part of the module's code.
def identify_tenure(data: pd.DataFrame,
                    beg_date: str = "HireDate",
                    end_date: str = "MetricDate",
                    maxten = 40,
                    return_type: str = "message", # use return_type to avoid conflict with built-in function
                    date_format: str = "%Y-%m-%d"):
    """Calculate and summarize employee tenure.

    Tenure is computed in years as ``(end_date - beg_date).days / 365`` for
    the rows whose ``end_date`` equals the latest ``end_date`` found in
    ``data``. The input DataFrame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Must include the ``beg_date`` and ``end_date``
        columns as well as a ``PersonId`` column.
    beg_date : str, default "HireDate"
        Column name for the hire date.
    end_date : str, default "MetricDate"
        Column name for the end / metric date.
    maxten : int, default 40
        Maximum tenure threshold in years. Employees at or above this
        value are flagged.
    return_type : str, default "message"
        ``"message"`` prints a summary, ``"text"`` returns it as a string,
        ``"plot"`` displays a density curve, ``"data_cleaned"`` removes
        flagged employees, ``"data_dirty"`` keeps only flagged employees,
        ``"data"`` returns ``PersonId`` and ``TenureYear`` for the rows on
        the latest date.
    date_format : str, default "%Y-%m-%d"
        ``strftime`` format of dates in the date columns.

    Returns
    -------
    None, str, or pandas.DataFrame
        ``None`` for ``"message"`` (summary is printed) and ``"plot"``
        (figure is shown), a string for ``"text"``, and a DataFrame for
        ``"data_cleaned"``, ``"data_dirty"`` and ``"data"``. Returned
        DataFrames carry the date columns converted to datetimes.

    Raises
    ------
    KeyError
        If ``data`` has no ``PersonId`` column.
    ValueError
        If ``return_type`` is not one of the supported options.

    Examples
    --------
    Return a text summary of tenure distribution:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.identify_tenure(pq_data, return_type="text")

    Return a density plot of tenure:

    >>> vi.identify_tenure(pq_data, return_type="plot")

    Return per-person tenure on the latest date:

    >>> vi.identify_tenure(pq_data, return_type="data")

    Return only rows of employees below the tenure threshold:

    >>> vi.identify_tenure(pq_data, maxten=40, return_type="data_cleaned")

    Specify custom date column names:

    >>> vi.identify_tenure(pq_data, beg_date="HireDate", end_date="MetricDate", return_type="text")
    """
    required_variables = [beg_date, end_date]

    # Validate that the date columns are present (delegated to check_inputs).
    check_inputs(data, requirements = required_variables)

    # Work on a copy: the date columns are re-typed below and the caller's
    # DataFrame must not be altered as a side effect.
    data = data.copy()
    data[end_date] = pd.to_datetime(data[end_date], format = date_format)
    data[beg_date] = pd.to_datetime(data[beg_date], format = date_format)

    # Latest date present. `.max()` skips missing dates, whereas taking the
    # last element of a sorted column would pick NaT (NaT sorts last).
    last_date = data[end_date].max()

    # Rows on the latest date, with tenure computed once and reused below.
    latest = data.loc[data[end_date] == last_date]
    latest = latest.assign(
        tenure_years = (latest[end_date] - latest[beg_date]).dt.days / 365
    )

    # Distinct tenure values with the number of rows sharing each value.
    tenure_summary = (latest
                      .groupby("tenure_years")
                      .size()
                      .reset_index(name = "n"))

    # Person IDs whose tenure is at or above the threshold.
    oddpeople = latest.loc[latest["tenure_years"] >= maxten, "PersonId"]

    # NOTE(review): mean and max are taken over the distinct tenure values in
    # `tenure_summary` (unchanged from the previous implementation); confirm
    # whether a per-employee mean is intended instead.
    # The flagged count is the number of flagged people, not the number of
    # distinct tenure values, so employees sharing a hire date are all counted.
    message = (f"The mean tenure is {round(tenure_summary['tenure_years'].mean(), 1)} years.\n"
               f"The max tenure is {round(tenure_summary['tenure_years'].max(), 1)}.\n"
               f"There are {oddpeople.nunique()} employees with a tenure greater than {maxten} years.")

    if return_type == "text":
        return message

    if return_type == "message":
        print(message)
        return None

    if return_type == "plot":
        import warnings

        # Silence warnings only while plotting; the process-wide warning
        # filters are restored when the context manager exits.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            density = gaussian_kde(tenure_summary["tenure_years"])

            plt.figure()
            plt.title("Tenure - Density")
            plt.xlabel("Tenure in Years")
            plt.ylabel("Density - number of employees")
            xs = np.linspace(0, maxten, data.shape[0])
            plt.plot(xs, density(xs), color = "#1d627e")
            plt.show()
        return None

    if return_type == "data_cleaned":
        return data[~data["PersonId"].isin(oddpeople)]

    if return_type == "data_dirty":
        return data[data["PersonId"].isin(oddpeople)]

    if return_type == "data":
        # Filter on `end_date` (via `latest`), not a hard-coded "Date" column.
        return (latest
                .assign(TenureYear = latest["tenure_years"])
                .loc[:, ["PersonId", "TenureYear"]])

    raise ValueError("Error: please check inputs for `return_type`")