# Source code for vivainsights.identify_tenure
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
The `identify_tenure` function calculates and summarizes employee tenure based on hire and metric
dates, and provides various options for returning the results.
"""
__all__ = ['identify_tenure']
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from vivainsights.check_inputs import *
# [docs]
def identify_tenure(data: pd.DataFrame,
                    beg_date: str = "HireDate",
                    end_date: str = "MetricDate",
                    maxten: float = 40,
                    return_type: str = "message",  # `return` is a reserved keyword, hence `return_type`
                    date_format: str = "%Y-%m-%d"):
    """
    Name
    ----
    identify_tenure

    Description
    -----------
    Summarise employee tenure as of the most recent `end_date` in `data`.

    Tenure is computed per row of the latest snapshot as
    `(end_date - beg_date)` in days divided by 365. Rows whose tenure is
    greater than or equal to `maxten` are treated as "odd" and their
    `PersonId` values are used to split the data.

    Parameters
    ----------
    data : pandas dataframe
        Employee data. Must contain the `beg_date` and `end_date` columns
        (both are passed to `check_inputs`) and a `PersonId` column, which is
        read directly. The input frame is not modified; date parsing happens
        on a copy.
    beg_date : str, optional
        Name of the column holding each employee's start date.
        Defaults to "HireDate".
    end_date : str, optional
        Name of the column holding the snapshot date of each row.
        Defaults to "MetricDate".
    maxten : optional
        Tenure threshold in years. Employees with a tenure greater than or
        equal to `maxten` are flagged as "odd". Also used as the upper bound
        of the x-axis grid in the density plot. Defaults to 40.
    return_type : str, optional
        Selects the output:
        - "message" (default)
        - "plot"
        - "data_cleaned"
        - "data_dirty"
        - "data"
        - "text"
    date_format : str, optional
        Format string handed to `pd.to_datetime` for both date columns.
        Defaults to "%Y-%m-%d" (i.e. "YYYY-MM-DD").

    Returns
    -------
    Depends on `return_type`:
    - "text": the summary message as a string.
    - "message": prints the summary message and returns None.
    - "plot": shows a kernel density plot of tenure and returns None.
    - "data_cleaned": the rows of `data` (dates parsed) whose `PersonId` is
      not flagged as odd.
    - "data_dirty": the rows of `data` (dates parsed) whose `PersonId` is
      flagged as odd.
    - "data": a DataFrame with `PersonId` and `TenureYear` for the latest
      snapshot.

    Raises
    ------
    ValueError
        If `return_type` is not one of the values listed above.
    """
    required_variables = [beg_date, end_date]
    # Both date columns must be present before anything else is attempted.
    check_inputs(data, requirements=required_variables)

    # Parse dates on a copy so the caller's DataFrame keeps its original columns.
    data = data.copy()
    data[end_date] = pd.to_datetime(data[end_date], format=date_format)
    data[beg_date] = pd.to_datetime(data[beg_date], format=date_format)

    # max() ignores NaT, whereas "sort and take the last row" would pick NaT
    # (sorted last by default) and leave every later filter empty.
    last_date = data[end_date].max()

    # One row per record in the latest snapshot, with tenure computed once.
    latest = data[data[end_date] == last_date]
    tenure_years = (latest[end_date] - latest[beg_date]).dt.days / 365

    # Odd person IDs are the ones with tenure >= max tenure.
    is_odd = tenure_years >= maxten
    oddpeople = latest.loc[is_odd, "PersonId"]

    # Statistics are taken per employee row, not per distinct tenure value,
    # so employees sharing a hire date are each counted.
    message = (
        f"The mean tenure is {round(tenure_years.mean(), 1)} years.\n"
        f"The max tenure is {round(tenure_years.max(), 1)}.\n"
        f"There are {int(is_odd.sum())} employees with a tenure greater than {maxten} years."
    )

    if return_type == "text":
        return message

    if return_type == "message":
        print(message)
        return None

    if return_type == "plot":
        import warnings

        # Silence warnings only while plotting instead of for the whole process.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # gaussian_kde cannot handle NaN, so rows without a valid tenure are dropped.
            density = gaussian_kde(tenure_years.dropna())
            plt.figure()
            plt.title("Tenure - Density")
            plt.xlabel("Tenure in Years")
            plt.ylabel("Density - number of employees")
            xs = np.linspace(0, maxten, data.shape[0])
            plt.plot(xs, density(xs), color="#1d627e")
            plt.show()
        return None

    if return_type == "data_cleaned":
        return data[~data["PersonId"].isin(oddpeople)]

    if return_type == "data_dirty":
        return data[data["PersonId"].isin(oddpeople)]

    if return_type == "data":
        # Filter on the configured `end_date` column (already applied in `latest`).
        return latest.assign(TenureYear=tenure_years).loc[:, ["PersonId", "TenureYear"]]

    raise ValueError("Error: please check inputs for `return_type`")