# Source code for vivainsights.identify_tenure
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
The `identify_tenure` function calculates and summarizes employee tenure based on hire and metric
dates, and provides various options for returning the results.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from vivainsights.check_inputs import *
# [docs]
def identify_tenure(data: pd.DataFrame,
                    beg_date: str = "HireDate",
                    end_date: str = "MetricDate",
                    maxten: float = 40,
                    return_type: str = "message",  # `return` is a reserved keyword, hence `return_type`
                    date_format: str = "%Y-%m-%d"):
    """
    Summarise employee tenure as of the most recent date in the data.

    Tenure is computed, for every row whose `end_date` equals the latest
    `end_date` in `data`, as ``(end_date - beg_date).days / 365`` (years).
    Rows with a tenure greater than or equal to `maxten` are treated as
    "odd" and identified by their ``PersonId``.

    Parameters
    ----------
    data : pandas.DataFrame
        Employee data. Must contain the columns named by `beg_date` and
        `end_date`, plus a ``PersonId`` column. The input frame is not
        modified; date parsing is done on a copy.
    beg_date : str, optional
        Name of the column holding the employment start date.
        Defaults to ``"HireDate"``.
    end_date : str, optional
        Name of the column holding the date of each observation.
        Defaults to ``"MetricDate"``.
    maxten : float, optional
        Tenure threshold in years. Rows with tenure ``>= maxten`` are
        flagged as odd. Defaults to 40.
    return_type : str, optional
        Selects the output:

        - ``"message"`` (default): print the summary text, return ``None``.
        - ``"text"``: return the summary text as a string.
        - ``"plot"``: show a kernel density plot of tenure, return ``None``.
        - ``"data_cleaned"``: return `data` (with parsed date columns)
          without the rows of odd ``PersonId`` values.
        - ``"data_dirty"``: return only the rows of odd ``PersonId`` values.
        - ``"data"``: return a DataFrame with columns ``PersonId`` and
          ``TenureYear`` for the rows on the latest date.
    date_format : str, optional
        ``strftime``-style format passed to ``pandas.to_datetime`` for both
        date columns. Defaults to ``"%Y-%m-%d"``.

    Returns
    -------
    str, pandas.DataFrame or None
        Depends on `return_type`; see above.

    Raises
    ------
    ValueError
        If `return_type` is not one of the supported values.
    KeyError
        If `data` has no ``PersonId`` column.

    Notes
    -----
    `check_inputs` is called first with the two date column names; it is
    presumably responsible for rejecting frames that lack them (confirm
    against its definition).

    The mean, the count in the summary text, and the density plot are
    computed with one tenure value per row on the latest date, so employees
    who share a tenure value each contribute to the result.
    """
    required_variables = [beg_date, end_date]
    # check that the required columns are present
    check_inputs(data, requirements = required_variables)

    # Parse dates on a copy so the caller's DataFrame is left untouched.
    data = data.copy()
    data[end_date] = pd.to_datetime(data[end_date], format = date_format)
    data[beg_date] = pd.to_datetime(data[beg_date], format = date_format)

    # Latest observation date. `max()` skips missing dates, whereas sorting
    # and taking the last element would pick NaT (NaT sorts last).
    last_date = data[end_date].max()

    # Rows observed on the latest date and their tenure in years
    # (index-aligned with `snapshot`).
    snapshot = data.loc[data[end_date] == last_date]
    tenure_years = (snapshot[end_date] - snapshot[beg_date]).dt.days / 365

    # Odd person IDs are the ones with tenure >= max tenure.
    oddpeople = snapshot.loc[tenure_years >= maxten, "PersonId"]

    # Summary text, computed per row rather than per distinct tenure value.
    message = (f"The mean tenure is {round(tenure_years.mean(), 1)} years.\n"
               f"The max tenure is {round(tenure_years.max(), 1)}.\n"
               f"There are {len(oddpeople)} employees with a tenure greater than {maxten} years.")

    if return_type == "text":
        return message
    elif return_type == "message":
        print(message)
    elif return_type == "plot":
        import warnings

        # Suppress warnings only while plotting instead of installing a
        # process-wide "ignore" filter.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # gaussian_kde cannot handle missing values.
            density = gaussian_kde(tenure_years.dropna().to_numpy())
            xs = np.linspace(0, maxten, data.shape[0])
            plt.figure()
            plt.title("Tenure - Density")
            plt.xlabel("Tenure in Years")
            plt.ylabel("Density - number of employees")
            plt.plot(xs, density(xs), color = "#1d627e")
            plt.show()
    elif return_type == "data_cleaned":
        return data[~data["PersonId"].isin(oddpeople)]
    elif return_type == "data_dirty":
        return data[data["PersonId"].isin(oddpeople)]
    elif return_type == "data":
        # Filter on `end_date` (not a hard-coded "Date" column).
        return (snapshot
                .assign(TenureYear = tenure_years)
                .loc[:, ["PersonId", "TenureYear"]])
    else:
        raise ValueError("Error: please check inputs for `return_type`")