Source code for vivainsights.identify_tenure

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
The `identify_tenure` function calculates and summarizes employee tenure based on hire and metric
dates, and provides various options for returning the results.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from vivainsights.check_inputs import *

def identify_tenure(data: pd.DataFrame,
                    beg_date: str = "HireDate",
                    end_date: str = "MetricDate",
                    maxten: float = 40,
                    return_type: str = "message", # named return_type because `return` is a reserved keyword
                    date_format: str = "%Y-%m-%d"):
    '''
    Name
    ----
    identify_tenure

    Description
    -----------
    Calculate and summarize employee tenure, measured in years between `beg_date`
    and `end_date`, using only the rows that fall on the most recent `end_date`
    found in `data`. Tenure is computed as the day difference divided by 365.

    The input DataFrame is not modified; date parsing is done on an internal copy.

    Parameters
    ----------
    data : pandas dataframe
        Employee data. Must contain the columns named by `beg_date` and `end_date`,
        plus a "PersonId" column.
    beg_date : str, optional
        Name of the column holding each employee's start (hire) date.
        Defaults to "HireDate".
    end_date : str, optional
        Name of the column holding the date the tenure is measured up to.
        Defaults to "MetricDate".
    maxten : numeric, optional
        Tenure threshold in years. Employees whose tenure is greater than or equal
        to `maxten` are treated as "odd" employees. Defaults to 40.
    return_type : str, optional
        Selects the output. One of "message" (default), "text", "plot",
        "data_cleaned", "data_dirty" or "data".
    date_format : str, optional
        Format passed to `pd.to_datetime` when parsing the `beg_date` and `end_date`
        columns. Defaults to "%Y-%m-%d".

    Returns
    -------
    Depends on `return_type`:

    - "text": the summary message as a string.
    - "message": prints the summary message and returns None.
    - "plot": shows a density plot of tenure (in years) and returns None.
    - "data_cleaned": the rows of `data` (with parsed date columns) whose "PersonId"
      is not among the odd employees.
    - "data_dirty": the rows of `data` (with parsed date columns) whose "PersonId"
      is among the odd employees.
    - "data": a DataFrame with columns "PersonId" and "TenureYear" for the rows on
      the most recent `end_date`.

    Raises
    ------
    ValueError
        If `return_type` is not one of the supported values.
    '''
    required_variables = [beg_date, end_date]

    # check if required columns are not present
    check_inputs(data, requirements = required_variables)

    # Parse dates on a copy so the caller's DataFrame is left untouched
    data = data.copy()
    data[end_date] = pd.to_datetime(data[end_date], format = date_format)
    data[beg_date] = pd.to_datetime(data[beg_date], format = date_format)

    # Most recent end date; max() skips NaT, whereas sorting and taking the last
    # row would pick NaT whenever any end date is missing
    last_date = data[end_date].max()

    # Rows on the most recent date, with tenure expressed in years
    latest = data[data[end_date] == last_date]
    latest = latest.assign(
        tenure_years = (latest[end_date] - latest[beg_date]).dt.days / 365
    )

    # Per-employee tenure values; rows without a computable tenure are excluded
    tenure = latest["tenure_years"].dropna()

    # odd person IDs are the ones with tenure >= max tenure
    oddpeople = latest.loc[latest["tenure_years"] >= maxten, "PersonId"]

    # message: statistics are taken over employees rather than over distinct
    # tenure values, so the reported count and mean match the wording
    Message = (f"The mean tenure is {round(tenure.mean(), 1)} years.\n"
               f"The max tenure is {round(tenure.max(), 1)}.\n"
               f"There are {oddpeople.nunique()} employees with a tenure greater than {maxten} years.")

    if return_type == "text":
        return Message

    elif return_type == "message":
        print(Message)

    elif return_type == "plot":
        import warnings

        # Silence warnings only while plotting; the global filter state is
        # restored when the context manager exits
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            density = gaussian_kde(tenure)

            # plot density
            plt.figure()
            plt.title("Tenure - Density")
            plt.xlabel("Tenure in Years")
            plt.ylabel("Density - number of employees")
            xs = np.linspace(0, maxten, data.shape[0])
            plt.plot(xs, density(xs), color = "#1d627e")
            plt.show()

    elif return_type == "data_cleaned":
        return data[~data["PersonId"].isin(oddpeople)]

    elif return_type == "data_dirty":
        return data[data["PersonId"].isin(oddpeople)]

    elif return_type == "data":
        # Filter on the configured end_date column (not a hard-coded "Date" column)
        return (latest
                .rename(columns = {"tenure_years": "TenureYear"})
                .loc[:, ["PersonId", "TenureYear"]])

    else:
        raise ValueError("Error: please check inputs for `return_type`")