Source code for vivainsights.identify_tenure

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
The `identify_tenure` function calculates and summarizes employee tenure based on hire and metric
dates, and provides various options for returning the results.
"""

__all__ = ['identify_tenure']

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from vivainsights.check_inputs import *

def identify_tenure(data: pd.DataFrame,
                    beg_date = "HireDate",
                    end_date = "MetricDate",
                    maxten = 40,
                    return_type = "message",  # named `return_type` because `return` is a reserved keyword
                    date_format = "%Y-%m-%d"):
    '''
    Name
    ----
    identify_tenure

    Description
    -----------
    Calculate and summarise employee tenure as of the most recent date in
    `end_date`. Tenure is `(end_date - beg_date)` in days divided by 365.
    Employees whose tenure is greater than or equal to `maxten` are treated
    as "odd" and can be listed or filtered out.

    The input DataFrame is not modified; date parsing is done on a copy.

    Parameters
    ----------
    data : pandas dataframe
        Employee data. Must contain the `beg_date` and `end_date` columns
        and a `PersonId` column.
    beg_date : optional
        Name of the column holding each employee's hire date.
        Defaults to "HireDate".
    end_date : optional
        Name of the column holding the date of each observation.
        Defaults to "MetricDate". Only rows on the latest such date are
        used to compute tenure.
    maxten : optional
        Maximum plausible tenure in years. Employees with a tenure greater
        than or equal to `maxten` are flagged as odd. Defaults to 40.
    return_type : optional
        One of:
        - "message" (default): print the summary text, return None.
        - "text": return the summary text as a string.
        - "plot": show a density plot of tenure, return None.
        - "data_cleaned": return the rows of `data` (dates parsed) whose
          `PersonId` is not flagged as odd.
        - "data_dirty": return the rows of `data` (dates parsed) whose
          `PersonId` is flagged as odd.
        - "data": return a DataFrame with columns `PersonId` and
          `TenureYear` for the rows on the latest date.
    date_format : optional
        Format passed to `pandas.to_datetime` for both date columns.
        Defaults to "%Y-%m-%d".

    Returns
    -------
    str, pandas.DataFrame or None, depending on `return_type` (see above).

    Raises
    ------
    ValueError
        If `return_type` is not one of the supported values, or if
        `end_date` contains no valid dates.
    '''
    valid_return_types = ("message", "text", "plot", "data_cleaned", "data_dirty", "data")
    # Fail fast, before any parsing work is done.
    if return_type not in valid_return_types:
        raise ValueError("Error: please check inputs for `return_type`")

    # `PersonId` is used unconditionally below, so it is validated together
    # with the two date columns.
    # NOTE(review): check_inputs is presumed to raise when a column is missing.
    required_variables = [beg_date, end_date, "PersonId"]
    check_inputs(data, requirements = required_variables)

    # Work on a copy so the caller's DataFrame keeps its original dtypes.
    data = data.copy()
    data[end_date] = pd.to_datetime(data[end_date], format = date_format)
    data[beg_date] = pd.to_datetime(data[beg_date], format = date_format)

    # `max()` skips missing dates; sorting and taking the last row would pick
    # NaT whenever any end date is missing, because NaT sorts last.
    last_date = data[end_date].max()
    if pd.isna(last_date):
        raise ValueError(f"Error: no valid dates found in column `{end_date}`")

    # Sort before filtering so output row order matches the previous behaviour.
    data_prep = data.sort_values(by = end_date)
    latest = data_prep[data_prep[end_date] == last_date]
    latest = latest.assign(
        tenure_years = (latest[end_date] - latest[beg_date]).dt.days / 365
    )

    # One tenure value per row on the latest date; rows without a hire date
    # yield NaN and are left out of the statistics.
    tenure = latest["tenure_years"].dropna()

    # Odd person IDs are the ones with tenure >= max tenure (inclusive).
    oddpeople = latest.loc[latest["tenure_years"] >= maxten, "PersonId"]

    # Statistics are taken over employees, not over distinct tenure values,
    # so that shared hire dates are weighted and counted correctly.
    Message = (f"The mean tenure is {round(tenure.mean(), 1)} years.\n"
               f"The max tenure is {round(tenure.max(), 1)}.\n"
               f"There are {oddpeople.nunique()} employees with a tenure greater than {maxten} years.")

    if return_type == "text":
        return Message

    if return_type == "message":
        print(Message)
        return None

    if return_type == "plot":
        import warnings

        # Silence warnings only while plotting, not for the whole process.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            density = gaussian_kde(tenure.to_numpy())
            # Fixed grid: curve smoothness should not depend on the row count.
            xs = np.linspace(0, maxten, 512)

            plt.figure()
            plt.title("Tenure - Density")
            plt.xlabel("Tenure in Years")
            plt.ylabel("Density - number of employees")
            plt.plot(xs, density(xs), color = "#1d627e")
            plt.show()
        return None

    if return_type == "data_cleaned":
        return data[~data["PersonId"].isin(oddpeople)]

    if return_type == "data_dirty":
        return data[data["PersonId"].isin(oddpeople)]

    # return_type == "data": filtered on `end_date`, not a hard-coded "Date".
    return (latest
            .rename(columns = {"tenure_years": "TenureYear"})
            .loc[:, ["PersonId", "TenureYear"]])