Source code for vivainsights.identify_tenure

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Calculate and summarize employee tenure based on hire and metric dates.

The `identify_tenure` function provides various options for returning the results.
"""

__all__ = ['identify_tenure']

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from vivainsights.check_inputs import *

def identify_tenure(data: pd.DataFrame,
                    beg_date: str = "HireDate",
                    end_date: str = "MetricDate",
                    maxten: float = 40,
                    return_type: str = "message", # use return_type to avoid conflict with built-in function
                    date_format: str = "%Y-%m-%d"):
    """Calculate and summarize employee tenure.

    Computes tenure in years (days between hire date and metric date, divided
    by 365) for the rows that fall on the latest metric date, and provides
    diagnostics, plots, or filtered datasets.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Must include the columns named by `beg_date` and
        `end_date`. A ``"PersonId"`` column is additionally required for the
        ``"data_cleaned"``, ``"data_dirty"`` and ``"data"`` return types.
        The input is not modified; dates are parsed on an internal copy.
    beg_date : str, default "HireDate"
        Column name for the hire date.
    end_date : str, default "MetricDate"
        Column name for the end / metric date.
    maxten : int or float, default 40
        Maximum tenure threshold in years. Rows on the latest metric date
        with a tenure at or above this value are flagged.
    return_type : str, default "message"
        ``"message"`` prints a summary, ``"text"`` returns it as a string,
        ``"plot"`` displays a density curve, ``"data_cleaned"`` removes
        flagged employees, ``"data_dirty"`` keeps only flagged employees,
        ``"data"`` returns per-person tenure.
    date_format : str, default "%Y-%m-%d"
        ``strftime`` format of dates in the date columns.

    Returns
    -------
    None, str, or pandas.DataFrame
        - ``"message"``: prints the summary and returns ``None``.
        - ``"text"``: the summary string.
        - ``"plot"``: shows the density plot via ``plt.show()`` and returns
          ``None``.
        - ``"data_cleaned"`` / ``"data_dirty"``: rows of the (date-parsed)
          data whose ``PersonId`` is not / is flagged.
        - ``"data"``: a DataFrame with columns ``PersonId`` and
          ``TenureYear`` for rows on the latest metric date.

    Raises
    ------
    ValueError
        If `return_type` is not one of the supported values.

    Notes
    -----
    The mean, the maximum, the flagged count and the density are computed
    over the rows on the latest metric date (missing tenures are ignored),
    so each row counts once. This assumes one row per employee on that date.

    Examples
    --------
    Return a text summary of tenure distribution:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.identify_tenure(pq_data, return_type="text")

    Return a density plot of tenure:

    >>> vi.identify_tenure(pq_data, return_type="plot")

    Return the per-person tenure on the latest metric date:

    >>> vi.identify_tenure(pq_data, return_type="data")

    Return only rows of employees below the tenure threshold:

    >>> vi.identify_tenure(pq_data, maxten=40, return_type="data_cleaned")

    Specify custom date column names:

    >>> vi.identify_tenure(pq_data, beg_date="HireDate", end_date="MetricDate", return_type="text")
    """
    required_variables = [beg_date, end_date]

    # check if required columns are not present
    check_inputs(data, requirements = required_variables)

    # Parse the date columns on a copy so the caller's DataFrame is untouched
    data = data.copy()
    data[end_date] = pd.to_datetime(data[end_date], format = date_format)
    data[beg_date] = pd.to_datetime(data[beg_date], format = date_format)

    # Latest metric date; max() skips missing dates, whereas sorting and
    # taking the last row would pick NaT whenever any end date is missing
    last_date = data[end_date].max()

    # Tenure is evaluated once, on the snapshot of rows at the latest date
    latest = data[data[end_date] == last_date]
    tenure_years = (latest[end_date] - latest[beg_date]).dt.days / 365

    # Rows with tenure >= max tenure are the "odd" ones (NaN compares False)
    flagged = tenure_years >= maxten

    # Statistics are taken per row (i.e. per employee), not per distinct
    # tenure value, so the reported count really is a number of employees
    message = (
        f"The mean tenure is {round(tenure_years.mean(), 1)} years.\n"
        f"The max tenure is {round(tenure_years.max(), 1)}.\n"
        f"There are {int(flagged.sum())} employees with a tenure greater than {maxten} years."
    )

    if return_type == "text":
        return message

    elif return_type == "message":
        print(message)

    elif return_type == "plot":
        import warnings

        # Suppress warnings only while plotting instead of process-wide
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            density = gaussian_kde(tenure_years.dropna())

            # plot density
            plt.figure()
            plt.title("Tenure - Density")
            plt.xlabel("Tenure in Years")
            plt.ylabel("Density - number of employees")
            xs = np.linspace(0, maxten, data.shape[0])
            plt.plot(xs, density(xs), color = "#1d627e")
            plt.show()

    elif return_type == "data_cleaned":
        oddpeople = latest.loc[flagged, "PersonId"]
        return data[~data["PersonId"].isin(oddpeople)]

    elif return_type == "data_dirty":
        oddpeople = latest.loc[flagged, "PersonId"]
        return data[data["PersonId"].isin(oddpeople)]

    elif return_type == "data":
        # Filter on the configured end_date column (not a hard-coded "Date")
        return (latest
                .assign(TenureYear = tenure_years)
                .loc[:, ["PersonId", "TenureYear"]])

    else:
        raise ValueError("Error: please check inputs for `return_type`")