Source code for vivainsights.identify_churn

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This module identifies and counts the number of employees who have churned from the dataset.
This is done by checking whether an employee who is present in the first `n1` weeks of the data
is also present in the last `n2` weeks of the data.
An additional use case of this function is the ability to identify "new-joiners" by setting the argument `flip` to `True`.
"""
import pandas as pd


[docs]
def identify_churn(data: pd.DataFrame,
                   n1: int = 6,
                   n2: int = 6,
                   return_type: str = "message", # avoid using return as a variable name
                   flip: bool = False,
                   date_column: str = "MetricDate",
                   date_format: str = "%Y-%m-%d"):
  """
  Name
  ----
  identify_churn

  Description
  -----------
  Identify and count the employees who have churned from the dataset, i.e.
  those present in the first `n1` distinct dates of the data but absent from
  the last `n2` distinct dates. With `flip=True` the comparison is reversed
  to identify new joiners instead.

  The input dataframe is not modified.

  Parameters
  ----------
  data : pandas dataframe
     Person-level data containing a `PersonId` column and the column named
     by `date_column`.
  n1 : int
     Number of earliest distinct dates ("first `n` weeks") in which to check
     for the person's presence. Must be at least 1.
  n2 : int
     Number of latest distinct dates ("last `n` weeks") in which to check
     for the person's presence. Must be at least 1.
  return_type : str
     Type of return expected: "message", "text" or "data".
  flip : boolean
     Flag to switch between identifying churned users (`False`) and new
     users (`True`).
  date_column : str
     Name of the date column on which churn is calculated, defaults to
     "MetricDate".
  date_format : str
     Format string used to parse `date_column`, defaults to "%Y-%m-%d".

  Returns
  -------
  A different output is returned depending on the value passed to the
  `return_type` argument:
  - "message": None. The diagnostic message is printed to the console.
  - "text": String. The diagnostic message.
  - "data": Set containing the `PersonId` of employees who have been
    identified as churned (or as new joiners when `flip=True`).

  Raises
  ------
  ValueError
     If `flip` is not a boolean, `return_type` is not one of the supported
     values, `n1` or `n2` is smaller than 1, or the date column contains no
     valid dates.
  """

  # Validate the cheap arguments before touching the data
  if flip not in (True, False):
    raise ValueError("Invalid argument for `flip`")

  if return_type not in ("message", "text", "data"):
    raise ValueError("Invalid `return`")

  if n1 < 1 or n2 < 1:
    raise ValueError("`n1` and `n2` must both be at least 1")

  # Parse into a local Series so that the caller's dataframe is left untouched
  dates = pd.to_datetime(data[date_column], format = date_format)

  # Sorted index of distinct dates. Wrapping in a DatetimeIndex guarantees
  # Timestamp elements (which expose `.date()`) regardless of whether
  # `Series.unique()` returns a numpy array or a DatetimeArray; missing
  # dates are dropped because they cannot be ordered.
  unique_dates = pd.DatetimeIndex(dates.dropna().unique()).sort_values()

  if len(unique_dates) == 0:
    raise ValueError(f"No valid dates found in column `{date_column}`")

  # First and last n weeks
  first_n_weeks = unique_dates[:n1]
  last_n_weeks = unique_dates[-n2:]

  # People present in the first n weeks
  first_peeps = set(data.loc[dates.isin(first_n_weeks), 'PersonId'].unique())

  # People present in the last n weeks
  final_peeps = set(data.loc[dates.isin(last_n_weeks), 'PersonId'].unique())

  first_start, first_end = first_n_weeks.min().date(), first_n_weeks.max().date()
  last_start, last_end = last_n_weeks.min().date(), last_n_weeks.max().date()

  if flip:

    # In last, not in first: new joiners
    churner_id = final_peeps - first_peeps

    # Message
    printMessage = (f"New joiners:\nThere are {len(churner_id)} employees from "
                    f"{last_start} to {last_end} "
                    f"({n2} weeks) who were not present in "
                    f"{first_start} to {first_end} "
                    f"({n1} weeks).")

  else:

    # In first, not in last: churners
    churner_id = first_peeps - final_peeps

    # Message
    printMessage = (f"Churn:\nThere are {len(churner_id)} employees from "
                    f"{first_start} to {first_end} "
                    f"({n1} weeks) who are no longer present in "
                    f"{last_start} to {last_end} "
                    f"({n2} weeks).")

  if return_type == "message":
    print(printMessage)
    return None

  if return_type == "text":
    return printMessage

  # return_type == "data" (validated above)
  return churner_id