Source code for vivainsights.identify_holidayweeks

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
""" 
This function scans a standard query output for weeks where collaboration
hours is far outside the mean. Returns a list of weeks that appear to be
holiday weeks and optionally an edited dataframe with outliers removed. By
default, missing values are excluded.
"""

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator


[docs] def identify_holidayweeks(data: pd.DataFrame, sd = 1, return_type = "text"): """" Name ----- identify_holidayweeks Description ----------- Identify Holiday Weeks based on outliers. This function scans a standard query output for weeks where collaboration hours is far outside the mean. Returns a list of weeks that appear to be holiday weeks and optionally an edited dataframe with outliers removed. By default, missing values are excluded. As best practice, run this function prior to any analysis to remove atypical collaboration weeks from your dataset. Parameters ---------- data : pandas dataframe A Standard Person Query dataset in the form of a data frame. sd : int The standard deviation below the mean for collaboration hours that should define an outlier week. Enter a positive number. Default is 1 standard deviation. return_type : str String specifying what to return. This must be one of the following strings: - "text" (default) - "labelled_data" or "dirty_data" or "data_dirty" - "cleaned_data" or "data_cleaned" - "holidayweeks_data" - "plot" Returns ------- A different output is returned depending on the value passed to return_type: text : str A message is printed identifying holiday weeks. data_cleaned / cleaned_data : pandas dataframe A dataset with outlier weeks removed is returned. data_dirty / dirty_data / labelled_data : pandas dataframe A dataset with only outlier weeks is returned. holidayweeks_data : pandas dataframe A dataset with only outlier weeks is returned. plot : matplotlib plot A line plot of Collaboration Hours with holiday weeks highlighted. Examples -------- >>> identify_holidayweeks(pq_data, sd = .75, return_type = "text") "The weeks where collaboration was 0.75 standard deviations below the mean (18.7) are: `05/22/2022`" >>> identify_holidayweeks(pq_data, sd = .75, return_type = "plot") >>> identify_holidayweeks(pq_data, sd = .75, return_type = "cleaned_data") >>> identify_holidayweeks(pq_data, sd = .75, return_type = "holidayweeks_data") """ try: # convert `MetricDate` to datetime data['MetricDate'] = pd.to_datetime(data.MetricDate) # Calculate the mean and z-score of collaboration hours by date Calc = data.dropna(subset=['MetricDate', 'Collaboration_hours']).groupby("MetricDate").agg(mean_collab = ("Collaboration_hours", "mean")).reset_index() Calc["z_score"] = (Calc["mean_collab"] - Calc["mean_collab"].mean())/ Calc["mean_collab"].std() # Find the outliers that are below the given standard deviation Outliers = (Calc["MetricDate"][Calc["z_score"] < -sd]) Calc = Calc.assign(Outlier = Calc["MetricDate"].isin(Outliers)) # Return the message or the plot depending on the argument if return_type== "text": # Calculate the total return_type and the message mean_collab_hrs = Calc["mean_collab"].mean() if len(Outliers) == 0: Message = 'There are no weeks where collaboration was ' + str(sd) + ' standard deviations below the mean (' + str(round(mean_collab_hrs, 1)) + ').' else: Message = 'The weeks where collaboration was ' + str(sd) + ' standard deviations below the mean (' + str(round(mean_collab_hrs, 1)) + ') are: ' Message += ', '.join(Outliers.apply(lambda x: "`" + x.strftime("%m/%d/%Y") + "`")) return Message elif return_type in ["labelled_data", "dirty_data", "data_dirty"]: data_labelled = data.assign(holidayweek = data["MetricDate"].isin(Outliers)) return data_labelled elif return_type == "cleaned_data" or return_type == "data_cleaned": # Calculate the three dataframe outputs data_cleaned = data[~data["MetricDate"].isin(Outliers)] return data_cleaned elif return_type == "holidayweeks_data": data_hw = data[data["MetricDate"].isin(Outliers)] return data_hw elif return_type == "plot": # Generate a line plot with matplotlib for the collaboration hours fig, ax = plt.subplots(figsize=(10, 6)) # Plot the collaboration hours ax.plot(Calc["MetricDate"], Calc["mean_collab"].round(0), color="#1d627e", linewidth=3) # Add a marker to indicate the holiday weeks ax.scatter(Calc[Calc.Outlier==True]["MetricDate"], Calc[Calc.Outlier==True]["mean_collab"].round(0), color="#fe7f4f", marker="o", s=150, zorder=3) # Create the strings for the title, subtitle and caption subtitle_str = "Average collaboration hours where markers indicate holiday weeks" cap_str = "Data from week of {} to week of {}".format(Calc["MetricDate"].min().strftime("%b %d, '%y"), Calc["MetricDate"].max().strftime("%b %d, '%y")) # Set the title, subtitle, labels and limits of the plot ax.set_xlabel("Date", fontsize=12, fontweight="bold") ax.set_ylabel("Collaboration Hours", fontsize=12, fontweight="bold") ax.set_ylim(0, None) ax.text(x=ax.get_xlim()[0]-5, y=ax.get_ylim()[1]*1.10, s="Holiday Weeks", fontsize=16, fontweight="bold") ax.text(x=ax.get_xlim()[0]-5, y=ax.get_ylim()[1]*1.05, s=subtitle_str, fontsize=12) ax.text(x=ax.get_xlim()[0]-5,y=ax.get_ylim()[0]-5.5,s=cap_str, fontsize=12) ax.xaxis.set_major_locator(FixedLocator(range(len(Calc)))) # Set the tick positions ax.set_xticklabels(pd.to_datetime(Calc['MetricDate']).dt.strftime("%b %d, '%y"), rotation=45, ha="right") ax.grid(False) return fig else: raise ValueError("The `return_type` argument must be one of the following strings: 'text', 'labeled_data', 'cleaned_data', 'holidayweeks_data', or 'plot'.") except: # Check for the error in the input data required_cols = ['MetricDate', 'Collaboration_hours'] for i in required_cols: if i not in data.columns: raise ValueError("The required variable {} is not present in the dataframe.".format(i)) if pd.api.types.is_datetime64_any_dtype(data['MetricDate'])==False: raise ValueError("`MetricDate` appears not to be properly formatted. It needs to be in the format YYYY-MM-DD. Also check for missing values or stray values with inconsistent formats.")