Source code for vivainsights.identify_holidayweeks

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Detect holiday weeks by scanning for anomalous collaboration hours.

Returns a list of weeks that appear to be
holiday weeks and optionally an edited dataframe with outliers removed. By
default, missing values are excluded.
"""

__all__ = ['identify_holidayweeks']

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator


[docs] def identify_holidayweeks(data: pd.DataFrame, sd = 1, return_type = "text",figsize: tuple = None): """Detect holiday weeks by scanning for anomalous collaboration hours. Scans a standard person query for weeks where collaboration hours fall far below the mean. As best practice, run this function before other analyses to remove atypical weeks from the dataset. Parameters ---------- data : pandas.DataFrame Person query data. Must contain ``MetricDate`` and ``Collaboration_hours``. sd : int or float, default 1 Number of standard deviations below the mean to flag as an outlier. Enter a positive number. return_type : str, default "text" One of ``"text"``, ``"labelled_data"`` / ``"dirty_data"`` / ``"data_dirty"``, ``"cleaned_data"`` / ``"data_cleaned"``, ``"holidayweeks_data"``, or ``"plot"``. figsize : tuple or None, default None Figure size ``(width, height)`` in inches. Defaults to ``(8, 6)``. Returns ------- str, pandas.DataFrame, or matplotlib.figure.Figure A diagnostic string, a filtered dataset, or a line chart depending on *return_type*. Examples -------- Return a text summary of detected holiday weeks: >>> import vivainsights as vi >>> pq_data = vi.load_pq_data() >>> vi.identify_holidayweeks(pq_data, sd=0.75, return_type="text") Return a line chart highlighting holiday weeks: >>> vi.identify_holidayweeks(pq_data, sd=0.75, return_type="plot") Return a cleaned dataset with holiday weeks removed: >>> vi.identify_holidayweeks(pq_data, sd=0.75, return_type="cleaned_data") Return the dataset with holiday weeks labelled: >>> vi.identify_holidayweeks(pq_data, sd=0.75, return_type="labelled_data") """ try: # convert `MetricDate` to datetime data['MetricDate'] = pd.to_datetime(data.MetricDate) # Calculate the mean and z-score of collaboration hours by date Calc = data.dropna(subset=['MetricDate', 'Collaboration_hours']).groupby("MetricDate").agg(mean_collab = ("Collaboration_hours", "mean")).reset_index() Calc["z_score"] = (Calc["mean_collab"] - Calc["mean_collab"].mean())/ Calc["mean_collab"].std() # Find the outliers that are below the given standard deviation Outliers = (Calc["MetricDate"][Calc["z_score"] < -sd]) Calc = Calc.assign(Outlier = Calc["MetricDate"].isin(Outliers)) # Return the message or the plot depending on the argument if return_type== "text": # Calculate the total return_type and the message mean_collab_hrs = Calc["mean_collab"].mean() if len(Outliers) == 0: Message = 'There are no weeks where collaboration was ' + str(sd) + ' standard deviations below the mean (' + str(round(mean_collab_hrs, 1)) + ').' else: Message = 'The weeks where collaboration was ' + str(sd) + ' standard deviations below the mean (' + str(round(mean_collab_hrs, 1)) + ') are: ' Message += ', '.join(Outliers.apply(lambda x: "`" + x.strftime("%m/%d/%Y") + "`")) return Message elif return_type in ["labelled_data", "dirty_data", "data_dirty"]: data_labelled = data.assign(holidayweek = data["MetricDate"].isin(Outliers)) return data_labelled elif return_type in ["cleaned_data", "data_cleaned"]: data_cleaned = data[~data["MetricDate"].isin(Outliers)] if len(Outliers) == 0: print(f"No holiday weeks were removed. Standard deviation threshold was {sd}.") else: outlier_dates = ', '.join(Outliers.dt.strftime("%Y-%m-%d")) print(f"The weeks {outlier_dates} have been flagged as holiday weeks and removed from the data.") print(f"This is based on a standard deviation of {sd} below the mean collaboration hours.") return data_cleaned elif return_type == "holidayweeks_data": data_hw = data[data["MetricDate"].isin(Outliers)] return data_hw elif return_type == "plot": # Generate a line plot with matplotlib for the collaboration hours fig, ax = plt.subplots(figsize=figsize if figsize else (8, 6)) # Plot the collaboration hours ax.plot(Calc["MetricDate"], Calc["mean_collab"].round(0), color="#1d627e", linewidth=3) # Add a marker to indicate the holiday weeks ax.scatter(Calc[Calc.Outlier==True]["MetricDate"], Calc[Calc.Outlier==True]["mean_collab"].round(0), color="#fe7f4f", marker="o", s=150, zorder=3) # Create the strings for the title, subtitle and caption subtitle_str = "Average collaboration hours where markers indicate holiday weeks" cap_str = "Data from week of {} to week of {}".format(Calc["MetricDate"].min().strftime("%b %d, '%y"), Calc["MetricDate"].max().strftime("%b %d, '%y")) # Set the title, subtitle, labels and limits of the plot ax.set_xlabel("Date", fontsize=12, fontweight="bold") ax.set_ylabel("Collaboration Hours", fontsize=12, fontweight="bold") ax.set_ylim(0, None) ax.text(x=ax.get_xlim()[0]-5, y=ax.get_ylim()[1]*1.10, s="Holiday Weeks", fontsize=16, fontweight="bold") ax.text(x=ax.get_xlim()[0]-5, y=ax.get_ylim()[1]*1.05, s=subtitle_str, fontsize=12) ax.text(x=ax.get_xlim()[0]-5,y=ax.get_ylim()[0]-5.5,s=cap_str, fontsize=12) ax.xaxis.set_major_locator(FixedLocator(range(len(Calc)))) # Set the tick positions ax.set_xticklabels(pd.to_datetime(Calc['MetricDate']).dt.strftime("%b %d, '%y"), rotation=45, ha="right") ax.grid(False) return fig else: raise ValueError("The `return_type` argument must be one of the following strings: 'text', 'labeled_data', 'cleaned_data', 'holidayweeks_data', or 'plot'.") except: # Check for the error in the input data required_cols = ['MetricDate', 'Collaboration_hours'] for i in required_cols: if i not in data.columns: raise ValueError("The required variable {} is not present in the dataframe.".format(i)) if pd.api.types.is_datetime64_any_dtype(data['MetricDate'])==False: raise ValueError("`MetricDate` appears not to be properly formatted. It needs to be in the format YYYY-MM-DD. Also check for missing values or stray values with inconsistent formats.")