# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This function scans a standard query output for weeks where collaboration
hours is far outside the mean. Returns a list of weeks that appear to be
holiday weeks and optionally an edited dataframe with outliers removed. By
default, missing values are excluded.
"""
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FixedLocator
[docs]
def identify_holidayweeks(data: pd.DataFrame, sd = 1, return_type = "text"):
""""
Name
-----
identify_holidayweeks
Description
-----------
Identify Holiday Weeks based on outliers.
This function scans a standard query output for weeks where collaboration
hours is far outside the mean. Returns a list of weeks that appear to be
holiday weeks and optionally an edited dataframe with outliers removed. By
default, missing values are excluded.
As best practice, run this function prior to any analysis to remove atypical
collaboration weeks from your dataset.
Parameters
----------
data : pandas dataframe
A Standard Person Query dataset in the form of a data frame.
sd : int
The standard deviation below the mean for collaboration hours that should
define an outlier week. Enter a positive number.
Default is 1 standard deviation.
return_type : str
String specifying what to return. This must be one of the following strings:
- "text" (default)
- "labelled_data" or "dirty_data" or "data_dirty"
- "cleaned_data" or "data_cleaned"
- "holidayweeks_data"
- "plot"
Returns
-------
A different output is returned depending on the value passed to return_type:
text : str
A message is printed identifying holiday weeks.
data_cleaned / cleaned_data : pandas dataframe
A dataset with outlier weeks removed is returned.
data_dirty / dirty_data / labelled_data : pandas dataframe
A dataset with only outlier weeks is returned.
holidayweeks_data : pandas dataframe
A dataset with only outlier weeks is returned.
plot : matplotlib plot
A line plot of Collaboration Hours with holiday weeks highlighted.
Examples
--------
>>> identify_holidayweeks(pq_data, sd = .75, return_type = "text")
"The weeks where collaboration was 0.75 standard deviations below the mean (18.7) are: `05/22/2022`"
>>> identify_holidayweeks(pq_data, sd = .75, return_type = "plot")
>>> identify_holidayweeks(pq_data, sd = .75, return_type = "cleaned_data")
>>> identify_holidayweeks(pq_data, sd = .75, return_type = "holidayweeks_data")
"""
try:
# convert `MetricDate` to datetime
data['MetricDate'] = pd.to_datetime(data.MetricDate)
# Calculate the mean and z-score of collaboration hours by date
Calc = data.dropna(subset=['MetricDate', 'Collaboration_hours']).groupby("MetricDate").agg(mean_collab = ("Collaboration_hours", "mean")).reset_index()
Calc["z_score"] = (Calc["mean_collab"] - Calc["mean_collab"].mean())/ Calc["mean_collab"].std()
# Find the outliers that are below the given standard deviation
Outliers = (Calc["MetricDate"][Calc["z_score"] < -sd])
Calc = Calc.assign(Outlier = Calc["MetricDate"].isin(Outliers))
# Return the message or the plot depending on the argument
if return_type== "text":
# Calculate the total return_type and the message
mean_collab_hrs = Calc["mean_collab"].mean()
if len(Outliers) == 0:
Message = 'There are no weeks where collaboration was ' + str(sd) + ' standard deviations below the mean (' + str(round(mean_collab_hrs, 1)) + ').'
else:
Message = 'The weeks where collaboration was ' + str(sd) + ' standard deviations below the mean (' + str(round(mean_collab_hrs, 1)) + ') are: '
Message += ', '.join(Outliers.apply(lambda x: "`" + x.strftime("%m/%d/%Y") + "`"))
return Message
elif return_type in ["labelled_data", "dirty_data", "data_dirty"]:
data_labelled = data.assign(holidayweek = data["MetricDate"].isin(Outliers))
return data_labelled
elif return_type == "cleaned_data" or return_type == "data_cleaned":
# Calculate the three dataframe outputs
data_cleaned = data[~data["MetricDate"].isin(Outliers)]
return data_cleaned
elif return_type == "holidayweeks_data":
data_hw = data[data["MetricDate"].isin(Outliers)]
return data_hw
elif return_type == "plot":
# Generate a line plot with matplotlib for the collaboration hours
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the collaboration hours
ax.plot(Calc["MetricDate"], Calc["mean_collab"].round(0), color="#1d627e", linewidth=3)
# Add a marker to indicate the holiday weeks
ax.scatter(Calc[Calc.Outlier==True]["MetricDate"], Calc[Calc.Outlier==True]["mean_collab"].round(0), color="#fe7f4f", marker="o", s=150, zorder=3)
# Create the strings for the title, subtitle and caption
subtitle_str = "Average collaboration hours where markers indicate holiday weeks"
cap_str = "Data from week of {} to week of {}".format(Calc["MetricDate"].min().strftime("%b %d, '%y"), Calc["MetricDate"].max().strftime("%b %d, '%y"))
# Set the title, subtitle, labels and limits of the plot
ax.set_xlabel("Date", fontsize=12, fontweight="bold")
ax.set_ylabel("Collaboration Hours", fontsize=12, fontweight="bold")
ax.set_ylim(0, None)
ax.text(x=ax.get_xlim()[0]-5, y=ax.get_ylim()[1]*1.10, s="Holiday Weeks", fontsize=16, fontweight="bold")
ax.text(x=ax.get_xlim()[0]-5, y=ax.get_ylim()[1]*1.05, s=subtitle_str, fontsize=12)
ax.text(x=ax.get_xlim()[0]-5,y=ax.get_ylim()[0]-5.5,s=cap_str, fontsize=12)
ax.xaxis.set_major_locator(FixedLocator(range(len(Calc)))) # Set the tick positions
ax.set_xticklabels(pd.to_datetime(Calc['MetricDate']).dt.strftime("%b %d, '%y"), rotation=45, ha="right")
ax.grid(False)
return fig
else:
raise ValueError("The `return_type` argument must be one of the following strings: 'text', 'labeled_data', 'cleaned_data', 'holidayweeks_data', or 'plot'.")
except:
# Check for the error in the input data
required_cols = ['MetricDate', 'Collaboration_hours']
for i in required_cols:
if i not in data.columns:
raise ValueError("The required variable {} is not present in the dataframe.".format(i))
if pd.api.types.is_datetime64_any_dtype(data['MetricDate'])==False:
raise ValueError("`MetricDate` appears not to be properly formatted. It needs to be in the format YYYY-MM-DD. Also check for missing values or stray values with inconsistent formats.")