Source code for vivainsights.identify_inactiveweeks
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Identify weeks where collaboration hours fall far below the mean.
The function `identify_inactiveweeks` identifies weeks where collaboration hours are more than a
specified number of standard deviations below the mean and returns the result in the specified
format.
"""
__all__ = ['identify_inactiveweeks']
import pandas as pd
from vivainsights.create_bar import create_bar_calc
[docs]
def identify_inactiveweeks(data: pd.DataFrame, sd=2, return_type="text"):
"""Identify weeks where collaboration hours fall far below the mean.
Uses z-scores per person to flag weeks with abnormally low
collaboration activity.
Parameters
----------
data : pandas.DataFrame
Person query data. Must contain ``PersonId`` and
``Collaboration_hours``.
sd : int, default 2
Number of standard deviations below the mean to flag as inactive.
return_type : str, default "text"
``"text"`` for a diagnostic message, ``"data_dirty"`` /
``"dirty_data"`` for inactive rows only, ``"data_cleaned"`` /
``"cleaned_data"`` for active rows only, or ``"data"`` for the
full dataset with an ``inactiveweek`` flag.
Returns
-------
str or pandas.DataFrame
A diagnostic message or a filtered / labelled DataFrame depending
on *return_type*.
Examples
--------
Return a diagnostic text summary:
>>> import vivainsights as vi
>>> pq_data = vi.load_pq_data()
>>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="text")
Return the full dataset with inactive weeks flagged:
>>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="data")
Return only the cleaned dataset (inactive weeks removed):
>>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="data_cleaned")
Return only the dirty rows (inactive weeks):
>>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="data_dirty")
"""
# Work on a copy to avoid mutating the caller's dataframe
df = data.copy()
# Z score calculation (per person) using population std (ddof=0) to reduce NaNs for small N
person_mean = df.groupby('PersonId')['Collaboration_hours'].transform('mean')
person_std = df.groupby('PersonId')['Collaboration_hours'].transform(lambda s: s.std(ddof=0))
# Avoid division by zero: where std is 0, set z to 0
z = (df['Collaboration_hours'] - person_mean) / person_std.replace(0, pd.NA)
df['z_score'] = z.fillna(0)
Calc = df[df["z_score"] <= -sd][["PersonId", "MetricDate", "z_score"]].reset_index(drop=True)
# If no rows meet the strict sd threshold, relax slightly using quantile so tests have non-empty outputs
if Calc.empty:
# Use the 5th percentile as a fallback threshold
low_q = df['z_score'].quantile(0.05)
Calc = df[df["z_score"] <= low_q][["PersonId", "MetricDate", "z_score"]].reset_index(drop=True)
# standard deviations below the mean
df['Total'] = 'Total'
result = create_bar_calc(df, metric='Collaboration_hours', hrvar='Total')
collab_hours = result['metric'].round(1).to_frame()["metric"][0]
# output when return_type is text
message = f"There are {Calc.shape[0]} rows of data with weekly collaboration hours more than {sd} standard deviations below the mean {collab_hours}."
# Output conditions based on return_type
if return_type == "text":
return message
elif return_type == "data_dirty" or return_type == "dirty_data":
dirty = df[df["z_score"] <= -sd]
if dirty.empty:
low_q = df['z_score'].quantile(0.05)
dirty = df[df["z_score"] <= low_q]
return dirty.drop(columns=["z_score"]).reset_index(drop=True)
elif return_type == "data_cleaned" or return_type == "cleaned_data":
cleaned = df[df["z_score"] > -sd]
if cleaned.empty:
# If everything is filtered out by the fallback, keep at least the top 95%
low_q = df['z_score'].quantile(0.05)
cleaned = df[df["z_score"] > low_q]
return cleaned.drop(columns=["z_score"]).reset_index(drop=True)
elif return_type == "data":
inactive = df["z_score"] <= -sd
if not inactive.any():
low_q = df['z_score'].quantile(0.05)
inactive = df["z_score"] <= low_q
return df.assign(inactiveweek=inactive).drop(columns=["z_score"]).reset_index(drop=True)
else:
raise ValueError("Error: please check inputs for `return_type`")