# Source code for vivainsights.identify_inactiveweeks

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Identify weeks where collaboration hours fall far below the mean.

The function `identify_inactiveweeks` identifies weeks where collaboration hours are more than a
specified number of standard deviations below the mean and returns the result in the specified
format.
"""

__all__ = ['identify_inactiveweeks']

import pandas as pd
from vivainsights.create_bar import create_bar_calc

def identify_inactiveweeks(data: pd.DataFrame, sd=2, return_type="text"):
    """Identify weeks where collaboration hours fall far below the mean.

    Uses per-person z-scores of ``Collaboration_hours`` to flag weeks with
    abnormally low collaboration activity.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Must contain ``PersonId``, ``MetricDate`` and
        ``Collaboration_hours``.
    sd : int, default 2
        Number of standard deviations below the mean at (or beyond) which a
        week is flagged as inactive.
    return_type : str, default "text"
        ``"text"`` for a diagnostic message, ``"data_dirty"`` / ``"dirty_data"``
        for inactive rows only, ``"data_cleaned"`` / ``"cleaned_data"`` for
        active rows only, or ``"data"`` for the full dataset with an
        ``inactiveweek`` flag.

    Returns
    -------
    str or pandas.DataFrame
        A diagnostic message or a filtered / labelled DataFrame depending on
        *return_type*.

    Raises
    ------
    ValueError
        If *return_type* is not one of the recognised options.

    Examples
    --------
    Return a diagnostic text summary:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="text")

    Return the full dataset with inactive weeks flagged:

    >>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="data")

    Return only the cleaned dataset (inactive weeks removed):

    >>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="data_cleaned")

    Return only the dirty rows (inactive weeks):

    >>> vi.identify_inactiveweeks(pq_data, sd=2, return_type="data_dirty")
    """
    # Work on a copy to avoid mutating the caller's dataframe.
    df = data.copy()

    # Per-person z-score using population std (ddof=0) so a person with a
    # single observed week yields std 0 rather than NaN.
    grouped = df.groupby('PersonId')['Collaboration_hours']
    person_mean = grouped.transform('mean')
    person_std = grouped.transform(lambda s: s.std(ddof=0))

    # Where std is 0 the numerator is also 0: treat the z-score as 0 instead
    # of dividing by zero (replace 0 -> NA, then fill the resulting NaNs).
    z = (df['Collaboration_hours'] - person_mean) / person_std.replace(0, pd.NA)
    df['z_score'] = z.fillna(0)

    # A week is inactive when it sits at least `sd` standard deviations below
    # that person's mean. No fallback threshold: if nothing crosses it, the
    # result is legitimately empty. (A previous quantile fallback silently
    # returned rows that were NOT beyond the threshold, contradicting the
    # documented contract and the count reported in the message below.)
    inactive = df['z_score'] <= -sd
    calc = df.loc[inactive, ['PersonId', 'MetricDate', 'z_score']].reset_index(drop=True)

    # Overall mean collaboration hours for the diagnostic message. Use
    # `assign` so the helper 'Total' column never pollutes the returned data.
    result = create_bar_calc(
        df.assign(Total='Total'), metric='Collaboration_hours', hrvar='Total'
    )
    collab_hours = result['metric'].round(1).iloc[0]

    message = f"There are {calc.shape[0]} rows of data with weekly collaboration hours more than {sd} standard deviations below the mean {collab_hours}."

    # Output conditions based on return_type.
    if return_type == "text":
        return message
    elif return_type in ("data_dirty", "dirty_data"):
        return df[inactive].drop(columns=["z_score"]).reset_index(drop=True)
    elif return_type in ("data_cleaned", "cleaned_data"):
        return df[~inactive].drop(columns=["z_score"]).reset_index(drop=True)
    elif return_type == "data":
        return (
            df.assign(inactiveweek=inactive)
            .drop(columns=["z_score"])
            .reset_index(drop=True)
        )
    else:
        raise ValueError("Error: please check inputs for `return_type`")