Source code for vivainsights.identify_nkw
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Identify non-knowledge workers based on collaboration activity thresholds.
"""
__all__ = ['identify_nkw']
import pandas as pd
import numpy as np
from datetime import timedelta
import vivainsights as vi
from scipy import stats
[docs]
def identify_nkw(
    data: pd.DataFrame,
    collab_threshold=5,
    return_type='data_summary'
):
    """Identify non-knowledge workers based on collaboration activity.

    Groups the data by ``PersonId`` and ``Organization``, computes mean
    collaboration hours, and flags employees below the threshold as
    non-knowledge workers.

    Parameters
    ----------
    data : pandas.DataFrame
        Person query data. Must contain ``PersonId``, ``Organization``,
        and ``Collaboration_hours``.
    collab_threshold : int, default 5
        Average weekly collaboration hours below which a person is
        considered a non-knowledge worker. A mean exactly equal to the
        threshold is flagged as ``"kw"``; a missing mean is flagged as
        ``"nkw"``.
    return_type : str, default "data_summary"
        ``"data_with_flag"`` adds a ``flag_nkw`` column, ``"data_summary"``
        returns per-organization counts, ``"text"`` returns a diagnostic
        message, ``"data_clean"`` / ``"data_cleaned"`` returns only
        knowledge workers.

    Returns
    -------
    pandas.DataFrame or str
        Depending on *return_type*.

    Raises
    ------
    ValueError
        If *return_type* is not one of the supported values.

    Examples
    --------
    Return a text summary of non-knowledge workers:

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.identify_nkw(pq_data, collab_threshold=15, return_type="text")

    Return a flagged dataset with ``flag_nkw`` column:

    >>> vi.identify_nkw(pq_data, collab_threshold=15, return_type="data_with_flag")

    Return a summary table of NKW counts by organization:

    >>> vi.identify_nkw(pq_data, collab_threshold=15, return_type="data_summary")

    Return only the cleaned data (non-knowledge workers removed):

    >>> vi.identify_nkw(pq_data, collab_threshold=15, return_type="data_clean")
    """
    # Fail fast on a bad `return_type` instead of silently returning None
    # after doing all of the work.
    valid_return_types = (
        'data_with_flag', 'data_summary', 'text', 'data_clean', 'data_cleaned'
    )
    if return_type not in valid_return_types:
        raise ValueError(
            'Invalid value supplied to `return_type`: '
            f'{return_type!r}. Expected one of {valid_return_types}.'
        )

    # One row per (person, organization) with the mean collaboration hours.
    person_summary = (
        data.groupby(['PersonId', 'Organization'])
        .agg(mean_collab=('Collaboration_hours', 'mean'))
        .reset_index()
    )

    # A NaN mean compares False against the threshold and so falls to 'nkw'.
    person_summary['flag_nkw'] = np.where(
        person_summary['mean_collab'] >= collab_threshold, 'kw', 'nkw'
    )

    # Join on BOTH grouping keys: joining on PersonId alone duplicates the
    # rows of anyone who appears under more than one organization.
    data_with_flag = pd.merge(
        data,
        person_summary[['PersonId', 'Organization', 'flag_nkw']],
        on=['PersonId', 'Organization'],
        how='left',
    )

    if return_type == 'data_with_flag':
        return data_with_flag

    if return_type in ('data_clean', 'data_cleaned'):
        return data_with_flag[data_with_flag['flag_nkw'] == 'kw']

    if return_type == 'data_summary':
        org_summary = (
            person_summary.groupby(['Organization', 'flag_nkw'])
            .size()
            .reset_index(name='total')
        )
        # `transform` keeps the original index, so the share of each flag
        # within its organization is aligned by label rather than position.
        org_totals = org_summary.groupby('Organization')['total'].transform('sum')
        org_summary['perc'] = org_summary['total'] / org_totals

        org_summary = org_summary[org_summary['flag_nkw'] == 'nkw']
        org_summary = org_summary.rename(
            columns={'total': 'n_nkw', 'perc': 'perc_nkw'}
        )
        return org_summary[['Organization', 'n_nkw', 'perc_nkw']]

    # return_type == 'text'
    # Count distinct people so the figure is comparable with the population,
    # which is also a count of distinct PersonId values.
    is_nkw = person_summary['flag_nkw'] == 'nkw'
    n_nkw = person_summary.loc[is_nkw, 'PersonId'].nunique()

    if n_nkw == 0:
        return f"[Pass] There are no non-knowledge workers identified (average collaboration hours below {collab_threshold} hours)."

    return f"[Warning] Out of a population of {data['PersonId'].nunique()}, there are {n_nkw} employees who may be non-knowledge workers (average collaboration hours below {collab_threshold} hours)."