# Source code for vivainsights.extract_hr
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
Extract HR or organizational attribute columns from a Viva Insights dataset.
There is an option to return either just a list of the variable names
or a DataFrame containing only the variables themselves.
"""
__all__ = ['extract_hr']
import pandas as pd
# [docs]
def extract_hr(
    data: pd.DataFrame,
    max_unique: int = 50,
    exclude_constants: bool = True,
    return_type: str = "names",
):
    """
    Extract HR attributes (organizational data) by detecting variable class
    and number of unique values.

    A column is treated as an HR attribute when it holds text (``object``
    dtype or a pandas string dtype) and has at most ``max_unique`` distinct
    non-null values.

    Parameters
    ----------
    data : pandas.DataFrame
        Data from which to extract HR variables.
    max_unique : int
        Maximum number of unique values a column can have to be included.
        Defaults to 50.
    exclude_constants : bool
        Whether to exclude columns with exactly one unique value.
        Defaults to ``True``.
    return_type : str
        Output type, matched case-insensitively. ``"names"`` (default)
        prints the column names to the console, ``"vars"`` returns the
        filtered DataFrame, ``"suggestion"`` returns a list of column names.

    Returns
    -------
    pandas.DataFrame, list of str, or None
        Depends on ``return_type``: a DataFrame of HR columns for ``"vars"``,
        a list of column names for ``"suggestion"``, or ``None`` for
        ``"names"`` (the names are printed instead). ``None`` is also
        returned, after printing an error message, when ``max_unique``,
        ``exclude_constants`` or ``return_type`` is invalid.

    Raises
    ------
    TypeError
        If ``data`` is not a pandas DataFrame.

    Examples
    --------
    Print HR variable names to console (default):

    >>> import vivainsights as vi
    >>> pq_data = vi.load_pq_data()
    >>> vi.extract_hr(data=pq_data)

    Return the HR columns as a filtered DataFrame:

    >>> vi.extract_hr(data=pq_data, return_type="vars")

    Return a list of suggested HR column names:

    >>> vi.extract_hr(data=pq_data, return_type="suggestion")

    Adjust the maximum unique values threshold:

    >>> vi.extract_hr(data=pq_data, max_unique=50, return_type="names")
    """
    # Argument problems are reported by printing a message and returning
    # None, which is how this function has always signalled bad input.
    # Each check is independent so none of them can be bypassed by another.
    if not isinstance(max_unique, int):
        print("Error! var max_unique should be an integer value. Please try again.")
        return None
    if not isinstance(exclude_constants, bool):
        print("Error! var exclude_constants should be an boolean(True/False) value. Please try again.")
        return None

    # Normalise once so validation and dispatch agree on letter case.
    mode = return_type.lower() if isinstance(return_type, str) else None
    if mode not in ("names", "vars", "suggestion"):
        print("Please check input to `return_type`.")
        return None

    if not isinstance(data, pd.DataFrame):
        raise TypeError(
            "`data` should be a pandas DataFrame, got "
            + type(data).__name__
            + "."
        )

    # Text columns are stored as `object` by default, but may also use the
    # dedicated pandas string dtype; accept both.
    text_positions = [
        position
        for position, dtype in enumerate(data.dtypes)
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    ]
    # Positional selection keeps this safe for duplicated column labels.
    text_data = data.iloc[:, text_positions]

    # nunique() ignores missing values, so an all-null column counts as 0
    # distinct values and is not considered a constant.
    unique_counts = text_data.nunique()
    kept_positions = [
        position
        for position, count in enumerate(unique_counts)
        if count <= max_unique and not (exclude_constants and count == 1)
    ]
    hr_data = text_data.iloc[:, kept_positions]

    if mode == "vars":
        return hr_data
    if mode == "suggestion":
        return hr_data.columns.tolist()

    # "names": each name is followed by ",\n", matching the established
    # console format; nothing is returned.
    print(*(f"{name},\n" for name in hr_data.columns))
    return None