# Source code for vivainsights.network_p2p

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
"""
This module performs network analysis with a person-to-person query
"""
import vivainsights as vi
import pandas as pd
import igraph as ig
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.legend_handler import HandlerTuple
import matplotlib.lines as mlines
from matplotlib.backends.backend_pdf import PdfPages
import random
from sklearn.preprocessing import minmax_scale
import warnings
import time

def network_p2p(
    data,
    hrvar = "Organization",
    return_type = "plot",
    centrality = None,
    community = None,
    weight = None,
    comm_args = None,
    layout = "mds",
    path = "",
    style = "igraph",
    bg_fill = "#FFFFFF",
    font_col = "grey20",
    legend_pos = "best",
    palette = "rainbow",
    node_alpha = 0.7,
    edge_alpha = 1,
    edge_col = "#777777",
    node_sizes = [1, 20],
    node_scale = 1,
    seed = 1,
    legend_ncols = 0
):
    """
    Name
    ----
    network_p2p

    Description
    -----------
    Return a network plot or summary output given a data frame containing a
    person-to-person query.

    Parameters
    ----------
    data : pandas.DataFrame
        Person-to-person query containing `PrimaryCollaborator_PersonId`,
        `SecondaryCollaborator_PersonId` and the corresponding HR attribute
        columns (e.g. `PrimaryCollaborator_Organization`).
    hrvar : str
        Label of the HR attribute. Defaults to "Organization".
    return_type : str
        One of `'plot'` (default), `'plot-pdf'`, `'sankey'`, `'table'`,
        `'data'` or `'network'`. See Returns.
    centrality : str or None
        Centrality measure used to scale node sizes. One of `'betweenness'`,
        `'closeness'`, `'degree'`, `'eigenvector'`, `'pagerank'`. All measures
        are calculated and reflected in the `'network'` and `'data'` outputs.
        When None, no centrality is calculated and all nodes share one size.
    community : str or None
        Community detection algorithm. One of `'multilevel'` (a version of
        louvain), `'leiden'`, `'edge_betweenness'`, `'fastgreedy'`,
        `'infomap'`, `'label_propagation'`, `'leading_eigenvector'`,
        `'optimal_modularity'`, `'spinglass'`, `'walk_trap'`. None (default)
        computes analysis or visuals without communities.
    weight : str or None
        Name of the column to use as edge weights. None creates a graph in
        which every edge has weight 1.
    comm_args : dict or None
        Named arguments passed through to igraph's clustering algorithms,
        e.g. `{"resolution": 0.01}` for `'leiden'`.
    layout : str
        Node placement algorithm. Defaults to `'mds'` (deterministic
        multi-dimensional scaling).
    path : str
        File-path prefix for the PDF output. When empty (default), a prefix
        derived from the community algorithm is used. A timestamp and '.pdf'
        are always appended.
    style : str
        Placeholder; only `'igraph'` is currently supported (legacy argument
        from the R implementation).
    bg_fill : str
        Background fill colour of the plot.
    font_col : str
        Font colour (reserved; not used by the igraph renderer).
    legend_pos : str
        Matplotlib legend location, e.g. `'best'`, `'upper right'`,
        `'lower left'`, `'center'`, ...
    palette : str
        Name of the palette function. Only `'rainbow'` (default) is supported;
        currently unused by the igraph renderer, which draws from 'tab20'.
    node_alpha : float
        Node transparency between 0 and 1 (reserved for the 'ggraph' style).
    edge_alpha : float
        Edge transparency between 0 and 1 (reserved for the 'ggraph' style).
    edge_col : str
        Colour of the legend frame (and edges in the 'ggraph' style).
    node_sizes : list
        Two-element [min, max] range that node sizes are rescaled to when
        `centrality` is set to a non-null value.
    node_scale : numeric
        Multiplier applied to the 'node_size' vertex attribute at plot time to
        increase or decrease the size of the nodes.
    seed : int
        Seed for the random number generator used before community detection,
        to ensure consistency. Only applicable when `community` is set.
    legend_ncols : int
        0 (default) chooses the number of legend columns automatically
        (horizontal layout); 1 forces a single vertical column.

    Returns
    -------
    Depends on `return_type`:
    - `'plot'`: display a network plot.
    - `'plot-pdf'`: save the network plot as a timestamped PDF; recommended
      for large graphs. Use together with `path` to control the location.
    - `'sankey'`: sankey plot combining communities and the HR attribute;
      only valid when a community detection method is selected.
    - `'table'`: vertex summary table with counts by community and HR
      attribute; average centrality values are included per group when
      `centrality` is non-null.
    - `'data'`: vertex data frame matching vertices with communities and HR
      attributes.
    - `'network'`: the 'igraph' Graph object.

    Examples
    --------
    >>> vi.network_p2p(data = p2p_data, return_type = "plot")
    >>> vi.network_p2p(data = p2p_data, community = "leiden",
    ...                comm_args = {"resolution": 0.01}, return_type = "table")
    >>> vi.network_p2p(data = p2p_data, centrality = "betweenness",
    ...                return_type = "table")
    """
    # Respect a caller-supplied path prefix; only derive a default when none
    # is given. (Previously the argument was unconditionally overwritten.)
    if not path:
        path = "p2p" + ("" if community is None else "_" + community)

    # `style` is currently a placeholder as only igraph is supported
    # (legacy argument from the R implementation).
    style = "igraph"

    if len(node_sizes) != 2:
        raise ValueError("`node_sizes` must be of length 2")

    # Set data frame for edges; default every edge to weight 1 when no
    # weight column is supplied.
    if weight is None:
        edges = (
            data.assign(NoWeight = 1)
            .loc[:, ["PrimaryCollaborator_PersonId", "SecondaryCollaborator_PersonId", "NoWeight"]]
            .rename(columns = {"NoWeight": "weight"})
        )
    else:
        edges = data.loc[:, ["PrimaryCollaborator_PersonId", "SecondaryCollaborator_PersonId", weight]]

    pc_hrvar = "PrimaryCollaborator_" + hrvar
    sc_hrvar = "SecondaryCollaborator_" + hrvar

    # TieOrigin = PrimaryCollaborator
    tieOrigin = (
        edges[["PrimaryCollaborator_PersonId"]].drop_duplicates()
        .merge(data[["PrimaryCollaborator_PersonId", pc_hrvar]],
               on = "PrimaryCollaborator_PersonId", how = "left")  # left join
        .rename(columns = {"PrimaryCollaborator_PersonId": "node"})
        .assign(**{hrvar: lambda df: df[pc_hrvar]})  # assign new column
        .drop(columns = [pc_hrvar])
    )

    # TieDest = SecondaryCollaborator
    tieDest = (
        edges[["SecondaryCollaborator_PersonId"]].drop_duplicates()
        .merge(data[["SecondaryCollaborator_PersonId", sc_hrvar]],
               on = "SecondaryCollaborator_PersonId", how = "left")
        .rename(columns = {"SecondaryCollaborator_PersonId": "node"})
        .assign(**{hrvar: lambda df: df[sc_hrvar]})
        .drop(columns = [sc_hrvar])
    )

    # Vertices data frame to provide meta-data
    vert_ft = pd.concat([tieOrigin, tieDest]).drop_duplicates()

    # Create igraph object
    g_raw = ig.Graph.TupleList(edges.itertuples(index = False), directed = True, weights = True)

    # Assign vertex attributes - HR attribute and node
    g_raw.vs[hrvar] = vert_ft[hrvar].tolist()
    g_raw.vs["node"] = vert_ft["node"].tolist()

    # Assign weights from the third edge column. Using the positional column
    # makes custom `weight` column names work (indexing `edges["weight"]`
    # raised a KeyError whenever `weight` was supplied with another name).
    g_raw.es["weight"] = edges.iloc[:, -1].tolist()

    # Allowed community values
    valid_comm = ["leiden", "multilevel", "edge_betweenness", "fastgreedy",
                  "infomap", "label_propagation", "leading_eigenvector",
                  "optimal_modularity", "spinglass", "walk_trap"]

    # Finalise `g` object.
    # If community detection is selected, this is where communities are appended.
    if community is None:
        # Note: NOT simplified as simplification may remove too many edges
        g = g_raw
        v_attr = hrvar  # name of vertex attribute used for grouping/colouring
    elif community in valid_comm:
        random.seed(seed)  # reproducibility for stochastic algorithms
        g_ud = g_raw.as_undirected()  # community detection expects undirected

        # Resolve the igraph clustering method and its (named) arguments.
        comm_func = getattr(ig.Graph, "community_" + community)
        if comm_args is None:
            comm_args = {}

        comm_out = comm_func(graph = g_ud, **comm_args)

        # Note: NOT simplified as simplification may remove too many edges
        g = g_ud
        g.vs["cluster"] = [str(member) for member in comm_out.membership]
        v_attr = "cluster"
    else:
        raise ValueError("Please enter a valid input for `community`.")

    # Centrality calculations ------------------------------------------------
    valid_cent = ["betweenness", "closeness", "degree", "eigenvector", "pagerank"]

    if centrality in valid_cent:
        # Attach all centrality measures as vertex attributes.
        g = vi.network_summary(g, return_type = "network")
        # Rescale the selected centrality into the documented [min, max] range
        # requested via `node_sizes`, then shrink for plotting. (The previous
        # arithmetic ignored the lower bound and inflated sizes by a
        # squared-range term.)
        scaled = minmax_scale(g.vs[centrality],
                              feature_range = (node_sizes[0], node_sizes[1]))
        g.vs["node_size"] = [s / 100 for s in scaled]  # scale for plotting
    elif centrality is None:
        # All nodes share the same size when centrality is not calculated.
        g.vs["node_size"] = [0.08] * g.vcount()
    else:
        raise ValueError("Please enter a valid input for `centrality`.")

    # Common area -------------------------------------------------------------
    # Vertex table: vertices matched with communities / HR attributes.
    vert_ft = vert_ft.rename(columns = {"node": "name"})

    if centrality is not None:
        # Invalid `centrality` values raised above, so all measures exist.
        cent_cols = {measure: g.vs[measure] for measure in valid_cent}
        if community is None:
            vert_tb = pd.DataFrame({"name": g.vs["name"], **cent_cols})
        else:
            vert_tb = pd.DataFrame({"name": g.vs["name"],
                                    "cluster": g.vs[v_attr],
                                    **cent_cols})
    else:
        if community is None:
            vert_tb = pd.DataFrame({"name": g.vs["name"]})
        else:
            vert_tb = pd.DataFrame({"name": g.vs["name"],
                                    "cluster": g.vs[v_attr]})

    # Merge hrvar to vertex table
    vert_tb = vert_tb.merge(vert_ft, on = "name", how = "left").drop_duplicates()

    # Timestamped output path for the PDF return option.
    out_path = path + "_" + time.strftime("%y%m%d_%H%M%S") + ".pdf"

    # Return outputs ----------------------------------------------------------
    if return_type in ["plot", "plot-pdf"]:
        # Only the igraph renderer is implemented; node colours are drawn from
        # the 'tab20' colormap inside `plot_basic_graph`. (A previous version
        # also built an unused colour table via `eval` on `palette`; that dead
        # code has been removed.)
        vert_tb = vert_tb.drop_duplicates()

        g.vs["frame_color"] = None
        g.es["width"] = 1

        # Internal basic plotting function used inside 'network_p2p()'.
        def plot_basic_graph(lpos = legend_pos, pdf = False, node_scale = node_scale):
            fig, ax = plt.subplots(figsize = (10, 10))
            plt.rcParams["figure.facecolor"] = bg_fill

            layout_func = getattr(ig.Graph, f"layout_{layout}")

            # One colour per unique grouping value, drawn from 'tab20'.
            unique_values = list(set(g.vs[v_attr]))
            cmap = mcolors.ListedColormap(
                [plt.get_cmap("tab20")(i) for i in range(len(unique_values))]
            )
            value_to_index = {value: i for i, value in enumerate(unique_values)}

            # Legend: one marker per group.
            handles = []
            labels = []
            for i, value in enumerate(unique_values):
                marker = mlines.Line2D([0], [0], marker = "o", color = "w",
                                       label = value, markerfacecolor = cmap(i),
                                       markersize = 5)
                handles.append(marker)
                labels.append(value)

            # Set node colours.
            for i in range(g.vcount()):
                g.vs[i]["color"] = cmap(value_to_index[g.vs[i][v_attr]])

            # Scale the size of the nodes.
            g.vs["node_size"] = [x * node_scale for x in g.vs["node_size"]]

            ig.plot(
                g,
                layout = layout_func(g),
                target = ax,
                vertex_label = None,
                vertex_size = g.vs["node_size"],
                edge_arrow_mode = "0",
                edge_arrow_size = 0,
                edge_color = "#adadad",
            )

            # Number of legend columns.
            if legend_ncols == 0:
                if len(handles) <= 10:
                    leg_cols = len(handles)
                elif 10 < len(handles) <= 20:
                    leg_cols = len(handles) // 2
                else:
                    leg_cols = len(handles) // 4
                    warnings.warn(
                        "There are over 20 unique node categories. Consider changing your "
                        "grouping variable, merging existing groups, or tweaking algorithm "
                        "parameters (if applicable).",
                        UserWarning
                    )
            else:
                leg_cols = 1

            plt.legend(
                handles = handles,
                labels = labels,
                handler_map = {tuple: HandlerTuple(ndivide = 20)},
                loc = lpos,  # previously ignored `lpos` and re-read the outer default
                edgecolor = edge_col,
                frameon = True,
                markerscale = 1,
                fontsize = 5,
                labelcolor = "grey",
                ncols = leg_cols,
            )

            if pdf:
                return fig
            return plt.show()

        if return_type == "plot":
            plot_basic_graph(lpos = legend_pos)
        else:  # "plot-pdf": default PDF output under the timestamped path
            with PdfPages(out_path) as pdf:
                pdf.savefig(plot_basic_graph(pdf = True))
            print(f"Saved to {out_path}.")

    elif return_type == "data":
        vert_tb = vert_tb.reset_index(drop = True)
        return vert_tb

    elif return_type == "network":
        return g

    elif return_type == "sankey":
        if community is None:
            raise ValueError(
                "Note: no sankey return option is available if `None` is selected at "
                "`community`. Please specify a valid community detection algorithm."
            )
        # `community` was validated above, so 'cluster' exists on the table.
        return vi.create_sankey(
            data = vert_tb.groupby([hrvar, "cluster"]).size().reset_index(name = "n"),
            var1 = hrvar,
            var2 = "cluster",
        )

    elif return_type == "table":
        # Group by HR attribute, and additionally by community when detected.
        group_cols = [hrvar] if community is None else [hrvar, "cluster"]
        if centrality is None:
            vert_tb = vert_tb.groupby(group_cols).size().reset_index(name = "n")
        else:
            vert_tb = vert_tb.groupby(group_cols).agg(
                n = ("betweenness", "size"),
                betweenness = ("betweenness", "mean"),
                closeness = ("closeness", "mean"),
                degree = ("degree", "mean"),
                eigenvector = ("eigenvector", "mean"),
                pagerank = ("pagerank", "mean"),
            )
        return vert_tb

    else:
        raise ValueError("invalid input for `return_type`.")