Skip to content
# Install the Presidio analyzer and anonymizer packages
!pip install presidio_analyzer presidio_anonymizer
# Download spaCy's large English model
# (presumably the NLP model the analyzer engine loads by default -- confirm)
!python -m spacy download en_core_web_lg

Run Presidio on structured / semi-structured data

This sample shows how Presidio could potentially be extended to anonymize a table or data frame. It introduces methods for analyzing and anonymizing both lists and dicts.

Note: the sample inputs here are a Pandas DataFrame and a JSON-like nested dict, but the same approach can be used in other scenarios, such as querying SQL data or using Spark DataFrames.

Set up imports

from typing import List, Optional, Dict, Union, Iterator, Iterable
import collections
from dataclasses import dataclass
import pprint

import pandas as pd

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import EngineResult

Example using sample tabular data

# Column headers for the sample table: two free-text columns followed by
# two non-text columns.
columns = ["name phrase", "phone number phrase", "integer", "boolean"]

# One tuple per row, with values ordered like `columns`.
sample_data = [
    ("Charlie likes this", "Please call 212-555-1234 after 2pm", 1, True),
    ("You should talk to Mike", "his number is 978-428-7111", 2, False),
    ("Mary had a little startup", "Phone number: 202-342-1234", 3, False),
]

# Build the Pandas DataFrame from the rows and display it.
df = pd.DataFrame(data=sample_data, columns=columns)

df
name phrase phone number phrase integer boolean
0 Charlie likes this Please call 212-555-1234 after 2pm 1 True
1 You should talk to Mike his number is 978-428-7111 2 False
2 Mary had a little startup Phone number: 202-342-1234 3 False
# Convert the DataFrame to a dict with orient="list", i.e. each column name
# maps to the list of that column's values, then pretty-print the result.
df_dict = df.to_dict(orient="list")
pprint.pprint(df_dict)
{'boolean': [True, False, False],
 'integer': [1, 2, 3],
 'name phrase': ['Charlie likes this',
                 'You should talk to Mike',
                 'Mary had a little startup'],
 'phone number phrase': ['Please call 212-555-1234 after 2pm',
                         'his number is 978-428-7111',
                         'Phone number: 202-342-1234']}

# Set up the engines: a single AnalyzerEngine wrapped for batch use, plus a
# batch anonymizer for the results.
analyzer = AnalyzerEngine()
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
batch_anonymizer = BatchAnonymizerEngine()

# Analyze every column of the dict. The results are collected into a list so
# they can be displayed here and still be passed on afterwards
# (presumably analyze_dict yields its results lazily -- confirm).
analyzer_results = list(
    batch_analyzer.analyze_dict(input_dict=df_dict, language="en")
)
analyzer_results
[DictAnalyzerResult(key='name phrase', value=['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 7, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], [type: PERSON, start: 0, end: 4, score: 0.85]]),
 DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]),
 DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]),
 DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]
# Anonymize the analyzed columns, then load the anonymized values back into
# a DataFrame and display it.
anonymizer_results = batch_anonymizer.anonymize_dict(
    analyzer_results=analyzer_results
)
scrubbed_df = pd.DataFrame(data=anonymizer_results)
scrubbed_df
name phrase phone number phrase integer boolean
0 <PERSON> likes this Please call <PHONE_NUMBER> after <DATE_TIME> 1 True
1 You should talk to <PERSON> his number is <PHONE_NUMBER> 2 False
2 <PERSON> had a little startup Phone number: <PHONE_NUMBER> 3 False

Example using JSON

# Sample nested input mixing value types: a nested dict, a set, an int and
# a list of strings.
nested_dict = {
    "key_a": {"key_a1": "My phone number is 212-121-1424"},
    # NOTE: this is a one-element set literal, not a dict or a list.
    "key_b": {"www.abc.com"},
    "key_c": 3,
    "names": [
        "James Bond",
        "Clark Kent",
        "Hakeem Olajuwon",
        "No name here!",
    ],
}

pprint.pprint(nested_dict)
{'key_a': {'key_a1': 'My phone number is 212-121-1424'},
 'key_b': {'www.abc.com'},
 'key_c': 3,
 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}

# Step 1: run the batch analyzer over the nested dict.
analyzer_results = batch_analyzer.analyze_dict(
    input_dict=nested_dict,
    language="en",
)

# Step 2: anonymize using the analyzer results and pretty-print the outcome.
anonymizer_results = batch_anonymizer.anonymize_dict(
    analyzer_results=analyzer_results,
)
pprint.pprint(anonymizer_results)
{'key_a': {'key_a1': 'My phone number is <PHONE_NUMBER>'},
 'key_b': ['<URL>'],
 'key_c': 3,
 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}

Ignoring specific keys

# Keys passed to the analyzer as `keys_to_skip`.
keys_to_skip = ["key_a1", "names"]
analyzer_results = batch_analyzer.analyze_dict(
    input_dict=nested_dict,
    language="en",
    keys_to_skip=keys_to_skip,
)

# Anonymize using the analyzer results and pretty-print the outcome.
anonymizer_results = batch_anonymizer.anonymize_dict(
    analyzer_results=analyzer_results,
)
pprint.pprint(anonymizer_results)
{'key_a': {'key_a1': 'My phone number is 212-121-1424'},
 'key_b': ['<URL>'],
 'key_c': 3,
 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}

Ignoring nested keys

# A nested key is written as a dotted path: parent key, then child key.
keys_to_skip = ["key_a.key_a1"]

analyzer_results = batch_analyzer.analyze_dict(
    input_dict=nested_dict,
    language="en",
    keys_to_skip=keys_to_skip,
)

# Anonymize using the analyzer results and pretty-print the outcome.
anonymizer_results = batch_anonymizer.anonymize_dict(
    analyzer_results=analyzer_results,
)
pprint.pprint(anonymizer_results)
{'key_a': {'key_a1': 'My phone number is 212-121-1424'},
 'key_b': ['<URL>'],
 'key_c': 3,
 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}

Note!

JSON files with objects within lists, e.g.:

{
  "key": [
    {
      "key2": "Peter Parker"
    },
    {
      "key3": "555-1234"
    }
  ]
}

are not yet supported. Consider breaking the JSON into parts if needed.