# Install Presidio (analyzer + anonymizer) and download the spaCy en_core_web_lg model
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg
Use Presidio Anonymizer for Pseudonymization of PII data
Pseudonymization is a data management and de-identification procedure by which personally identifiable information fields within a data record are replaced by one or more artificial identifiers, or pseudonyms. (https://en.wikipedia.org/wiki/Pseudonymization)
In this notebook, we'll show an example of how to use the Presidio Anonymizer library to pseudonymize PII data. In this example, we will replace each value with a unique identifier (e.g. <PERSON_14>). Then, we'll de-anonymize the data by replacing the unique identifiers back with their mapped PII values.
Important: The following logic is not thread-safe and may produce incorrect results if run concurrently in a multi-threaded environment, since the mapping has to be shared between threads/workers/processes.
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine, OperatorConfig
from presidio_anonymizer.operators import Operator, OperatorType
from typing import Dict
from pprint import pprint
1. Using the `AnalyzerEngine` to identify PII in a text
# Sample sentence containing several person names and two locations.
text = "Peter gave his book to Heidi which later gave it to Nicole. Peter lives in London and Nicole lives in Tashkent."
print("original text:")
pprint(text)
# Analyze the English text; `analyzer_results` is passed to the anonymizer further down.
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(text=text, language="en")
print("analyzer results:")
pprint(analyzer_results)
original text: ('Peter gave his book to Heidi which later gave it to Nicole. Peter lives in ' 'London and Nicole lives in Tashkent.') analyzer results: [type: PERSON, start: 0, end: 5, score: 0.85, type: PERSON, start: 23, end: 28, score: 0.85, type: PERSON, start: 52, end: 58, score: 0.85, type: PERSON, start: 60, end: 65, score: 0.85, type: LOCATION, start: 75, end: 81, score: 0.85, type: PERSON, start: 86, end: 92, score: 0.85, type: LOCATION, start: 102, end: 110, score: 0.85]
2. Creating a custom anonymizer (called an Operator) which replaces each entity value with a unique identifier.
To create a custom anonymizer, we need to create a class that inherits from `Operator` and implements the `operate` method. This method receives the original text and a dictionary called `params` with the configuration defined by the user. The method should return the anonymized text.
In this example we also implement the `validate` method to check that the input parameters are available, i.e. that the `entity_type` and `entity_mapping` parameters are defined, as they are required for this specific anonymizer. `entity_mapping` is a dictionary that maps each entity value to a unique identifier, for each entity type.
class InstanceCounterAnonymizer(Operator):
    """
    Anonymizer which replaces the entity value
    with an instance counter per entity.

    Every distinct value of an entity type is mapped to a placeholder built
    from ``REPLACING_FORMAT`` (for example ``<PERSON_0>``). The placeholders
    are recorded in the caller-supplied ``entity_mapping`` dict, which is
    mutated in place so the caller can reuse it afterwards.
    """

    REPLACING_FORMAT = "<{entity_type}_{index}>"

    def operate(self, text: str, params: Dict = None) -> str:
        """Return the placeholder for ``text``, creating a new one if needed.

        :param text: The entity value to replace.
        :param params: Must contain ``entity_type`` (the entity's type name)
            and ``entity_mapping`` (``{entity type: {value: placeholder}}``).
        :return: The placeholder already mapped to ``text``, or a newly
            created one whose index is one above the highest index in use
            for this entity type (starting at 0).
        """
        entity_type: str = params["entity_type"]

        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping: Dict[str, Dict[str, str]] = params["entity_mapping"]

        mapping_for_type = entity_mapping.get(entity_type)
        if not mapping_for_type:
            # Nothing recorded yet for this entity type: start a fresh
            # per-type mapping inside the shared dict.
            mapping_for_type = entity_mapping[entity_type] = {}
        elif text in mapping_for_type:
            # A value seen before always gets the same placeholder back.
            return mapping_for_type[text]

        # For an empty mapping _get_last_index returns -1, so counting
        # starts at 0.
        new_text = self.REPLACING_FORMAT.format(
            entity_type=entity_type,
            index=self._get_last_index(mapping_for_type) + 1,
        )
        mapping_for_type[text] = new_text
        return new_text

    @staticmethod
    def _get_last_index(entity_mapping_for_type: Dict) -> int:
        """Get the last index for a given entity type.

        :param entity_mapping_for_type: ``{value: placeholder}`` for one
            entity type.
        :return: The highest index found in the placeholders, or ``-1`` when
            the mapping is empty.
        """

        def get_index(value: str) -> int:
            # "<PERSON_14>" -> "14>" -> "14" -> 14
            return int(value.split("_")[-1][:-1])

        return max(
            (get_index(v) for v in entity_mapping_for_type.values()),
            default=-1,
        )

    def validate(self, params: Dict = None) -> None:
        """Validate operator parameters.

        :raises ValueError: If ``params`` is missing, or lacks the
            ``entity_mapping`` or ``entity_type`` key.
        """
        # ``params`` defaults to None; report that as a missing parameter
        # rather than failing with a TypeError on the membership test.
        if params is None or "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        """Return the name under which this operator is referenced."""
        return "entity_counter"

    def operator_type(self) -> OperatorType:
        """Return the operator category (an anonymizing operator)."""
        return OperatorType.Anonymize
3. Passing the new operator to the `AnonymizerEngine` and using it to anonymize the text.
# Build the anonymizer engine and register the custom operator class with it
anonymizer_engine = AnonymizerEngine()
anonymizer_engine.add_anonymizer(InstanceCounterAnonymizer)

# Shared mapping filled in by the operator:
# entity type -> {original value -> placeholder}
entity_mapping = {}

# Run the "entity_counter" operator, registered under the "DEFAULT" key
anonymized_result = anonymizer_engine.anonymize(
    text=text,
    analyzer_results=analyzer_results,
    operators={
        "DEFAULT": OperatorConfig(
            "entity_counter",
            params={"entity_mapping": entity_mapping},
        )
    },
)

print(anonymized_result.text)
<PERSON_1> gave his book to <PERSON_2> which later gave it to <PERSON_0>. <PERSON_1> lives in <LOCATION_1> and <PERSON_0> lives in <LOCATION_0>.
Note that the indices are assigned in reverse order of appearance (the last entity in the text gets index 0), due to the way entities are replaced in Presidio.
Since the user/client is holding the entity_mapping, it is possible to use it for de-anonymization as well. First, let's look at its contents.
# Show the mapping populated during anonymization: entity type -> {original value -> placeholder}
pprint(entity_mapping, indent=2)
{ 'LOCATION': {'London': '<LOCATION_1>', 'Tashkent': '<LOCATION_0>'}, 'PERSON': { 'Heidi': '<PERSON_2>', 'Nicole': '<PERSON_0>', 'Peter': '<PERSON_1>'}}
4. De-anonymizing the text using the `entity_mapping`
Similar to the anonymization operator, we need to create a custom de-anonymization operator. This operator will replace the unique identifiers with the original values.
class InstanceCounterDeanonymizer(Operator):
    """
    Deanonymizer which replaces the unique identifier
    with the original text.

    The lookup is a reverse search through ``entity_mapping``, whose inner
    dicts map original values to their placeholders.
    """

    def operate(self, text: str, params: Dict = None) -> str:
        """De-anonymize the input text.

        :param text: The placeholder to resolve (a value of the per-type
            mapping).
        :param params: Must contain ``entity_type`` (the entity's type name)
            and ``entity_mapping`` (``{entity type: {value: placeholder}}``).
        :return: The original value whose placeholder equals ``text``.
        :raises ValueError: If the entity type is not in the mapping, or no
            value of that type maps to ``text``.
        """
        entity_type: str = params["entity_type"]

        # entity_mapping is a dict of dicts containing mappings per entity type
        entity_mapping: Dict[str, Dict[str, str]] = params["entity_mapping"]

        if entity_type not in entity_mapping:
            raise ValueError(f"Entity type {entity_type} not found in entity mapping!")

        # One pass over the mapping both checks that the placeholder exists
        # and recovers the original value.
        original = self._find_key_by_value(entity_mapping[entity_type], text)
        if original is None:
            raise ValueError(
                f"Text {text} not found in entity mapping for entity type {entity_type}!"
            )
        return original

    @staticmethod
    def _find_key_by_value(entity_mapping, value):
        """Return the first key whose value equals ``value``, else ``None``."""
        return next(
            (key for key, val in entity_mapping.items() if val == value),
            None,
        )

    def validate(self, params: Dict = None) -> None:
        """Validate operator parameters.

        :raises ValueError: If ``params`` is missing, or lacks the
            ``entity_mapping`` or ``entity_type`` key.
        """
        # ``params`` defaults to None; report that as a missing parameter
        # rather than failing with a TypeError on the membership test.
        if params is None or "entity_mapping" not in params:
            raise ValueError("An input Dict called `entity_mapping` is required.")
        if "entity_type" not in params:
            raise ValueError("An entity_type param is required.")

    def operator_name(self) -> str:
        """Return the name under which this operator is referenced."""
        return "entity_counter_deanonymizer"

    def operator_type(self) -> OperatorType:
        """Return the operator category (a de-anonymizing operator)."""
        return OperatorType.Deanonymize
# Build the de-anonymization engine and register the custom operator class
deanonymizer_engine = DeanonymizeEngine()
deanonymizer_engine.add_deanonymizer(InstanceCounterDeanonymizer)

# Reverse the substitution using the same mapping that anonymization filled in
deanonymized = deanonymizer_engine.deanonymize(
    text=anonymized_result.text,
    entities=anonymized_result.items,
    operators={
        "DEFAULT": OperatorConfig(
            "entity_counter_deanonymizer",
            params={"entity_mapping": entity_mapping},
        )
    },
)

print("anonymized text:")
pprint(anonymized_result.text)

print("de-anonymized text:")
pprint(deanonymized.text)
anonymized text: ('<PERSON_1> gave his book to <PERSON_2> which later gave it to <PERSON_0>. ' '<PERSON_1> lives in <LOCATION_1> and <PERSON_0> lives in <LOCATION_0>.') de-anonymized text: ('Peter gave his book to Heidi which later gave it to Nicole. Peter lives in ' 'London and Nicole lives in Tashkent.')