Presidio Anonymizer API Reference
Anonymizer root module.
anonymizer_engine
Handles the entire logic of the Presidio-anonymizer and text anonymizing.
AnonymizerEngine (EngineBase)
AnonymizerEngine class.
Handles the entire logic of the Presidio-anonymizer. Gets the original text and replaces the PII entities with the desired anonymizers.
Source code in presidio_anonymizer/anonymizer_engine.py
class AnonymizerEngine(EngineBase):
    """
    AnonymizerEngine class.

    Handles the entire logic of the Presidio-anonymizer. Gets the original text
    and replaces the PII entities with the desired anonymizers.
    """

    # Class-level logger; EngineBase.__init__ also sets an instance-level
    # ``self.logger`` obtained under the same logger name.
    logger = logging.getLogger("presidio-anonymizer")

    def __init__(self):
        EngineBase.__init__(self)

    def anonymize(
        self,
        text: str,
        analyzer_results: List[RecognizerResult],
        operators: Optional[Dict[str, OperatorConfig]] = None,
    ) -> EngineResult:
        """Anonymize method to anonymize the given text.

        :param text: the text we are anonymizing
        :param analyzer_results: A list of RecognizerResult class -> The results we
        received from the analyzer
        :param operators: The configuration of the anonymizers we would like
        to use for each entity e.g.: {"PHONE_NUMBER":OperatorConfig("redact", {})}
        When omitted or empty, only a "DEFAULT" operator is used.
        :return: the anonymized text and a list of information about the
        anonymized entities.

        :example:

        >>> from presidio_anonymizer import AnonymizerEngine
        >>> from presidio_anonymizer.entities import RecognizerResult, OperatorConfig

        >>> # Initialize the engine with logger.
        >>> engine = AnonymizerEngine()

        >>> # Invoke the anonymize function with the text, analyzer results and
        >>> # Operators to define the anonymization type.
        >>> result = engine.anonymize(
        >>>     text="My name is Bond, James Bond",
        >>>     analyzer_results=[RecognizerResult(entity_type="PERSON",
        >>>                                        start=11,
        >>>                                        end=15,
        >>>                                        score=0.8),
        >>>                       RecognizerResult(entity_type="PERSON",
        >>>                                        start=17,
        >>>                                        end=27,
        >>>                                        score=0.8)],
        >>>     operators={"PERSON": OperatorConfig("replace", {"new_value": "BIP"})}
        >>> )

        >>> print(result)
        text: My name is BIP, BIP
        items:
        [
            {'start': 16, 'end': 19, 'entity_type': 'PERSON',
             'text': 'BIP', 'operator': 'replace'},
            {'start': 11, 'end': 14, 'entity_type': 'PERSON',
             'text': 'BIP', 'operator': 'replace'}
        ]
        """
        # Step 1: merge/drop overlapping analyzer results.
        analyzer_results = self._remove_conflicts_and_get_text_manipulation_data(
            analyzer_results
        )

        # Step 2: make sure a "DEFAULT" operator is always available.
        operators = self.__check_or_add_default_operator(operators)

        # Step 3: delegate the actual text manipulation to the base engine.
        return self._operate(text, analyzer_results, operators, OperatorType.Anonymize)

    def _remove_conflicts_and_get_text_manipulation_data(
        self, analyzer_results: List[RecognizerResult]
    ) -> List[RecognizerResult]:
        """
        Iterate the list and create a unique, conflict-free results list from it.

        The work is done in two passes:

        1. Results of the same entity type whose spans are reported as
           intersecting (``intersects`` returns non-zero) are merged: the
           current result is dropped and the other result is widened in place
           to cover both spans and takes the higher score.
        2. From what is left, a result is dropped when ``has_conflict`` reports
           a conflict with any result still under consideration.

        NOTE: pass 1 mutates ``start``/``end``/``score`` of the RecognizerResult
        objects that were passed in. No sorting is performed in this method.

        :param analyzer_results: the raw results received from the analyzer
        :return: List of the surviving results, in their original relative order
        """
        tmp_analyzer_results = []
        # This list contains all elements which we need to check a single result
        # against. If a result is dropped, it can also be dropped from this list
        # since it is intersecting with another result and we selected the other one.
        other_elements = analyzer_results.copy()
        for result in analyzer_results:
            # Take the current result out so it is not compared with itself.
            other_elements.remove(result)

            is_merge_same_entity_type = False
            for other_element in other_elements:
                if other_element.entity_type != result.entity_type:
                    continue
                if result.intersects(other_element) == 0:
                    continue

                # Same type and intersecting: fold ``result`` into ``other_element``.
                other_element.start = min(result.start, other_element.start)
                other_element.end = max(result.end, other_element.end)
                other_element.score = max(result.score, other_element.score)
                is_merge_same_entity_type = True
                break
            if not is_merge_same_entity_type:
                # Not merged: keep it and make it visible to later comparisons.
                other_elements.append(result)
                tmp_analyzer_results.append(result)
            else:
                self.logger.debug(f"removing element {result} from "
                                  f"results list due to merge")

        unique_text_metadata_elements = []
        # Second pass, same bookkeeping as above: ``other_elements`` holds every
        # result the current one must be checked against. A dropped result is
        # not put back, so later results are not compared against it.
        other_elements = tmp_analyzer_results.copy()
        for result in tmp_analyzer_results:
            other_elements.remove(result)
            result_conflicted = self.__is_result_conflicted_with_other_elements(
                other_elements, result
            )
            if not result_conflicted:
                other_elements.append(result)
                unique_text_metadata_elements.append(result)
            else:
                self.logger.debug(
                    f"removing element {result} from results list due to conflict"
                )
        return unique_text_metadata_elements

    def get_anonymizers(self) -> List[str]:
        """Return a list of supported anonymizers."""
        # The names are the keys of the mapping returned by the operators factory.
        names = [p for p in self.operators_factory.get_anonymizers().keys()]
        return names

    @staticmethod
    def __is_result_conflicted_with_other_elements(other_elements, result):
        # True when ``result.has_conflict`` holds against at least one element.
        return any(
            [result.has_conflict(other_element) for other_element in other_elements]
        )

    @staticmethod
    def __check_or_add_default_operator(
        operators: Dict[str, OperatorConfig]
    ) -> Dict[str, OperatorConfig]:
        # Guarantees the returned mapping has a truthy "DEFAULT" entry.
        # NOTE: when ``operators`` is non-empty and lacks one, the entry is
        # added to the caller's dict in place.
        default_operator = OperatorConfig(DEFAULT)
        if not operators:
            return {"DEFAULT": default_operator}
        if not operators.get("DEFAULT"):
            operators["DEFAULT"] = default_operator
        return operators
anonymize(self, text, analyzer_results, operators=None)
Anonymize method to anonymize the given text.
:param text: the text we are anonymizing :param analyzer_results: A list of RecognizerResult class -> The results we received from the analyzer :param operators: The configuration of the anonymizers we would like to use for each entity e.g.: {"PHONE_NUMBER":OperatorConfig("redact", {})} :return: the anonymized text and a list of information about the anonymized entities.
:example:
from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
Initialize the engine with logger.
engine = AnonymizerEngine()
Invoke the anonymize function with the text, analyzer results and
Operators to define the anonymization type.
result = engine.anonymize( text="My name is Bond, James Bond", analyzer_results=[RecognizerResult(entity_type="PERSON", start=11, end=15, score=0.8), RecognizerResult(entity_type="PERSON", start=17, end=27, score=0.8)], operators={"PERSON": OperatorConfig("replace", {"new_value": "BIP"})} )
print(result) text: My name is BIP, BIP. items: [ {'start': 16, 'end': 19, 'entity_type': 'PERSON', 'text': 'BIP', 'operator': 'replace'}, {'start': 11, 'end': 14, 'entity_type': 'PERSON', 'text': 'BIP', 'operator': 'replace'} ]
Source code in presidio_anonymizer/anonymizer_engine.py
def anonymize(
    self,
    text: str,
    analyzer_results: List[RecognizerResult],
    operators: Optional[Dict[str, OperatorConfig]] = None,
) -> EngineResult:
    """Replace the detected PII entities in ``text`` using the given operators.

    :param text: the text we are anonymizing
    :param analyzer_results: the list of RecognizerResult objects received
    from the analyzer
    :param operators: maps an entity type to the operator configuration to
    apply to it, e.g.: {"PHONE_NUMBER":OperatorConfig("redact", {})}
    :return: the anonymized text and a list of information about the
    anonymized entities.

    :example:

    >>> from presidio_anonymizer import AnonymizerEngine
    >>> from presidio_anonymizer.entities import RecognizerResult, OperatorConfig

    >>> engine = AnonymizerEngine()

    >>> result = engine.anonymize(
    >>>     text="My name is Bond, James Bond",
    >>>     analyzer_results=[RecognizerResult(entity_type="PERSON",
    >>>                                        start=11,
    >>>                                        end=15,
    >>>                                        score=0.8),
    >>>                       RecognizerResult(entity_type="PERSON",
    >>>                                        start=17,
    >>>                                        end=27,
    >>>                                        score=0.8)],
    >>>     operators={"PERSON": OperatorConfig("replace", {"new_value": "BIP"})}
    >>> )

    >>> print(result)
    text: My name is BIP, BIP
    items:
    [
        {'start': 16, 'end': 19, 'entity_type': 'PERSON',
         'text': 'BIP', 'operator': 'replace'},
        {'start': 11, 'end': 14, 'entity_type': 'PERSON',
         'text': 'BIP', 'operator': 'replace'}
    ]
    """
    # Resolve overlapping results first, then make sure a default operator
    # is present, and finally hand everything to the shared engine routine.
    conflict_free_results = self._remove_conflicts_and_get_text_manipulation_data(
        analyzer_results
    )
    effective_operators = self.__check_or_add_default_operator(operators)
    return self._operate(
        text,
        conflict_free_results,
        effective_operators,
        OperatorType.Anonymize,
    )
get_anonymizers(self)
Return a list of supported anonymizers.
Source code in presidio_anonymizer/anonymizer_engine.py
def get_anonymizers(self) -> List[str]:
    """List the names of the anonymizers known to the operators factory."""
    available = self.operators_factory.get_anonymizers()
    return list(available.keys())
core
special
The core text functionality.
engine_base
Handle the entire text operations using the operators.
EngineBase (ABC)
Handle the logic of operations over the text using the operators.
Source code in presidio_anonymizer/core/engine_base.py
class EngineBase(ABC):
    """Shared logic for applying operators to the entities found in a text."""

    def __init__(self):
        self.logger = logging.getLogger("presidio-anonymizer")
        self.operators_factory = OperatorsFactory()

    def _operate(
        self,
        text: str,
        pii_entities: List[PIIEntity],
        operators_metadata: Dict[str, OperatorConfig],
        operator_type: OperatorType,
    ) -> EngineResult:
        """
        Apply the requested operators to every entity of the text.

        :param text: the text we need to operate on.
        :param pii_entities: data about the text entities we want to operate over.
        :param operators_metadata: dictionary where the key is the entity_type and
        the value is the operator configuration to apply to that entity_type.
        :param operator_type: either anonymize or deanonymize
        :return: EngineResult with the new text and one item per entity.
        """
        text_replace_builder = TextReplaceBuilder(original_text=text)
        engine_result = EngineResult()

        # Entities are handled in descending order so that replacing one span
        # does not shift the indices of the spans still to be processed.
        for operator in sorted(pii_entities, reverse=True):
            text_to_operate_on = text_replace_builder.get_text_in_position(
                operator.start, operator.end
            )
            self.logger.debug(f"performing operation {operator}")

            operator_metadata = self.__get_entity_operator_metadata(
                operator.entity_type, operators_metadata
            )
            changed_text = self.__operate_on_text(
                operator, text_to_operate_on, operator_metadata, operator_type
            )
            index_from_end = text_replace_builder.replace_text_get_insertion_index(
                changed_text, operator.start, operator.end
            )

            # Items are collected end-to-start with an index measured from the
            # end of the text; they are converted to start-based indexes after
            # the loop, once the final text length is known.
            engine_result.add_item(
                OperatorResult(
                    0,
                    index_from_end,
                    operator.entity_type,
                    changed_text,
                    operator_metadata.operator_name,
                )
            )

        engine_result.set_text(text_replace_builder.output_text)
        engine_result.normalize_item_indexes()
        return engine_result

    def __operate_on_text(
        self,
        text_metadata: PIIEntity,
        text_to_operate_on: str,
        operator_metadata: OperatorConfig,
        operator_type: OperatorType,
    ) -> str:
        # Build the operator, validate its params, then run it on the text.
        entity_type = text_metadata.entity_type

        self.logger.debug(f"getting operator for {entity_type}")
        operator = self.operators_factory.create_operator_class(
            operator_metadata.operator_name, operator_type
        )

        self.logger.debug(f"validating operator {operator} for {entity_type}")
        operator.validate(params=operator_metadata.params)

        # The entity type is injected into the config's own params dict only
        # after validation, so the operator receives it alongside user params.
        operator_metadata.params["entity_type"] = entity_type

        self.logger.debug(f"operating on {entity_type} with {operator}")
        return operator.operate(
            params=operator_metadata.params, text=text_to_operate_on
        )

    @staticmethod
    def __get_entity_operator_metadata(
        entity_type: str, operators_metadata: Dict = None
    ) -> OperatorConfig:
        # Prefer the operator registered for this entity type; fall back to
        # the "DEFAULT" entry when there is none.
        if operators_metadata is None:
            operators_metadata = {}
        return operators_metadata.get(entity_type) or operators_metadata.get(
            "DEFAULT"
        )
text_replace_builder
Handles the original text and creates a new one according to change requests.
TextReplaceBuilder
Creates new text according to the user's request.
Source code in presidio_anonymizer/core/text_replace_builder.py
class TextReplaceBuilder:
    """Builds a new text by replacing spans of the original, end to start."""

    def __init__(self, original_text: str):
        self.logger = logging.getLogger("presidio-anonymizer")
        self.original_text = original_text
        self.output_text = original_text
        self.text_len = len(original_text)
        # Start index of the most recent replacement; initially the text end.
        self.last_replacement_index = self.text_len

    def get_text_in_position(self, start: int, end: int) -> str:
        """
        Return the slice of the original text between two positions.

        :param start: start position of inner text
        :param end: end position of inner text
        :return: str - part of the original text
        """
        self.__validate_position_in_text(start, end)
        fragment = self.original_text[start:end]
        return fragment

    def replace_text_get_insertion_index(
        self, replacement_text: str, start: int, end: int
    ) -> int:
        """
        Substitute the span ``[start, end)`` of the output text.

        :param replacement_text: new text to replace the old text according to indices
        :param start: the startpoint to replace the text
        :param end: the endpoint to replace the text
        :return: The index of inserted text, counted from the end of the text
        """
        # Never reach past the start of the previous replacement.
        cut_to = min(end, self.last_replacement_index)
        self.last_replacement_index = start

        prefix = self.output_text[:start]
        suffix = self.output_text[cut_to:]
        self.output_text = "".join((prefix, replacement_text, suffix))

        # Replacements run from end to start, so the insertion point is
        # reported as a distance from the end of the text.
        return len(suffix) + len(replacement_text)

    def __validate_position_in_text(self, start: int, end: int):
        """Validate the start and end position match the text length."""
        if start <= self.text_len and end <= self.text_len:
            return
        err_msg = (
            f"Invalid analyzer result, start: {start} and end: "
            f"{end}, while text length is only {self.text_len}."
        )
        raise InvalidParamException(err_msg)
get_text_in_position(self, start, end)
Get part of the text inside the original text.
:param start: start position of inner text :param end: end position of inner text :return: str - part of the original text
Source code in presidio_anonymizer/core/text_replace_builder.py
def get_text_in_position(self, start: int, end: int) -> str:
    """
    Return the slice of the original text between two positions.

    :param start: start position of inner text
    :param end: end position of inner text
    :return: str - part of the original text
    """
    # The positions are checked before the original text is sliced.
    self.__validate_position_in_text(start, end)
    fragment = self.original_text[start:end]
    return fragment
replace_text_get_insertion_index(self, replacement_text, start, end)
Replace text in a specific position with the text.
:param replacement_text: new text to replace the old text according to indices :param start: the startpoint to replace the text :param end: the endpoint to replace the text :return: The index of inserted text
Source code in presidio_anonymizer/core/text_replace_builder.py
def replace_text_get_insertion_index(
    self, replacement_text: str, start: int, end: int
) -> int:
    """
    Substitute the span ``[start, end)`` of the output text.

    :param replacement_text: new text to replace the old text according to indices
    :param start: the startpoint to replace the text
    :param end: the endpoint to replace the text
    :return: The index of inserted text, counted from the end of the text
    """
    # Never reach past the start of the previous replacement.
    cut_to = min(end, self.last_replacement_index)
    self.last_replacement_index = start

    prefix = self.output_text[:start]
    suffix = self.output_text[cut_to:]
    self.output_text = "".join((prefix, replacement_text, suffix))

    # Replacements run from end to start, so the insertion point is
    # reported as a distance from the end of the text.
    return len(suffix) + len(replacement_text)
deanonymize_engine
Deanonymize anonymized text by using deanonymize operators.
DeanonymizeEngine (EngineBase)
Deanonymize text that was previously anonymized.
Source code in presidio_anonymizer/deanonymize_engine.py
class DeanonymizeEngine(EngineBase):
    """Engine that reverses a previous anonymization of a text."""

    def __init__(self):
        self.logger = logging.getLogger("presidio-anonymizer")
        EngineBase.__init__(self)

    def deanonymize(
        self,
        text: str,
        entities: List[OperatorResult],
        operators: Dict[str, OperatorConfig],
    ) -> EngineResult:
        """
        Run the deanonymize operators over the given entities of the text.

        :param operators: the operators to apply on the anonymizer result entities
        :param text: the full text with the encrypted entities
        :param entities: list of encrypted entities
        :return: EngineResult - the new text and data about the deanonymized entities.
        """
        # All the work is done by the shared engine routine in deanonymize mode.
        outcome = self._operate(text, entities, operators, OperatorType.Deanonymize)
        return outcome

    def get_deanonymizers(self) -> List[str]:
        """List the names of the deanonymizers known to the operators factory."""
        available = self.operators_factory.get_deanonymizers()
        return list(available.keys())
deanonymize(self, text, entities, operators)
Receive the text, entities and operators to perform deanonymization over.
:param operators: the operators to apply on the anonymizer result entities :param text: the full text with the encrypted entities :param entities: list of encrypted entities :return: EngineResult - the new text and data about the deanonymized entities.
Source code in presidio_anonymizer/deanonymize_engine.py
def deanonymize(
    self,
    text: str,
    entities: List[OperatorResult],
    operators: Dict[str, OperatorConfig],
) -> EngineResult:
    """
    Run the deanonymize operators over the given entities of the text.

    :param operators: the operators to apply on the anonymizer result entities
    :param text: the full text with the encrypted entities
    :param entities: list of encrypted entities
    :return: EngineResult - the new text and data about the deanonymized entities.
    """
    # All the work is done by the shared engine routine in deanonymize mode.
    outcome = self._operate(text, entities, operators, OperatorType.Deanonymize)
    return outcome
get_deanonymizers(self)
Return a list of supported deanonymizers.
Source code in presidio_anonymizer/deanonymize_engine.py
def get_deanonymizers(self) -> List[str]:
    """List the names of the deanonymizers known to the operators factory."""
    available = self.operators_factory.get_deanonymizers()
    return list(available.keys())
entities
special
Handles all the entities objects (structs) of the anonymizer.
engine
special
Engine request entities.
operator_config
OperatorConfig
Hold the data of the required operator.
Source code in presidio_anonymizer/entities/engine/operator_config.py
class OperatorConfig:
    """Hold the data of the required operator."""

    def __init__(self, operator_name: str, params: Dict = None):
        """
        Create an operator config instance.

        :param operator_name: the name of the operator we want to work with
        :param params: the parameters the operator needs in order to work;
        a missing or empty value is replaced by a new empty dict
        """
        self.logger = logging.getLogger("presidio-anonymizer")
        self.operator_name = operator_name
        if not params:
            params = {}
        self.params = params
        self.__validate_fields()

    def __repr__(self):
        """Return a string representation of the object."""
        return f"operator_name: {self.operator_name}, params: {self.params}"

    @classmethod
    def from_json(cls, params: Dict) -> "OperatorConfig":
        """
        Create OperatorConfig from json.

        The input dictionary is not modified: the operator name is taken from
        its "type" entry and the remaining entries are copied into the params
        of the new instance.

        :param params: json e.g.: {
            "type": "mask",
            "masking_char": "*",
            "chars_to_mask": 4,
            "from_end": true
            }
        :return: OperatorConfig
        """
        # Work on a shallow copy so the caller's dict keeps its "type" key.
        operator_params = dict(params)
        operator_name = operator_params.get("type")
        if operator_name:
            # Only a truthy "type" is stripped, matching the prior behaviour.
            del operator_params["type"]
        return cls(operator_name, operator_params)

    def __eq__(self, other: "OperatorConfig"):
        """Verify two OperatorConfigs are equal."""
        operator_name = self.operator_name == other.operator_name
        return self.params == other.params and operator_name

    def __validate_fields(self):
        # The operator name is mandatory; the helper is expected to raise
        # when it is empty (its implementation is not shown here).
        validate_parameter_not_empty(
            self.operator_name, "operator config", "operator_name"
        )
__eq__(self, other)
special
Verify two OperatorConfigs are equal.
Source code in presidio_anonymizer/entities/engine/operator_config.py
def __eq__(self, other: "OperatorConfig"):
    """Tell whether both configs share the same params and operator name."""
    same_name = self.operator_name == other.operator_name
    same_params = self.params == other.params
    return same_params and same_name
__init__(self, operator_name, params=None)
special
Create an operator config instance.
:param operator_name: the name of the operator we want to work with :param params: the parameters the operator needs in order to work
Source code in presidio_anonymizer/entities/engine/operator_config.py
def __init__(self, operator_name: str, params: Dict = None):
    """
    Build an operator configuration.

    :param operator_name: the name of the operator we want to work with
    :param params: the parameters the operator needs in order to work;
    a missing or empty value is replaced by a new empty dict
    """
    self.logger = logging.getLogger("presidio-anonymizer")
    self.operator_name = operator_name
    self.params = params if params else {}
    # Validation runs last, once every field has been assigned.
    self.__validate_fields()
__repr__(self)
special
Return a string representation of the object.
Source code in presidio_anonymizer/entities/engine/operator_config.py
def __repr__(self):
    """Describe the config by its operator name and params."""
    name, params = self.operator_name, self.params
    return f"operator_name: {name}, params: {params}"
from_json(params)
classmethod
Create OperatorConfig from json.
:param params: json e.g.: { "type": "mask", "masking_char": "*", "chars_to_mask": 4, "from_end": true } :return: OperatorConfig
Source code in presidio_anonymizer/entities/engine/operator_config.py
@classmethod
def from_json(cls, params: Dict) -> "OperatorConfig":
    """
    Create OperatorConfig from json.

    The input dictionary is not modified: the operator name is taken from
    its "type" entry and the remaining entries are copied into the params
    of the new instance.

    :param params: json e.g.: {
        "type": "mask",
        "masking_char": "*",
        "chars_to_mask": 4,
        "from_end": true
        }
    :return: OperatorConfig
    """
    # Work on a shallow copy so the caller's dict keeps its "type" key.
    operator_params = dict(params)
    operator_name = operator_params.get("type")
    if operator_name:
        # Only a truthy "type" is stripped, matching the prior behaviour.
        del operator_params["type"]
    return cls(operator_name, operator_params)
pii_entity
PIIEntity (ABC)
Abstract class to hold the metadata of the text we are going to operate on.
Source code in presidio_anonymizer/entities/engine/pii_entity.py
class PIIEntity(ABC):
    """Abstract class to hold the metadata of the text we are going to operate on."""

    logger = logging.getLogger("presidio-anonymizer")

    def __init__(self, start: int, end: int, entity_type: str):
        """
        Create the entity and validate its fields.

        :param start: start index of the entity in the text
        :param end: end index of the entity in the text
        :param entity_type: the type of the entity
        :raises InvalidParamException: when an index is negative or start > end
        """
        self.start = start
        self.end = end
        self.entity_type = entity_type
        self.__validate_fields()

    def __repr__(self):
        """Return a string representation of the object."""
        # Each field is separated by ", " so the fields do not run together.
        return (
            f"start: {self.start}, "
            f"end: {self.end}, "
            f"entity_type: {self.entity_type}"
        )

    def __gt__(self, other):
        """Check one entity is greater than the other by the text start index."""
        return self.start > other.start

    def __eq__(self, other):
        """Check two text metadata entities are equal."""
        return (
            self.start == other.start
            and self.end == other.end
            and self.entity_type == other.entity_type
        )

    def __validate_fields(self):
        # Presence/type checks are delegated to helpers defined elsewhere.
        validate_parameter_exists(self.start, "result", "start")
        validate_type(self.start, "start", int)
        validate_parameter_exists(self.end, "result", "end")
        validate_type(self.end, "end", int)
        validate_parameter_not_empty(self.entity_type, "result", "entity_type")
        # Range checks: no negative index, and start may not exceed end.
        if self.start < 0 or self.end < 0:
            raise InvalidParamException(
                "Invalid input, result start and end must be positive"
            )
        if self.start > self.end:
            raise InvalidParamException(
                f"Invalid input, start index '{self.start}' "
                f"must be smaller than end index '{self.end}'"
            )
__eq__(self, other)
special
Check two text metadata entities are equal.
Source code in presidio_anonymizer/entities/engine/pii_entity.py
def __eq__(self, other):
    """Tell whether both entities have the same span and entity type."""
    if self.start != other.start:
        return False
    if self.end != other.end:
        return False
    return self.entity_type == other.entity_type
__gt__(self, other)
special
Check one entity is greater than the other by the text start index.
Source code in presidio_anonymizer/entities/engine/pii_entity.py
def __gt__(self, other):
    """Check one entity is greater than the other by the text start index."""
    starts_later = self.start > other.start
    return starts_later
__repr__(self)
special
Return a string representation of the object.
Source code in presidio_anonymizer/entities/engine/pii_entity.py
def __repr__(self):
    """Return a string representation of the object."""
    # Each field is separated by ", " so the fields do not run together.
    return (
        f"start: {self.start}, "
        f"end: {self.end}, "
        f"entity_type: {self.entity_type}"
    )
recognizer_result
RecognizerResult is an exact copy of the RecognizerResult object from presidio-analyzer.
Represents the findings of detected entity.
RecognizerResult (PIIEntity)
Recognizer Result represents the findings of the detected entity.
Result of a recognizer analyzing the text.
:param entity_type: the type of the entity :param start: the start location of the detected entity :param end: the end location of the detected entity :param score: the score of the detection
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
class RecognizerResult(PIIEntity):
    """
    A single entity detection produced by a recognizer analyzing the text.

    :param entity_type: the type of the entity
    :param start: the start location of the detected entity
    :param end: the end location of the detected entity
    :param score: the score of the detection
    """

    logger = logging.getLogger("presidio-anonymizer")

    def __init__(self, entity_type: str, start: int, end: int, score: float):
        PIIEntity.__init__(self, start, end, entity_type)
        self.score = score
        validate_parameter_exists(score, "analyzer result", "score")

    @classmethod
    def from_json(cls, data: Dict):
        """
        Build a RecognizerResult from a json-like dictionary.

        :param data: e.g. {
            "start": 24,
            "end": 32,
            "score": 0.8,
            "entity_type": "NAME"
        }
        :return: RecognizerResult
        """
        # Missing keys come through as None.
        return cls(
            data.get("entity_type"),
            data.get("start"),
            data.get("end"),
            data.get("score"),
        )

    def __gt__(self, other):
        """
        Order results by start index, using the end index to break ties.

        :param other: another RecognizerResult
        :return: bool
        """
        same_start = self.start == other.start
        return self.end > other.end if same_start else self.start > other.start

    def __eq__(self, other):
        """
        Compare two results on indices, entity type and score.

        :param other: another RecognizerResult
        :return: bool
        """
        equal_type = self.entity_type == other.entity_type
        equal_score = self.score == other.score
        if not self.equal_indices(other):
            return False
        return equal_type and equal_score

    def __hash__(self):
        """
        Hash the result using all of its fields.

        :return: int
        """
        fingerprint = (
            f"{str(self.start)} {str(self.end)} {str(self.score)} {self.entity_type}"
        )
        return hash(fingerprint)

    def __str__(self) -> str:
        """Describe the instance by type, indices and score."""
        fields = (
            f"type: {self.entity_type}",
            f"start: {self.start}",
            f"end: {self.end}",
            f"score: {self.score}",
        )
        return ", ".join(fields)

    def has_conflict(self, other):
        """
        Tell whether this result loses to ``other``.

        This result is in conflict when:
        1. Both have the same indices and this score is lower or equal.
        2. The indices differ and ``other`` contains this result.

        :param other: RecognizerResult
        :return: bool
        """
        if not self.equal_indices(other):
            return other.contains(self)
        return self.score <= other.score

    def contains(self, other):
        """
        Tell whether ``other`` lies within (or equals) this result's span.

        :param other: another RecognizerResult
        :return: bool
        """
        return other.start >= self.start and other.end <= self.end

    def equal_indices(self, other):
        """
        Tell whether both results cover exactly the same span.

        :param other: another RecognizerResult
        :return: bool
        """
        if self.start != other.start:
            return False
        return self.end == other.end

    def intersects(self, other) -> int:
        """
        Measure the overlap between this result and another one.

        :return: the number of characters shared by both spans,
        or 0 when they do not overlap
        """
        disjoint = self.end < other.start or other.end < self.start
        if disjoint:
            return 0
        # Overlap length is the smaller end minus the larger start.
        overlap_start = max(self.start, other.start)
        overlap_end = min(self.end, other.end)
        return overlap_end - overlap_start
__eq__(self, other)
special
Check two results are equal by using all class fields.
:param other: another RecognizerResult :return: bool
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def __eq__(self, other):
    """
    Compare two results on indices, entity type and score.

    :param other: another RecognizerResult
    :return: bool
    """
    equal_type = self.entity_type == other.entity_type
    equal_score = self.score == other.score
    if not self.equal_indices(other):
        return False
    return equal_type and equal_score
__gt__(self, other)
special
Check if one result is greater by using the results indices in the text.
:param other: another RecognizerResult :return: bool
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def __gt__(self, other):
    """
    Order results by start index, using the end index to break ties.

    :param other: another RecognizerResult
    :return: bool
    """
    same_start = self.start == other.start
    return self.end > other.end if same_start else self.start > other.start
__hash__(self)
special
Hash the result data by using all class fields.
:return: int
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def __hash__(self):
    """
    Hash the result using all of its fields.

    :return: int
    """
    fingerprint = (
        f"{str(self.start)} {str(self.end)} {str(self.score)} {self.entity_type}"
    )
    return hash(fingerprint)
__str__(self)
special
Return a string representation of the instance.
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def __str__(self) -> str:
    """Describe the instance by type, indices and score."""
    fields = (
        f"type: {self.entity_type}",
        f"start: {self.start}",
        f"end: {self.end}",
        f"score: {self.score}",
    )
    return ", ".join(fields)
contains(self, other)
Check if one result is contained or equal to another result.
:param other: another RecognizerResult :return: bool
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def contains(self, other):
    """
    Tell whether ``other`` lies within (or equals) this result's span.

    :param other: another RecognizerResult
    :return: bool
    """
    starts_inside = self.start <= other.start
    return starts_inside and self.end >= other.end
equal_indices(self, other)
Check if the indices are equal between two results.
:param other: another RecognizerResult :return:
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def equal_indices(self, other):
    """
    Tell whether both results cover exactly the same span.

    :param other: another RecognizerResult
    :return: bool
    """
    if self.start != other.start:
        return False
    return self.end == other.end
from_json(data)
classmethod
Create RecognizerResult from json.
:param data: e.g. { "start": 24, "end": 32, "score": 0.8, "entity_type": "NAME" } :return: RecognizerResult
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
@classmethod
def from_json(cls, data: Dict):
    """
    Build a RecognizerResult from a json-like dictionary.

    :param data: e.g. {
        "start": 24,
        "end": 32,
        "score": 0.8,
        "entity_type": "NAME"
    }
    :return: RecognizerResult
    """
    # Missing keys come through as None.
    return cls(
        data.get("entity_type"),
        data.get("start"),
        data.get("end"),
        data.get("score"),
    )
has_conflict(self, other)
Check if two recognizer results are conflicted or not.
I have a conflict if: 1. My indices are the same as the other and my score is lower or equal. 2. If my indices are contained in another.
:param other: RecognizerResult :return:
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def has_conflict(self, other):
    """
    Tell whether this result loses to ``other``.

    This result is in conflict when:
    1. Both have the same indices and this score is lower or equal.
    2. The indices differ and ``other`` contains this result.

    :param other: RecognizerResult
    :return: bool
    """
    if not self.equal_indices(other):
        return other.contains(self)
    return self.score <= other.score
intersects(self, other)
Check if self intersects with a different RecognizerResult.
:return: If intersecting, returns the number of intersecting characters. If not, returns 0
Source code in presidio_anonymizer/entities/engine/recognizer_result.py
def intersects(self, other) -> int:
    """
    Measure the overlap between this result and another one.

    :return: the number of characters shared by both spans,
    or 0 when they do not overlap
    """
    disjoint = self.end < other.start or other.end < self.start
    if disjoint:
        return 0
    # Overlap length is the smaller end minus the larger start.
    overlap_start = max(self.start, other.start)
    overlap_end = min(self.end, other.end)
    return overlap_end - overlap_start
result
special
Engine result items either for anonymize or decrypt.
engine_result
Handle a serializable anonymizer result.
EngineResult
Engine result.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
class EngineResult:
    """Engine result."""

    def __init__(self, text: str = None, items: List[OperatorResult] = None):
        """Create EngineResult entity.

        :param text: The anonymized text.
        :param items: List of PII entities and the indices
        of their replacements in the anonymized text.
        """
        # A fresh list per instance avoids sharing a mutable default.
        if items is None:
            items = []

        self.text = text
        self.items = items

    def set_text(self, text: str):
        """Set a text."""
        self.text = text

    def add_item(self, item: OperatorResult):
        """Add an item.

        :param item: an item to add to the list.
        """
        self.items.append(item)

    def normalize_item_indexes(self):
        """Normalize the indexes to be index from start."""
        text_len = len(self.text)
        for result_item in self.items:
            # On entry ``end`` holds a distance measured from the end of the
            # text; turn it into a start index and derive the real end from
            # the length of the item's text.
            result_item.start = text_len - result_item.end
            result_item.end = result_item.start + len(result_item.text)

    def to_json(self) -> str:
        """Return a json string serializing this instance."""
        # Any object json cannot serialize natively is dumped via its __dict__.
        return json.dumps(self, default=lambda x: x.__dict__)

    def __repr__(self):
        """Return a string representation of the object."""
        items_repr = (
            ",\n    ".join([str(item) for item in self.items]) if self.items else ""
        )
        return f"text: {self.text}\nitems:\n[\n    {items_repr}\n]\n"

    def __eq__(self, other) -> bool:
        """Verify two instances are equal.

        Returns true if the texts are equal and both hold the same number of
        items that compare equal pairwise, false otherwise.
        """
        if self.text != other.text:
            return False
        # Compare the lengths explicitly: a pairwise walk alone stops at the
        # shorter list and would treat a strict prefix as equal.
        if len(self.items) != len(other.items):
            return False
        return all(mine == theirs for mine, theirs in zip(self.items, other.items))
__eq__(self, other)
special
Verify two instances are equal.
Returns true if the two instances are equal, false otherwise.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
def __eq__(self, other) -> bool:
    """Verify two instances are equal.

    Two results are equal when their texts match and their item lists
    have the same length with pairwise-equal elements.

    :param other: the EngineResult to compare with
    :return: true if the two instances are equal, false otherwise.
    """
    if self.text != other.text:
        return False
    # A pairwise walk alone stops at the shorter list and would treat a
    # prefix as equal, so the lengths have to be compared explicitly.
    if len(self.items) != len(other.items):
        return False
    return all(mine == theirs for mine, theirs in zip(self.items, other.items))
__init__(self, text=None, items=None)
special
Create EngineResult entity.
:param text: The anonymized text. :param items: List of PII entities and the indices of their replacements in the anonymized text.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
def __init__(self, text: str = None, items: List[OperatorResult] = None):
    """Initialise the result with its text and operator results.

    :param text: The anonymized text.
    :param items: List of PII entities and the indices
    of their replacements in the anonymized text. When omitted, a new
    empty list is created for this instance.
    """
    self.text = text
    self.items = [] if items is None else items
__repr__(self)
special
Return a string representation of the object.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
def __repr__(self):
    """Render the text followed by a bracketed listing of the items."""
    if self.items:
        rendered_items = ",\n ".join(map(str, self.items))
    else:
        rendered_items = ""
    return f"text: {self.text}\nitems:\n[\n {rendered_items}\n]\n"
add_item(self, item)
Add an item.
:param item: an item to add to the list.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
def add_item(self, item: OperatorResult):
    """Append one operator result to this result's item list.

    :param item: an item to add to the list.
    """
    collected = self.items
    collected.append(item)
normalize_item_indexes(self)
Normalize the indexes to be index from start.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
def normalize_item_indexes(self):
    """Rewrite every item's indices relative to the start of the text.

    Each item's ``start`` becomes ``len(self.text) - item.end`` and its
    ``end`` becomes that new start plus ``len(item.text)``.
    """
    total_length = len(self.text)
    for entry in self.items:
        shifted_start = total_length - entry.end
        entry.start = shifted_start
        entry.end = shifted_start + len(entry.text)
set_text(self, text)
Set a text.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
def set_text(self, text: str):
    """Replace the text stored on this result.

    :param text: the new text value.
    """
    self.text = text
to_json(self)
Return a json string serializing this instance.
Source code in presidio_anonymizer/entities/engine/result/engine_result.py
def to_json(self) -> str:
    """Serialize this instance to a json string.

    Any object json cannot encode natively (this instance included) is
    emitted through its ``__dict__``.
    """

    def _attributes_of(obj):
        return obj.__dict__

    return json.dumps(self, default=_attributes_of)
operator_result
OperatorResult (PIIEntity)
A class to hold data for engines results either anonymize or deanonymize.
Source code in presidio_anonymizer/entities/engine/result/operator_result.py
class OperatorResult(PIIEntity):
    """Hold the outcome of one operator run (anonymize or deanonymize)."""

    def __init__(
        self,
        start: int,
        end: int,
        entity_type: str,
        text: str = None,
        operator: str = None,
    ):
        """Store the entity span and type plus the resulting text and operator.

        :param start: start index of the entity
        :param end: end index of the entity
        :param entity_type: type of the PII entity
        :param text: the text produced for this entity
        :param operator: name of the operator that produced it
        """
        PIIEntity.__init__(self, start, end, entity_type)
        self.operator = operator
        self.text = text

    def __repr__(self):
        """Return the string form of the dictionary given by ``to_dict``."""
        as_dict = self.to_dict()
        return str(as_dict)

    def to_dict(self) -> Dict:
        """Return the instance attribute dictionary (the live one, not a copy)."""
        return self.__dict__

    def __str__(self):
        """Return the string form of the dictionary given by ``to_dict``."""
        as_dict = self.to_dict()
        return str(as_dict)

    def __eq__(self, other: "OperatorResult") -> bool:
        """
        Compare two OperatorResults field by field.

        :param other: OperatorResult
        :return: bool
        """
        compared_fields = ("start", "end", "entity_type", "operator", "text")
        return all(
            getattr(self, name) == getattr(other, name) for name in compared_fields
        )

    @classmethod
    def from_json(cls, json: Dict) -> "OperatorResult":
        """
        Build an OperatorResult from a user-supplied dictionary.

        Keys other than the five read here are ignored; missing keys are
        passed on as ``None``.

        :param json: json representation for this operator result. For example:
        {
            "start": 0,
            "end": 10,
            "key": "1111111111111111",
            "entity_type":"PERSON",
            "text":"resulted_text",
            "operator":"encrypt",
        }
        """
        return cls(
            start=json.get("start"),
            end=json.get("end"),
            entity_type=json.get("entity_type"),
            text=json.get("text"),
            operator=json.get("operator"),
        )
__eq__(self, other)
special
Verify two OperatorResults are equal.
:param other: OperatorResult :return: bool
Source code in presidio_anonymizer/entities/engine/result/operator_result.py
def __eq__(self, other: "OperatorResult") -> bool:
    """
    Compare two OperatorResults field by field.

    :param other: OperatorResult
    :return: bool
    """
    compared_fields = ("start", "end", "entity_type", "operator", "text")
    return all(
        getattr(self, name) == getattr(other, name) for name in compared_fields
    )
__str__(self)
special
Return a string representation of the object.
Source code in presidio_anonymizer/entities/engine/result/operator_result.py
def __str__(self):
    """Return the string form of the dictionary given by ``to_dict``."""
    as_dict = self.to_dict()
    return str(as_dict)
from_json(json)
classmethod
Create OperatorResult from user json.
:param json: json representation for this operator result. For example: { "start": 0, "end": 10, "key": "1111111111111111", "entity_type":"PERSON", "text":"resulted_text", "operator":"encrypt", }
Source code in presidio_anonymizer/entities/engine/result/operator_result.py
@classmethod
def from_json(cls, json: Dict) -> "OperatorResult":
    """
    Build an OperatorResult from a user-supplied dictionary.

    Keys other than the five read here are ignored; missing keys are
    passed on as ``None``.

    :param json: json representation for this operator result. For example:
    {
        "start": 0,
        "end": 10,
        "key": "1111111111111111",
        "entity_type":"PERSON",
        "text":"resulted_text",
        "operator":"encrypt",
    }
    """
    return cls(
        start=json.get("start"),
        end=json.get("end"),
        entity_type=json.get("entity_type"),
        text=json.get("text"),
        operator=json.get("operator"),
    )
to_dict(self)
Return object as Dict.
Source code in presidio_anonymizer/entities/engine/result/operator_result.py
def to_dict(self) -> Dict:
    """Return the instance attribute dictionary (the live one, not a copy)."""
    attributes = self.__dict__
    return attributes
invalid_exception
Exception to indicate the request we received is invalid.
InvalidParamException (Exception)
Throw exception with error when user input is not valid.
param msg: Message to be added to the exception
Source code in presidio_anonymizer/entities/invalid_exception.py
class InvalidParamException(Exception):
    """Raised when user-supplied input is not valid.

    The message is kept on ``err_msg`` and is also passed to ``Exception``.

    param msg: Message to be added to the exception
    """

    def __init__(self, msg: str):
        super().__init__(msg)
        self.err_msg = msg
operators
special
Initializing all the existing anonymizers.
aes_cipher
AESCipher
Advanced Encryption Standard (aka Rijndael) en/decryption in CBC mode.
Source code in presidio_anonymizer/operators/aes_cipher.py
class AESCipher:
    """Advanced Encryption Standard (aka Rijndael) en/decryption in CBC mode."""

    @staticmethod
    def encrypt(key: bytes, text: str) -> str:
        """
        Encrypt a text with AES in CBC mode.

        The text is UTF-8 encoded and padded to the AES block size. A random
        IV of one block is generated and prepended to the ciphertext before
        the whole payload is base64 encoded.

        :param key: AES encryption key in bytes.
        :param text: The text for encryption.
        :returns: The encrypted text.
        """
        plaintext = pad(text.encode("utf-8"), AES.block_size)
        iv = Random.new().read(AES.block_size)
        ciphertext = AES.new(key, AES.MODE_CBC, iv).encrypt(plaintext)
        return base64.b64encode(iv + ciphertext).decode()

    @staticmethod
    def decrypt(key: bytes, text: str) -> str:
        """
        Decrypt a previously AES-CBC encrypted text.

        The input is base64 decoded; its first block is used as the IV and
        the remainder is decrypted, unpadded and decoded as UTF-8.

        :param key: AES encryption key in bytes.
        :param text: The text for decryption.
        :returns: The decrypted text.
        """
        payload = base64.b64decode(text)
        iv, ciphertext = payload[: AES.block_size], payload[AES.block_size :]
        padded_plaintext = AES.new(key, AES.MODE_CBC, iv).decrypt(ciphertext)
        return unpad(padded_plaintext, AES.block_size).decode("utf-8")

    @staticmethod
    def is_valid_key_size(key: bytes) -> bool:
        """
        Check that the key length is one AES accepts.

        :param key: AES encryption key in bytes.
        :returns: True if the key is of valid size, False otherwise.
        """
        key_length = len(key)
        return key_length in AES.key_size
decrypt(key, text)
staticmethod
Decrypts a previously AES-CBC encrypted text.
:param key: AES encryption key in bytes. :param text: The text for decryption. :returns: The decrypted text.
Source code in presidio_anonymizer/operators/aes_cipher.py
@staticmethod
def decrypt(key: bytes, text: str) -> str:
    """
    Decrypt a previously AES-CBC encrypted text.

    The input is base64 decoded; its first block is used as the IV and
    the remainder is decrypted, unpadded and decoded as UTF-8.

    :param key: AES encryption key in bytes.
    :param text: The text for decryption.
    :returns: The decrypted text.
    """
    payload = base64.b64decode(text)
    iv, ciphertext = payload[: AES.block_size], payload[AES.block_size :]
    padded_plaintext = AES.new(key, AES.MODE_CBC, iv).decrypt(ciphertext)
    return unpad(padded_plaintext, AES.block_size).decode("utf-8")
encrypt(key, text)
staticmethod
Encrypts a text using AES cypher in CBC mode.
Uses padding and random IV. :param key: AES encryption key in bytes. :param text: The text for encryption. :returns: The encrypted text.
Source code in presidio_anonymizer/operators/aes_cipher.py
@staticmethod
def encrypt(key: bytes, text: str) -> str:
    """
    Encrypt a text with AES in CBC mode.

    The text is UTF-8 encoded and padded to the AES block size. A random
    IV of one block is generated and prepended to the ciphertext before
    the whole payload is base64 encoded.

    :param key: AES encryption key in bytes.
    :param text: The text for encryption.
    :returns: The encrypted text.
    """
    plaintext = pad(text.encode("utf-8"), AES.block_size)
    iv = Random.new().read(AES.block_size)
    ciphertext = AES.new(key, AES.MODE_CBC, iv).encrypt(plaintext)
    return base64.b64encode(iv + ciphertext).decode()
is_valid_key_size(key)
staticmethod
Validate key size for AES.
:param key: AES encryption key in bytes. :returns: True if the key is of valid size, False otherwise.
Source code in presidio_anonymizer/operators/aes_cipher.py
@staticmethod
def is_valid_key_size(key: bytes) -> bool:
    """
    Check that the key length is one AES accepts.

    :param key: AES encryption key in bytes.
    :returns: True if the key is of valid size, False otherwise.
    """
    key_length = len(key)
    return key_length in AES.key_size
custom
Replaces the PII text with function result.
Custom (Operator)
Replace PII text entity with the results of a function executed on the PII text.
The function return type must be a string.
Source code in presidio_anonymizer/operators/custom.py
class Custom(Operator):
    """
    Replace PII text entity with the results of a function executed on the PII text.

    The function return type must be a string.
    """

    # Key in ``params`` under which the user-supplied callable is stored.
    LAMBDA = "lambda"

    def operate(self, text: str = None, params: Dict = None) -> str:
        """
        Run the user-supplied callable on the text.

        :param text: the PII text to transform
        :param params: must hold the callable under the "lambda" key
        :return: result of function executed on the text.
        """
        new_val = params.get(self.LAMBDA)
        return new_val(text)

    def validate(self, params: Dict) -> None:
        """
        Validate the provided function is returning a string.

        The callable is invoked once with the sample input "PII" to inspect
        what it returns.

        :param params: must hold the callable under the "lambda" key
        :raises InvalidParamException: if the value is not callable or does
            not return a string.
        """
        new_val = params.get(self.LAMBDA)
        if not callable(new_val):
            raise InvalidParamException("New value must be a callable function")
        # isinstance (rather than an exact type match) also accepts str
        # subclasses, which are valid strings.
        if not isinstance(new_val("PII"), str):
            raise InvalidParamException("Function return type must be a str")

    def operator_name(self) -> str:
        """Return operator name."""
        return "custom"

    def operator_type(self) -> OperatorType:
        """Return operator type."""
        return OperatorType.Anonymize
operate(self, text=None, params=None)
:return: result of function executed on the text.
Source code in presidio_anonymizer/operators/custom.py
def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Run the user-supplied callable on the text.

    :param text: the PII text to transform
    :param params: must hold the callable under the key named by ``LAMBDA``
    :return: result of function executed on the text.
    """
    transform = params.get(self.LAMBDA)
    return transform(text)
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/custom.py
def operator_name(self) -> str:
    """Return the name identifying this operator: "custom"."""
    return "custom"
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/custom.py
def operator_type(self) -> OperatorType:
    """Return the operator category: this operator anonymizes."""
    return OperatorType.Anonymize
validate(self, params)
Validate the provided function is returning a string.
Source code in presidio_anonymizer/operators/custom.py
def validate(self, params: Dict) -> None:
    """
    Validate the provided function is returning a string.

    The callable is invoked once with the sample input "PII" to inspect
    what it returns.

    :param params: must hold the callable under the key named by ``LAMBDA``
    :raises InvalidParamException: if the value is not callable or does
        not return a string.
    """
    new_val = params.get(self.LAMBDA)
    if not callable(new_val):
        raise InvalidParamException("New value must be a callable function")
    # isinstance (rather than an exact type match) also accepts str
    # subclasses, which are valid strings.
    if not isinstance(new_val("PII"), str):
        raise InvalidParamException("Function return type must be a str")
decrypt
Decrypt (Operator)
Decrypt text from its encrypted form.
Source code in presidio_anonymizer/operators/decrypt.py
class Decrypt(Operator):
    """Decrypt text from its encrypted form."""

    NAME = "decrypt"
    # Key in ``params`` under which the cryptographic key string is stored.
    KEY = "key"

    def operate(self, text: str = None, params: Dict = None) -> str:
        """
        Decrypt the text.

        :param text: The text for decryption.
        :param params:
            * *key* The key supplied by the user for the encryption.
        :return: The decrypted text
        """
        key_bytes = params.get(self.KEY).encode("utf8")
        return AESCipher.decrypt(key=key_bytes, text=text)

    def validate(self, params: Dict = None) -> None:
        """
        Validate Decrypt parameters.

        :param params:
            * *key* The key supplied by the user for the encryption.
                    Should be a string of 128, 192 or 256 bits length.
        :raises InvalidParamException in case of an invalid parameter.
        """
        key = params.get(self.KEY)
        validate_parameter(key, self.KEY, str)
        if AESCipher.is_valid_key_size(key.encode("utf8")):
            return
        raise InvalidParamException(
            f"Invalid input, {self.KEY} must be of length 128, 192 or 256 bits"
        )

    def operator_name(self) -> str:
        """Return the name identifying this operator (``NAME``)."""
        return self.NAME

    def operator_type(self) -> OperatorType:
        """Return the operator category: this operator deanonymizes."""
        return OperatorType.Deanonymize
operate(self, text=None, params=None)
Decrypt the text.
:param text: The text for decryption. :param params: * key The key supplied by the user for the encryption. :return: The decrypted text
Source code in presidio_anonymizer/operators/decrypt.py
def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Decrypt the text.

    :param text: The text for decryption.
    :param params:
        * *key* The key supplied by the user for the encryption.
    :return: The decrypted text
    """
    key_bytes = params.get(self.KEY).encode("utf8")
    return AESCipher.decrypt(key=key_bytes, text=text)
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/decrypt.py
def operator_name(self) -> str:
    """Return the name identifying this operator (``NAME``)."""
    return self.NAME
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/decrypt.py
def operator_type(self) -> OperatorType:
    """Return the operator category: this operator deanonymizes."""
    return OperatorType.Deanonymize
validate(self, params=None)
Validate Decrypt parameters.
:param params: * key The key supplied by the user for the encryption. Should be a string of 128, 192 or 256 bits length. :raises InvalidParamException in case of an invalid parameter.
Source code in presidio_anonymizer/operators/decrypt.py
def validate(self, params: Dict = None) -> None:
    """
    Validate Decrypt parameters.

    :param params:
        * *key* The key supplied by the user for the encryption.
                Should be a string of 128, 192 or 256 bits length.
    :raises InvalidParamException in case of an invalid parameter.
    """
    key = params.get(self.KEY)
    validate_parameter(key, self.KEY, str)
    if AESCipher.is_valid_key_size(key.encode("utf8")):
        return
    raise InvalidParamException(
        f"Invalid input, {self.KEY} must be of length 128, 192 or 256 bits"
    )
encrypt
Encrypt (Operator)
Anonymizes text to an encrypted form, so that it can later be restored using decrypt.
Source code in presidio_anonymizer/operators/encrypt.py
class Encrypt(Operator):
    """Anonymizes text to an encrypted form, so it can later be restored using decrypt."""

    # Key in ``params`` under which the cryptographic key string is stored.
    KEY = "key"

    def operate(self, text: str = None, params: Dict = None) -> str:
        """
        Anonymize the text with an encrypted text.

        :param text: The text for encryption.
        :param params:
            * *key* The key supplied by the user for the encryption.
        :return: The encrypted text
        """
        key_bytes = params.get(self.KEY).encode("utf8")
        return AESCipher.encrypt(key_bytes, text)

    def validate(self, params: Dict = None) -> None:
        """
        Validate Encrypt parameters.

        :param params:
            * *key* The key supplied by the user for the encryption.
                    Should be a string of 128, 192 or 256 bits length.
        :raises InvalidParamException in case of an invalid parameter.
        """
        key = params.get(self.KEY)
        validate_parameter(key, self.KEY, str)
        if AESCipher.is_valid_key_size(key.encode("utf8")):
            return
        raise InvalidParamException(
            f"Invalid input, {self.KEY} must be of length 128, 192 or 256 bits"
        )

    def operator_name(self) -> str:
        """Return the name identifying this operator: "encrypt"."""
        return "encrypt"

    def operator_type(self) -> OperatorType:
        """Return the operator category: this operator anonymizes."""
        return OperatorType.Anonymize
operate(self, text=None, params=None)
Anonymize the text with an encrypted text.
:param text: The text for encryption. :param params: * key The key supplied by the user for the encryption. :return: The encrypted text
Source code in presidio_anonymizer/operators/encrypt.py
def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Anonymize the text with an encrypted text.

    :param text: The text for encryption.
    :param params:
        * *key* The key supplied by the user for the encryption.
    :return: The encrypted text
    """
    key_bytes = params.get(self.KEY).encode("utf8")
    return AESCipher.encrypt(key_bytes, text)
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/encrypt.py
def operator_name(self) -> str:
    """Return the name identifying this operator: "encrypt"."""
    return "encrypt"
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/encrypt.py
def operator_type(self) -> OperatorType:
    """Return the operator category: this operator anonymizes."""
    return OperatorType.Anonymize
validate(self, params=None)
Validate Encrypt parameters.
:param params: * key The key supplied by the user for the encryption. Should be a string of 128, 192 or 256 bits length. :raises InvalidParamException in case of an invalid parameter.
Source code in presidio_anonymizer/operators/encrypt.py
def validate(self, params: Dict = None) -> None:
    """
    Validate Encrypt parameters.

    :param params:
        * *key* The key supplied by the user for the encryption.
                Should be a string of 128, 192 or 256 bits length.
    :raises InvalidParamException in case of an invalid parameter.
    """
    key = params.get(self.KEY)
    validate_parameter(key, self.KEY, str)
    if AESCipher.is_valid_key_size(key.encode("utf8")):
        return
    raise InvalidParamException(
        f"Invalid input, {self.KEY} must be of length 128, 192 or 256 bits"
    )
hash
Hashes the PII text entity.
Hash (Operator)
Hash given text with sha256/sha512/md5 algorithm.
Source code in presidio_anonymizer/operators/hash.py
class Hash(Operator):
    """Hash given text with sha256/sha512/md5 algorithm."""

    # Key in ``params`` selecting the algorithm, and its accepted values.
    HASH_TYPE = "hash_type"
    SHA256 = "sha256"
    SHA512 = "sha512"
    MD5 = "md5"

    def operate(self, text: str = None, params: Dict = None) -> str:
        """
        Hash the text with the selected algorithm (sha256 when none is given).

        :param text: the text to hash
        :param params: may hold the algorithm name under "hash_type"
        :return: hex digest of the encoded original text
        """
        algorithms = {
            self.SHA256: sha256,
            self.SHA512: sha512,
            self.MD5: md5,
        }
        chosen = algorithms.get(self._get_hash_type_or_default(params))
        return chosen(text.encode()).hexdigest()

    def validate(self, params: Dict = None) -> None:
        """Validate the hash type is string and in range of allowed hash types."""
        allowed_types = [self.SHA256, self.SHA512, self.MD5]
        validate_parameter_in_range(
            allowed_types,
            self._get_hash_type_or_default(params),
            self.HASH_TYPE,
            str,
        )

    def operator_name(self) -> str:
        """Return the name identifying this operator: "hash"."""
        return "hash"

    def _get_hash_type_or_default(self, params: Dict = None):
        # Falls back to sha256 when the caller did not choose an algorithm.
        return params.get(self.HASH_TYPE, self.SHA256)

    def operator_type(self) -> OperatorType:
        """Return the operator category: this operator anonymizes."""
        return OperatorType.Anonymize
operate(self, text=None, params=None)
Hash given value using the selected algorithm (sha256 by default).
:return: hashed original text
Source code in presidio_anonymizer/operators/hash.py
def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Hash the text with the algorithm chosen via ``_get_hash_type_or_default``.

    :param text: the text to hash
    :param params: operator parameters, forwarded to the hash type lookup
    :return: hex digest of the encoded original text
    """
    algorithms = {
        self.SHA256: sha256,
        self.SHA512: sha512,
        self.MD5: md5,
    }
    chosen = algorithms.get(self._get_hash_type_or_default(params))
    return chosen(text.encode()).hexdigest()
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/hash.py
def operator_name(self) -> str:
    """Return the name identifying this operator: "hash"."""
    return "hash"
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/hash.py
def operator_type(self) -> OperatorType:
    """Return the operator category: this operator anonymizes."""
    return OperatorType.Anonymize
validate(self, params=None)
Validate the hash type is string and in range of allowed hash types.
Source code in presidio_anonymizer/operators/hash.py
def validate(self, params: Dict = None) -> None:
    """Validate the hash type is string and in range of allowed hash types."""
    allowed_types = [self.SHA256, self.SHA512, self.MD5]
    validate_parameter_in_range(
        allowed_types,
        self._get_hash_type_or_default(params),
        self.HASH_TYPE,
        str,
    )
mask
Mask some or all given text entity PII with given character.
Mask (Operator)
Mask the given text with given value.
Source code in presidio_anonymizer/operators/mask.py
class Mask(Operator):
    """Mask the given text with given value."""

    # Keys expected in the ``params`` dictionary.
    CHARS_TO_MASK = "chars_to_mask"
    FROM_END = "from_end"
    MASKING_CHAR = "masking_char"

    def operate(self, text: str = None, params: Dict = None) -> str:
        """
        Mask a given amount of text with a given character.

        :param text: the text to be masked
        :param params:
            masking_char: The character to be masked with
            chars_to_mask: The amount of characters to mask
            from_end: Whether to mask the text from its end
        :return: the masked text
        """
        requested = params.get(self.CHARS_TO_MASK)
        mask_count = self._get_effective_chars_to_mask(text, requested)
        mask_from_end = params.get(self.FROM_END)
        mask_symbol = params.get(self.MASKING_CHAR)
        return self._get_anonymized_text(text, mask_count, mask_from_end, mask_symbol)

    def validate(self, params: Dict = None) -> None:
        """
        Validate the parameters for mask.

        :param params:
            masking_char: The character to be masked with
            chars_to_mask: The amount of characters to mask
            from_end: Whether to mask the text from its end
        :raises InvalidParamException: if masking_char is longer than one
            character.
        """
        masking_char = params.get(self.MASKING_CHAR)
        validate_parameter(masking_char, self.MASKING_CHAR, str)
        if len(masking_char) > 1:
            raise InvalidParamException(
                f"Invalid input, {self.MASKING_CHAR} must be a character"
            )
        for name, expected_type in ((self.CHARS_TO_MASK, int), (self.FROM_END, bool)):
            validate_parameter(params.get(name), name, expected_type)

    def operator_name(self) -> str:
        """Return the name identifying this operator: "mask"."""
        return "mask"

    def operator_type(self) -> OperatorType:
        """Return the operator category: this operator anonymizes."""
        return OperatorType.Anonymize

    @staticmethod
    def _get_effective_chars_to_mask(text, chars_to_mask):
        # Never mask more characters than the text has; non-positive
        # requests mask nothing.
        if chars_to_mask > 0:
            return min(len(text), chars_to_mask)
        return 0

    @staticmethod
    def _get_anonymized_text(text, chars_to_mask, from_end, masking_char):
        masked_part = masking_char * chars_to_mask
        if from_end:
            kept_length = len(text) - chars_to_mask
            return text[:kept_length] + masked_part
        return masked_part + text[chars_to_mask:]
operate(self, text=None, params=None)
Mask a given amount of text with a given character.
:param text: the text to be masked :param params: masking_char: The character to be masked with chars_to_mask: The amount of characters to mask from_end: Whether to mask the text from its end :return: the masked text
Source code in presidio_anonymizer/operators/mask.py
def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Mask a given amount of text with a given character.

    :param text: the text to be masked
    :param params:
        masking_char: The character to be masked with
        chars_to_mask: The amount of characters to mask
        from_end: Whether to mask the text from its end
    :return: the masked text
    """
    requested = params.get(self.CHARS_TO_MASK)
    mask_count = self._get_effective_chars_to_mask(text, requested)
    mask_from_end = params.get(self.FROM_END)
    mask_symbol = params.get(self.MASKING_CHAR)
    return self._get_anonymized_text(text, mask_count, mask_from_end, mask_symbol)
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/mask.py
def operator_name(self) -> str:
    """Return the name identifying this operator: "mask"."""
    return "mask"
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/mask.py
def operator_type(self) -> OperatorType:
    """Return the operator category: this operator anonymizes."""
    return OperatorType.Anonymize
validate(self, params=None)
Validate the parameters for mask.
:param params: masking_char: The character to be masked with chars_to_mask: The amount of characters to mask from_end: Whether to mask the text from its end
Source code in presidio_anonymizer/operators/mask.py
def validate(self, params: Dict = None) -> None:
    """
    Validate the parameters for mask.

    :param params:
        masking_char: The character to be masked with
        chars_to_mask: The amount of characters to mask
        from_end: Whether to mask the text from its end
    :raises InvalidParamException: if masking_char is longer than one
        character.
    """
    masking_char = params.get(self.MASKING_CHAR)
    validate_parameter(masking_char, self.MASKING_CHAR, str)
    if len(masking_char) > 1:
        raise InvalidParamException(
            f"Invalid input, {self.MASKING_CHAR} must be a character"
        )
    for name, expected_type in ((self.CHARS_TO_MASK, int), (self.FROM_END, bool)):
        validate_parameter(params.get(name), name, expected_type)
operator
Operator abstraction - each operator should implement this class.
Operator (ABC)
Operator abstract class to be implemented by each operator.
Source code in presidio_anonymizer/operators/operator.py
class Operator(ABC):
    """Abstract base that every operator implements."""

    @abstractmethod
    def operate(self, text: str, params: Dict = None) -> str:
        """Transform ``text`` using ``params``; implemented by each operator."""

    @abstractmethod
    def validate(self, params: Dict = None) -> None:
        """Check the parameters of the concrete operator."""

    @abstractmethod
    def operator_name(self) -> str:
        """Return the name identifying the operator."""

    @abstractmethod
    def operator_type(self) -> OperatorType:
        """Return the operator type."""
operate(self, text, params=None)
Operate method to be implemented in each operator.
Source code in presidio_anonymizer/operators/operator.py
@abstractmethod
def operate(self, text: str, params: Dict = None) -> str:
    """Transform ``text`` using ``params``; implemented by each operator."""
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/operator.py
@abstractmethod
def operator_name(self) -> str:
    """Return the name identifying the operator."""
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/operator.py
@abstractmethod
def operator_type(self) -> OperatorType:
    """Return the operator type."""
validate(self, params=None)
Validate each operator parameters.
Source code in presidio_anonymizer/operators/operator.py
@abstractmethod
def validate(self, params: Dict = None) -> None:
    """Check the parameters of the concrete operator."""
OperatorType (Enum)
Operator type, either anonymize or deanonymize, used to separate the operators.
Source code in presidio_anonymizer/operators/operator.py
class OperatorType(Enum):
    """Category of an operator: anonymizing or deanonymizing."""

    Anonymize = 1
    Deanonymize = 2
operators_factory
OperatorsFactory
Operators factory to get the correct operator class.
Source code in presidio_anonymizer/operators/operators_factory.py
class OperatorsFactory:
    """Operators factory to get the correct operator class."""

    # Lookup tables stored on the class; each is filled on first use and
    # reused by later calls.
    _anonymizers: Dict = None
    _deanonymizers: Dict = None
    _operator_class: Dict = None

    def __init__(self):
        self.logger = logging.getLogger("presidio-anonymizer")

    def create_operator_class(
        self, operator_name: str, operator_type: OperatorType
    ) -> Operator:
        """
        Look up an operator by type and name and instantiate it.

        :param operator_name: operator name.
        :param operator_type: Either Anonymize or Deanonymize, selecting
            which group of operators is searched.
        :return: operator class entity.
        :raises InvalidParamException: if no operators exist for the type or
            no operator with that name exists within the type.
        """
        candidates = self.__get_operators_classes().get(operator_type)
        if not candidates:
            self.logger.error(f"No such operator type {operator_type}")
            raise InvalidParamException(f"Invalid operator type '{operator_type}'.")
        chosen_class = candidates.get(operator_name)
        if not chosen_class:
            self.logger.error(f"No such operator class {operator_name}")
            raise InvalidParamException(f"Invalid operator class '{operator_name}'.")
        self.logger.debug(f"applying class {chosen_class}")
        return chosen_class()

    @staticmethod
    def __get_operators_classes():
        # Maps each operator type to its name -> class table.
        if not OperatorsFactory._operator_class:
            OperatorsFactory._operator_class = {
                OperatorType.Anonymize: OperatorsFactory.get_anonymizers(),
                OperatorType.Deanonymize: OperatorsFactory.get_deanonymizers(),
            }
        return OperatorsFactory._operator_class

    @staticmethod
    def get_anonymizers() -> Dict[str, "Operator"]:
        """Return all anonymizers classes currently available."""
        known = OperatorsFactory._anonymizers
        if not known:
            known = OperatorsFactory.__get_operators_by_type(OperatorType.Anonymize)
            OperatorsFactory._anonymizers = known
        return known

    @staticmethod
    def get_deanonymizers() -> Dict[str, "Operator"]:
        """Return all deanonymizers classes currently available."""
        known = OperatorsFactory._deanonymizers
        if not known:
            known = OperatorsFactory.__get_operators_by_type(OperatorType.Deanonymize)
            OperatorsFactory._deanonymizers = known
        return known

    @staticmethod
    def __get_operators_by_type(operator_type: OperatorType):
        # Scans the direct subclasses of Operator; the methods are invoked
        # with the class itself standing in for an instance.
        matching = {}
        for candidate in Operator.__subclasses__():
            if candidate.operator_type(candidate) == operator_type:
                matching[candidate.operator_name(candidate)] = candidate
        return matching
create_operator_class(self, operator_name, operator_type)
Extract the operator class from the operators list.
:param operator_type: Either Anonymize or Deanonymize, to distinguish between operators. :param operator_name: operator name. :return: operator class entity.
Source code in presidio_anonymizer/operators/operators_factory.py
def create_operator_class(
    self, operator_name: str, operator_type: OperatorType
) -> Operator:
    """
    Look up an operator by type and name and instantiate it.

    :param operator_name: operator name.
    :param operator_type: Either Anonymize or Deanonymize, selecting
        which group of operators is searched.
    :return: operator class entity.
    :raises InvalidParamException: if no operators exist for the type or
        no operator with that name exists within the type.
    """
    candidates = self.__get_operators_classes().get(operator_type)
    if not candidates:
        self.logger.error(f"No such operator type {operator_type}")
        raise InvalidParamException(f"Invalid operator type '{operator_type}'.")
    chosen_class = candidates.get(operator_name)
    if not chosen_class:
        self.logger.error(f"No such operator class {operator_name}")
        raise InvalidParamException(f"Invalid operator class '{operator_name}'.")
    self.logger.debug(f"applying class {chosen_class}")
    return chosen_class()
get_anonymizers()
staticmethod
Return all anonymizers classes currently available.
Source code in presidio_anonymizer/operators/operators_factory.py
@staticmethod
def get_anonymizers() -> Dict[str, "Operator"]:
    """Return all anonymizers classes currently available.

    The table is built on first use and stored on the class for reuse.
    """
    known = OperatorsFactory._anonymizers
    if not known:
        known = OperatorsFactory.__get_operators_by_type(OperatorType.Anonymize)
        OperatorsFactory._anonymizers = known
    return known
get_deanonymizers()
staticmethod
Return all deanonymizer classes currently available.
Source code in presidio_anonymizer/operators/operators_factory.py
@staticmethod
def get_deanonymizers() -> Dict[str, "Operator"]:
    """
    Return all deanonymizer classes currently available.

    The mapping is built on first use and stored on
    ``OperatorsFactory._deanonymizers``; it is rebuilt whenever the stored
    value is falsy (unset or empty).

    :return: the mapping produced by ``__get_operators_by_type`` for
        ``OperatorType.Deanonymize``.
    """
    cached = OperatorsFactory._deanonymizers
    if cached:
        return cached
    cached = OperatorsFactory.__get_operators_by_type(OperatorType.Deanonymize)
    OperatorsFactory._deanonymizers = cached
    return cached
redact
Replaces the PII text entity with an empty string.
Redact (Operator)
Redact the string - empty value.
Source code in presidio_anonymizer/operators/redact.py
class Redact(Operator):
    """Anonymizing operator that replaces the PII text with an empty string."""

    def operator_name(self) -> str:
        """Return the operator name, ``"redact"``."""
        name = "redact"
        return name

    def operator_type(self) -> OperatorType:
        """Return the operator type, ``OperatorType.Anonymize``."""
        kind = OperatorType.Anonymize
        return kind

    def validate(self, params: Dict = None) -> None:
        """Redact does not require any parameters, so nothing is validated."""
        return None

    def operate(self, text: str = None, params: Dict = None) -> str:
        """
        Produce the redacted value.

        :param text: not used.
        :param params: not used.
        :return: an empty string.
        """
        redacted = ""
        return redacted
operate(self, text=None, params=None)
:return: an empty value.
Source code in presidio_anonymizer/operators/redact.py
def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Produce the redacted value.

    :param text: not used.
    :param params: not used.
    :return: an empty string.
    """
    redacted = ""
    return redacted
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/redact.py
def operator_name(self) -> str:
    """Return the operator name, ``"redact"``."""
    name = "redact"
    return name
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/redact.py
def operator_type(self) -> OperatorType:
    """Return the operator type, ``OperatorType.Anonymize``."""
    kind = OperatorType.Anonymize
    return kind
validate(self, params=None)
Redact does not require any parameters, so no validation is needed.
Source code in presidio_anonymizer/operators/redact.py
def validate(self, params: Dict = None) -> None:
    """Redact does not require any parameters, so nothing is validated."""
    return None
replace
Replaces the PII text entity with a new string.
Replace (Operator)
Receives new text to replace old PII text entity with.
Source code in presidio_anonymizer/operators/replace.py
class Replace(Operator):
    """Anonymizing operator that swaps the PII text for a caller-supplied value."""

    # Key in ``params`` that holds the replacement text.
    NEW_VALUE = "new_value"

    def operator_name(self) -> str:
        """Return the operator name, ``"replace"``."""
        name = "replace"
        return name

    def operator_type(self) -> OperatorType:
        """Return the operator type, ``OperatorType.Anonymize``."""
        kind = OperatorType.Anonymize
        return kind

    def validate(self, params: Dict = None) -> None:
        """
        Validate that the new value, when given, is a string.

        :param params: operator parameters; ``params[NEW_VALUE]`` is checked
            with ``validate_type``. A missing (None) value is accepted.
        """
        candidate = params.get(self.NEW_VALUE)
        validate_type(candidate, self.NEW_VALUE, str)

    def operate(self, text: str = None, params: Dict = None) -> str:
        """
        Return the replacement for the PII text.

        :param text: not used.
        :param params: operator parameters; reads ``NEW_VALUE`` and
            ``"entity_type"``.
        :return: ``params[NEW_VALUE]`` when it is truthy, otherwise the
            placeholder ``<entity_type>``.
        """
        replacement = params.get(self.NEW_VALUE)
        if replacement:
            return replacement
        # No (or empty) replacement supplied: fall back to "<ENTITY_TYPE>".
        return f"<{params.get('entity_type')}>"
operate(self, text=None, params=None)
:return: new_value.
Source code in presidio_anonymizer/operators/replace.py
def operate(self, text: str = None, params: Dict = None) -> str:
    """
    Return the replacement for the PII text.

    :param text: not used.
    :param params: operator parameters; reads ``self.NEW_VALUE`` and
        ``"entity_type"``.
    :return: ``params[self.NEW_VALUE]`` when it is truthy, otherwise the
        placeholder ``<entity_type>``.
    """
    replacement = params.get(self.NEW_VALUE)
    if replacement:
        return replacement
    # No (or empty) replacement supplied: fall back to "<ENTITY_TYPE>".
    return f"<{params.get('entity_type')}>"
operator_name(self)
Return operator name.
Source code in presidio_anonymizer/operators/replace.py
def operator_name(self) -> str:
    """Return the operator name, ``"replace"``."""
    name = "replace"
    return name
operator_type(self)
Return operator type.
Source code in presidio_anonymizer/operators/replace.py
def operator_type(self) -> OperatorType:
    """Return the operator type, ``OperatorType.Anonymize``."""
    kind = OperatorType.Anonymize
    return kind
validate(self, params=None)
Validate the new value is string.
Source code in presidio_anonymizer/operators/replace.py
def validate(self, params: Dict = None) -> None:
    """
    Validate that the new value, when given, is a string.

    :param params: operator parameters; ``params[self.NEW_VALUE]`` is checked
        with ``validate_type``. A missing (None) value is accepted.
    """
    candidate = params.get(self.NEW_VALUE)
    validate_type(candidate, self.NEW_VALUE, str)
services
special
Services init.
app_entities_convertor
AppEntitiesConvertor
Assisting class to convert API json entities to engine entities.
Source code in presidio_anonymizer/services/app_entities_convertor.py
class AppEntitiesConvertor:
    """Assisting class to convert API json entities to engine entities."""

    @staticmethod
    def analyzer_results_from_json(data: List[Dict]) -> List["RecognizerResult"]:
        """
        Convert a list of analyzer results in JSON form to RecognizerResult.

        :param data: list of analyzer result dicts; each item is passed to
            ``RecognizerResult.from_json``.
        :return: the converted results, in input order.
        :raises InvalidParamException: if ``data`` is None.
        """
        if data is None:
            raise InvalidParamException(
                "Invalid input, request must contain analyzer results"
            )
        return [RecognizerResult.from_json(raw_result) for raw_result in data]

    @staticmethod
    def operators_config_from_json(data: Dict) -> Dict[str, "OperatorConfig"]:
        """
        Convert a mapping of operator JSON objects to OperatorConfig objects.

        :param data: mapping of key -> operator JSON, or None.
        :return: mapping with the same keys whose values were built with
            ``OperatorConfig.from_json``; an empty dict when ``data`` is None.
        """
        if data is None:
            return {}
        return {
            key: OperatorConfig.from_json(operator_json)
            for key, operator_json in data.items()
        }

    @staticmethod
    def deanonymize_entities_from_json(json: Dict) -> List["OperatorResult"]:
        """
        Create the list of OperatorResult entities to deanonymize.

        The items are read from the ``"anonymizer_results"`` key and each one
        is passed to ``OperatorResult.from_json``.

        :param json: request payload, e.g.
            {
                "text": text,
                "anonymizer_results": [{
                    "start": 0,
                    "end": 10,
                    "key": "1111111111111111",
                    "entity_type":"PHONE_NUMBER"
                }],
            }
            (item fields are carried over from the earlier example; confirm
            them against ``OperatorResult.from_json``).
        :return: List[OperatorResult]; empty when the key is missing or empty.
        """
        raw_results = json.get("anonymizer_results")
        if not raw_results:
            return []
        return [OperatorResult.from_json(result) for result in raw_results]

    @staticmethod
    def check_custom_operator(operators: Dict[str, OperatorConfig]):
        """
        Check if any of the operators is of type custom.

        :param operators: mapping of key -> operator config.
        :return: True if at least one config has ``operator_name == "custom"``.
        """
        # Generator (not a list) so any() stops at the first custom operator.
        return any(config.operator_name == "custom" for config in operators.values())
analyzer_results_from_json(data)
staticmethod
Go over analyzer results, validate them and convert to List[RecognizerResult].
:param data: the list of analyzer results in JSON form; each item is converted to a RecognizerResult.
Source code in presidio_anonymizer/services/app_entities_convertor.py
@staticmethod
def analyzer_results_from_json(data: List[Dict]) -> List["RecognizerResult"]:
"""
Go over analyzer results, validate them and convert to List[RecognizerResult].
:param data: contains the anonymizers and analyzer_results_json
"""
if data is None:
raise InvalidParamException(
"Invalid input, " "request must contain analyzer results"
)
return [RecognizerResult.from_json(analyzer_result) for analyzer_result in data]
check_custom_operator(operators)
staticmethod
Check if an operator is of type custom.
Source code in presidio_anonymizer/services/app_entities_convertor.py
@staticmethod
def check_custom_operator(operators: Dict[str, OperatorConfig]):
"""Check if an operator is of type custom."""
return any([config.operator_name == "custom" for config in operators.values()])
deanonymize_entities_from_json(json)
staticmethod
Create DecryptEntity list.
:param json: { "text": text, "anonymizer_results": [{ "start": 0, "end": 10, "key": "1111111111111111", "entity_type":"PHONE_NUMBER" }], } :return: List[OperatorResult], empty when "anonymizer_results" is missing or empty.
Source code in presidio_anonymizer/services/app_entities_convertor.py
@staticmethod
def deanonymize_entities_from_json(json: Dict) -> List["OperatorResult"]:
"""
Create DecryptEntity list.
:param json:
{
"text": text,
"encrypt_results": [{
"start": 0,
"end": 10,
"key": "1111111111111111",
"entity_type":"PHONE_NUMBER"
}],
}
:return: List[OperatorResult]
"""
decrypt_entity = json.get("anonymizer_results")
return (
[OperatorResult.from_json(result) for result in decrypt_entity]
if decrypt_entity
else []
)
operators_config_from_json(data)
staticmethod
Go over the operators list and get the relevant create operator config entity.
:param data: a mapping from key to operator JSON; each value is converted to an OperatorConfig. When data is None an empty dict is returned.
Source code in presidio_anonymizer/services/app_entities_convertor.py
@staticmethod
def operators_config_from_json(data: Dict) -> Dict[str, "OperatorConfig"]:
"""
Go over the operators list and get the relevant create operator config entity.
:param data: contains the list of configuration
value - OperatorConfig
"""
if data is not None:
return {
key: OperatorConfig.from_json(operator_json)
for (key, operator_json) in data.items()
}
return {}
validators
Anonymizer validation utility methods.
validate_parameter(parameter_value, parameter_name, parameter_type)
Validate an anonymizer parameter.
Both validate the existence of an anonymizer parameter and that it is an instance of the parameter_type. Otherwise, raise the appropriate InvalidParamException with the parameter_name as content.
Source code in presidio_anonymizer/services/validators.py
def validate_parameter(
    parameter_value, parameter_name: str, parameter_type: type
) -> None:
    """
    Validate that a required anonymizer parameter is present and well typed.

    :param parameter_value: the value to check.
    :param parameter_name: the parameter name, used in the error message.
    :param parameter_type: the type handed to ``validate_type``.
    :raises InvalidParamException: if ``parameter_value`` is None; a type
        mismatch is reported by ``validate_type``.
    """
    if parameter_value is not None:
        validate_type(parameter_value, parameter_name, parameter_type)
        return
    raise InvalidParamException(f"Expected parameter {parameter_name}")
validate_parameter_exists(parameter_value, entity, parameter_name)
Validate parameter is not empty.
Source code in presidio_anonymizer/services/validators.py
def validate_parameter_exists(
    parameter_value, entity: str, parameter_name: str
) -> None:
    """
    Validate that the parameter is not None.

    :param parameter_value: the value to check.
    :param entity: name of the entity that should contain the parameter,
        used in the error message.
    :param parameter_name: the parameter name, used in the error message.
    :raises InvalidParamException: if ``parameter_value`` is None.
    """
    if parameter_value is not None:
        return
    message = f"Invalid input, {entity} must contain {parameter_name}"
    raise InvalidParamException(message)
validate_parameter_in_range(values_range, parameter_value, parameter_name, parameter_type)
Validate an anonymizer parameter.
Validates the existence of an anonymizer parameter, that it is an instance of the parameter_type, and that it is within the range of provided values. Otherwise, raises the appropriate InvalidParamException with the parameter_name as content.
Source code in presidio_anonymizer/services/validators.py
def validate_parameter_in_range(
    values_range, parameter_value, parameter_name: str, parameter_type: type
) -> None:
    """
    Validate that an anonymizer parameter is one of the allowed values.

    Checks, in order, that the parameter exists, that it is an instance of
    ``parameter_type`` and that it is contained in ``values_range``.

    :param values_range: container of the allowed values.
    :param parameter_value: the value to check.
    :param parameter_name: the parameter name, used in the error messages.
    :param parameter_type: the expected type of ``parameter_value``.
    :raises InvalidParamException: if the value is None, is of the wrong type
        or is not in ``values_range``.
    """
    # Pass the caller's expected type through; hard-coding ``object`` here
    # would make the type check a no-op and leave parameter_type unused.
    validate_parameter(parameter_value, parameter_name, parameter_type)
    if parameter_value not in values_range:
        raise InvalidParamException(
            f"Parameter {parameter_name} value {parameter_value} is not in "
            f"range of values {values_range}"
        )
validate_parameter_not_empty(parameter_value, entity, parameter_name)
Validate that the parameter exists and is not empty.
Source code in presidio_anonymizer/services/validators.py
def validate_parameter_not_empty(
    parameter_value, entity: str, parameter_name: str
) -> None:
    """
    Validate that the parameter exists and is not empty.

    Any falsy value (None, empty string, empty container, 0) is rejected.

    :param parameter_value: the value to check.
    :param entity: name of the entity that should contain the parameter,
        used in the error message.
    :param parameter_name: the parameter name, used in the error message.
    :raises InvalidParamException: if ``parameter_value`` is falsy.
    """
    if parameter_value:
        return
    message = f"Invalid input, {entity} must contain {parameter_name}"
    raise InvalidParamException(message)
validate_type(parameter_value, parameter_name, parameter_type)
Validate an anonymizer parameter.
Validate it exists and if so, that it is the instance of the parameter_type. Otherwise, raise the appropriate InvalidParamException with the parameter_name as content.
Source code in presidio_anonymizer/services/validators.py
def validate_type(parameter_value, parameter_name, parameter_type):
    """
    Validate the type of an optional anonymizer parameter.

    A None value is treated as "not provided" and accepted. Any other value
    must be an instance of ``parameter_type``.

    :param parameter_value: the value to check, or None.
    :param parameter_name: the parameter name, used in the error message.
    :param parameter_type: the expected type.
    :raises InvalidParamException: if the value is given and is not an
        instance of ``parameter_type``.
    """
    # Test against None rather than truthiness: falsy values of the wrong
    # type (0, False, [], ...) must not slip through the type check.
    if parameter_value is None or isinstance(parameter_value, parameter_type):
        return
    message = _get_bad_typed_parameter_error_message(
        parameter_name,
        expected_type=parameter_type,
        actual_type=type(parameter_value),
    )
    raise InvalidParamException(message)