diff --git a/README.md b/README.md index c71b935..800a89e 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ from hexanonyme import ReplaceAnonymizer, RedactAnonymizer ## ReplaceAnoymizer The replace anonymizer can take the following arguments -- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS"]`. +- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC", "TEL", "MAIL"]`. - faker (bool): Whether to use Faker library for fake data generation (default: `True`). For instance if entities list is ["PER", "LOC"], fakes names and cities will be generated and used to replaced the entities in the original text. - replacement_dict : Dictionary of replacement values for specific entity types . If faker argument is set to `False`, you can supply a dictionary for you entity replacement. For instance `{"PER" : "Jean Pierre", "LOC" : "Marseille"}` will replace all PER and LOC entities by respectively **Jean Pierre** and **Marseille**. @@ -69,7 +69,7 @@ restored_text = replace_anonymizer.deanonymize(anonymized_text) ## RedactAnonymizer Contrary to the replace anonymizer, the redact anonymizer takes only one argument (the list of entities) -- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC"]`. +- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC", "TEL", "MAIL"]`. ```python # Initialize ReplaceAnonymizer @@ -103,6 +103,8 @@ Data anonymization is crucial for protecting individuals' privacy and complying - ADDRESS (postal addresses) - ORG (organisation) - MISC (films, series) +- TEL (telephone number) +- MAIL (email address) These NER models accurately identify PII entities in French text. diff --git a/hexanonyme/core/anonymizer/base_anonymizer.py b/hexanonyme/core/anonymizer/base_anonymizer.py index 9fb6f03..84ce842 100644 --- a/hexanonyme/core/anonymizer/base_anonymizer.py +++ b/hexanonyme/core/anonymizer/base_anonymizer.py @@ -1,5 +1,5 @@ from transformers import pipeline - +import re class BaseAnonymizer: def __init__(self): @@ -29,7 +29,7 @@ def load_pipelines(self): def merge_overlapping_entities(self, entities): """ - Merge overlaps over one entity. + Merge overlaps over one entity. In some cases a person name like "Cecile Da Costa." can be identified as two PER entities bacause of the formating of the text and what comes before Args: @@ -55,7 +55,7 @@ def merge_overlapping_entities(self, entities): merged_entities.append(entity) i = j return merged_entities - + def find_telephone_number(self, text): entities = [] multiples_regex_formats = ["((?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2}))"] @@ -71,7 +71,7 @@ def find_telephone_number(self, text): } entities.append(entity) return entities - + def find_email(self, text): entities = [] multiples_regex_formats = ["([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*)"] @@ -87,7 +87,7 @@ def find_email(self, text): } entities.append(entity) return entities - + def drop_duplicates_and_included_entities(self, list_of_dicts): """ Drop duplicate entities and entities included in other entities from a list of dictionaries. diff --git a/hexanonyme/core/anonymizer/redact_anonymizer.py b/hexanonyme/core/anonymizer/redact_anonymizer.py index 4587732..830e9e2 100644 --- a/hexanonyme/core/anonymizer/redact_anonymizer.py +++ b/hexanonyme/core/anonymizer/redact_anonymizer.py @@ -24,6 +24,7 @@ def redact(self, text): """ #Get entities from all classifiers + entities_total = [] for [classifier, filtre] in self.classifier_filtres: # Get entities from both models entities_classifier = classifier(text) @@ -31,10 +32,10 @@ def redact(self, text): entities_classifier = self.merge_overlapping_entities(entities_classifier) entities_classifier = [entity for entity in entities_classifier if entity["entity_group"] in filtre] entities_total += entities_classifier - + entities_total += self.find_telephone_number(text) entities_total += self.find_email(text) - + entities = self.drop_duplicates_and_included_entities(entities_total) self.log_redactions = [] @@ -95,11 +96,10 @@ def deanonymize(self, redacted_text): entity_index = 0 words = redacted_text.split() - - for word in words: + for i, word in enumerate(words): if '[REDACTED]' in word and entity_index < len(self.log_redactions): - word = re.sub(r"\[REDACTED\]", (self.log_redactions[entity_index]['word']), word) + words[i] = re.sub(r"\[REDACTED\]", self.log_redactions[entity_index]['word'], word) entity_index += 1 reconstructed_sentence = ' '.join(words) - return reconstructed_sentence + return reconstructed_sentence \ No newline at end of file diff --git a/hexanonyme/core/anonymizer/replace_anonymizer.py b/hexanonyme/core/anonymizer/replace_anonymizer.py index bf5fad0..6c5d691 100644 --- a/hexanonyme/core/anonymizer/replace_anonymizer.py +++ b/hexanonyme/core/anonymizer/replace_anonymizer.py @@ -55,15 +55,15 @@ def replace(self, text): entities_classifier = self.merge_overlapping_entities(entities_classifier) entities_classifier = [entity for entity in entities_classifier if entity["entity_group"] in filtre] entities_total += entities_classifier - + entities_total += self.find_telephone_number(text) entities_total += self.find_email(text) - + tokens = self.drop_duplicates_and_included_entities(entities_total) for entity_type in self.entities: if entity_type in ["ADDRESS", "PER", "DATE", "LOC", "ORG", "MISC", "TEL", "MAIL"]: - text = self._replace_entities(text, token, entity_type) + text = self._replace_entities(text, tokens, entity_type) else: raise ValueError(f"Unsupported entity type: {entity_type}") @@ -108,7 +108,7 @@ def deanonymize(self, text): str: The deanonymized text with replaced values restored. """ for original_word, replacement in self.log_replacements: - text = text.replace(replacement, original_word, 1) + text = text.replace(replacement, original_word, 1) return text def _generate_random_loc(self): @@ -146,7 +146,7 @@ def _generate_random_org(self): str: A randomly generated company. """ return self.fake.company() - + def _generate_random_per(self): """ Generate a random name using the Faker library. @@ -158,7 +158,7 @@ def _generate_random_per(self): last_name = self.fake.last_name() full_name = f"{first_name} {last_name}" return full_name - + def _generate_random_misc(self): """ Generate a random misc using the Faker library. @@ -178,7 +178,7 @@ def _generate_random_tel(self): str: A randomly telephone number. """ return self.fake.phone_number() - + def _generate_random_mail(self): """ Generate a random email using the Faker library. @@ -202,4 +202,4 @@ def _get_faker_value(self, entity_type): function = getattr(self,"_generate_random_{}".format(entity_type.lower())) return function() else: - raise ValueError(f"Unsupported entity type: {entity_type}") + raise ValueError(f"Unsupported entity type: {entity_type}") \ No newline at end of file diff --git a/setup.py b/setup.py index 75ea538..0703b19 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='hexanonyme', - version='0.1.1', + version='0.1.2', description='A Python package for PII data anonymization', long_description = long_description, long_description_content_type = "text/markdown",