Merge pull request #8 from BirdiD/update_code_and_readme

Update code and readme and package version
BirdiD · Sep 12, 2023 · 12210f7 · 12210f7
2 parents e1d19f8 + 929b0d9
commit 12210f7
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 22 deletions.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ from hexanonyme import ReplaceAnonymizer, RedactAnonymizer
 ## ReplaceAnonymizer
 
 The replace anonymizer can take the following arguments 
-- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS"]`.
+- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC", "TEL", "MAIL"]`.
 - faker (bool): Whether to use the Faker library for fake data generation (default: `True`). For instance, if the entities list is ["PER", "LOC"], fake names and cities will be generated and used to replace the entities in the original text.
 - replacement_dict : Dictionary of replacement values for specific entity types. If the faker argument is set to `False`, you can supply a dictionary for your entity replacements. For instance, `{"PER" : "Jean Pierre", "LOC" : "Marseille"}` will replace all PER and LOC entities with **Jean Pierre** and **Marseille**, respectively.
 
@@ -69,7 +69,7 @@ restored_text = replace_anonymizer.deanonymize(anonymized_text)
 ## RedactAnonymizer
 
 Contrary to the replace anonymizer, the redact anonymizer takes only one argument (the list of entities) 
-- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC"]`.
+- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC", "TEL", "MAIL"]`.
 
 ```python
 # Initialize ReplaceAnonymizer
@@ -103,6 +103,8 @@ Data anonymization is crucial for protecting individuals' privacy and complying
 - ADDRESS (postal addresses)
 - ORG (organisation)
 - MISC (films, series)
+- TEL (telephone number)
+- MAIL (email address)
 
 These NER models accurately identify PII entities in French text.
 

diff --git a/hexanonyme/core/anonymizer/base_anonymizer.py b/hexanonyme/core/anonymizer/base_anonymizer.py
@@ -1,5 +1,5 @@
 from transformers import pipeline
-
+import re
 
 class BaseAnonymizer:
     def __init__(self):
@@ -29,7 +29,7 @@ def load_pipelines(self):
 
     def merge_overlapping_entities(self, entities):
         """
-        Merge overlaps over one entity. 
+        Merge overlaps over one entity.
         In some cases a person name like "Cecile Da Costa." can be identified as two PER entities bacause of the formating of the text and what comes before
 
         Args:
@@ -55,7 +55,7 @@ def merge_overlapping_entities(self, entities):
             merged_entities.append(entity)
             i = j
         return merged_entities
-        
+
     def find_telephone_number(self, text):
         entities = []
         multiples_regex_formats = ["((?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2}))"]
@@ -71,7 +71,7 @@ def find_telephone_number(self, text):
                 }
                 entities.append(entity)
         return entities
-    
+
     def find_email(self, text):
         entities = []
         multiples_regex_formats = ["([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*)"]
@@ -87,7 +87,7 @@ def find_email(self, text):
                 }
                 entities.append(entity)
         return entities
-    
+
     def drop_duplicates_and_included_entities(self, list_of_dicts):
         """
         Drop duplicate entities and entities included in other entities from a list of dictionaries.

diff --git a/hexanonyme/core/anonymizer/redact_anonymizer.py b/hexanonyme/core/anonymizer/redact_anonymizer.py
@@ -24,17 +24,18 @@ def redact(self, text):
         """
 
         #Get entities from all classifiers
+        entities_total = []
         for [classifier, filtre] in self.classifier_filtres:
             # Get entities from both models
             entities_classifier = classifier(text)
             # Merge overlapping entities
             entities_classifier = self.merge_overlapping_entities(entities_classifier)
             entities_classifier = [entity for entity in entities_classifier if entity["entity_group"] in filtre]
             entities_total += entities_classifier
-            
+
         entities_total += self.find_telephone_number(text)
         entities_total += self.find_email(text)
-        
+
         entities = self.drop_duplicates_and_included_entities(entities_total)
 
         self.log_redactions = []
@@ -95,11 +96,10 @@ def deanonymize(self, redacted_text):
         entity_index = 0
 
         words = redacted_text.split()
-
-        for word in words:
+        for i, word in enumerate(words):
             if '[REDACTED]' in word and entity_index < len(self.log_redactions):
-                word = re.sub(r"\[REDACTED\]", (self.log_redactions[entity_index]['word']), word)
+                words[i] = re.sub(r"\[REDACTED\]", self.log_redactions[entity_index]['word'], word)
                 entity_index += 1
 
         reconstructed_sentence = ' '.join(words)
-        return reconstructed_sentence
+        return reconstructed_sentence
diff --git a/hexanonyme/core/anonymizer/replace_anonymizer.py b/hexanonyme/core/anonymizer/replace_anonymizer.py
@@ -55,15 +55,15 @@ def replace(self, text):
             entities_classifier = self.merge_overlapping_entities(entities_classifier)
             entities_classifier = [entity for entity in entities_classifier if entity["entity_group"] in filtre]
             entities_total += entities_classifier
-        
+
         entities_total += self.find_telephone_number(text)
         entities_total += self.find_email(text)
-        
+
         tokens = self.drop_duplicates_and_included_entities(entities_total)
 
         for entity_type in self.entities:
           if entity_type in ["ADDRESS", "PER", "DATE", "LOC", "ORG", "MISC", "TEL", "MAIL"]:
-            text = self._replace_entities(text, token, entity_type)
+            text = self._replace_entities(text, tokens, entity_type)
           else:
             raise ValueError(f"Unsupported entity type: {entity_type}")
 
@@ -108,7 +108,7 @@ def deanonymize(self, text):
             str: The deanonymized text with replaced values restored.
         """
         for original_word, replacement in self.log_replacements:
-            text = text.replace(replacement, original_word, 1) 
+            text = text.replace(replacement, original_word, 1)
         return text
 
     def _generate_random_loc(self):
@@ -146,7 +146,7 @@ def _generate_random_org(self):
             str: A randomly generated company.
         """
         return self.fake.company()
-    
+
     def _generate_random_per(self):
         """
         Generate a random name using the Faker library.
@@ -158,7 +158,7 @@ def _generate_random_per(self):
         last_name = self.fake.last_name()
         full_name = f"{first_name} {last_name}"
         return full_name
-    
+
     def _generate_random_misc(self):
         """
         Generate a random misc using the Faker library.
@@ -178,7 +178,7 @@ def _generate_random_tel(self):
             str: A randomly telephone number.
         """
         return self.fake.phone_number()
-    
+
     def _generate_random_mail(self):
         """
         Generate a random email using the Faker library.
@@ -202,4 +202,4 @@ def _get_faker_value(self, entity_type):
             function = getattr(self,"_generate_random_{}".format(entity_type.lower()))
             return function()
         else:
-            raise ValueError(f"Unsupported entity type: {entity_type}")
+            raise ValueError(f"Unsupported entity type: {entity_type}")
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='hexanonyme', 
-    version='0.1.1',  
+    version='0.1.2',  
     description='A Python package for PII data anonymization',
     long_description = long_description,
     long_description_content_type = "text/markdown",