Skip to content

Commit

Permalink
Merge pull request #8 from BirdiD/update_code_and_readme
Browse files Browse the repository at this point in the history
Update code and readme and package version
  • Loading branch information
BirdiD authored Sep 12, 2023
2 parents e1d19f8 + 929b0d9 commit 12210f7
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 22 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ from hexanonyme import ReplaceAnonymizer, RedactAnonymizer
## ReplaceAnonymizer

The replace anonymizer can take the following arguments
- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS"]`.
- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC", "TEL", "MAIL"]`.
- faker (bool): Whether to use the Faker library for fake data generation (default: `True`). For instance, if the entities list is ["PER", "LOC"], fake names and cities will be generated and used to replace the entities in the original text.
- replacement_dict : Dictionary of replacement values for specific entity types. If the faker argument is set to `False`, you can supply a dictionary for your entity replacements. For instance, `{"PER" : "Jean Pierre", "LOC" : "Marseille"}` will replace all PER and LOC entities with **Jean Pierre** and **Marseille**, respectively.

Expand Down Expand Up @@ -69,7 +69,7 @@ restored_text = replace_anonymizer.deanonymize(anonymized_text)
## RedactAnonymizer

Contrary to the replace anonymizer, the redact anonymizer takes only one argument (the list of entities)
- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC"]`.
- entities : List of entity types to be anonymized. Default values are: `["PER", "LOC", "DATE", "ADDRESS", "ORG", "MISC", "TEL", "MAIL"]`.

```python
# Initialize ReplaceAnonymizer
Expand Down Expand Up @@ -103,6 +103,8 @@ Data anonymization is crucial for protecting individuals' privacy and complying
- ADDRESS (postal addresses)
- ORG (organisation)
- MISC (films, series)
- TEL (telephone number)
- MAIL (email address)

These NER models accurately identify PII entities in French text.

Expand Down
10 changes: 5 additions & 5 deletions hexanonyme/core/anonymizer/base_anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from transformers import pipeline

import re

class BaseAnonymizer:
def __init__(self):
Expand Down Expand Up @@ -29,7 +29,7 @@ def load_pipelines(self):

def merge_overlapping_entities(self, entities):
"""
Merge overlaps over one entity.
Merge overlaps over one entity.
In some cases a person name like "Cecile Da Costa." can be identified as two PER entities bacause of the formating of the text and what comes before
Args:
Expand All @@ -55,7 +55,7 @@ def merge_overlapping_entities(self, entities):
merged_entities.append(entity)
i = j
return merged_entities

def find_telephone_number(self, text):
entities = []
multiples_regex_formats = ["((?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2}))"]
Expand All @@ -71,7 +71,7 @@ def find_telephone_number(self, text):
}
entities.append(entity)
return entities

def find_email(self, text):
entities = []
multiples_regex_formats = ["([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*)"]
Expand All @@ -87,7 +87,7 @@ def find_email(self, text):
}
entities.append(entity)
return entities

def drop_duplicates_and_included_entities(self, list_of_dicts):
"""
Drop duplicate entities and entities included in other entities from a list of dictionaries.
Expand Down
12 changes: 6 additions & 6 deletions hexanonyme/core/anonymizer/redact_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,18 @@ def redact(self, text):
"""

#Get entities from all classifiers
entities_total = []
for [classifier, filtre] in self.classifier_filtres:
# Get entities from both models
entities_classifier = classifier(text)
# Merge overlapping entities
entities_classifier = self.merge_overlapping_entities(entities_classifier)
entities_classifier = [entity for entity in entities_classifier if entity["entity_group"] in filtre]
entities_total += entities_classifier

entities_total += self.find_telephone_number(text)
entities_total += self.find_email(text)

entities = self.drop_duplicates_and_included_entities(entities_total)

self.log_redactions = []
Expand Down Expand Up @@ -95,11 +96,10 @@ def deanonymize(self, redacted_text):
entity_index = 0

words = redacted_text.split()

for word in words:
for i, word in enumerate(words):
if '[REDACTED]' in word and entity_index < len(self.log_redactions):
word = re.sub(r"\[REDACTED\]", (self.log_redactions[entity_index]['word']), word)
words[i] = re.sub(r"\[REDACTED\]", self.log_redactions[entity_index]['word'], word)
entity_index += 1

reconstructed_sentence = ' '.join(words)
return reconstructed_sentence
return reconstructed_sentence
16 changes: 8 additions & 8 deletions hexanonyme/core/anonymizer/replace_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ def replace(self, text):
entities_classifier = self.merge_overlapping_entities(entities_classifier)
entities_classifier = [entity for entity in entities_classifier if entity["entity_group"] in filtre]
entities_total += entities_classifier

entities_total += self.find_telephone_number(text)
entities_total += self.find_email(text)

tokens = self.drop_duplicates_and_included_entities(entities_total)

for entity_type in self.entities:
if entity_type in ["ADDRESS", "PER", "DATE", "LOC", "ORG", "MISC", "TEL", "MAIL"]:
text = self._replace_entities(text, token, entity_type)
text = self._replace_entities(text, tokens, entity_type)
else:
raise ValueError(f"Unsupported entity type: {entity_type}")

Expand Down Expand Up @@ -108,7 +108,7 @@ def deanonymize(self, text):
str: The deanonymized text with replaced values restored.
"""
for original_word, replacement in self.log_replacements:
text = text.replace(replacement, original_word, 1)
text = text.replace(replacement, original_word, 1)
return text

def _generate_random_loc(self):
Expand Down Expand Up @@ -146,7 +146,7 @@ def _generate_random_org(self):
str: A randomly generated company.
"""
return self.fake.company()

def _generate_random_per(self):
"""
Generate a random name using the Faker library.
Expand All @@ -158,7 +158,7 @@ def _generate_random_per(self):
last_name = self.fake.last_name()
full_name = f"{first_name} {last_name}"
return full_name

def _generate_random_misc(self):
"""
Generate a random misc using the Faker library.
Expand All @@ -178,7 +178,7 @@ def _generate_random_tel(self):
str: A randomly telephone number.
"""
return self.fake.phone_number()

def _generate_random_mail(self):
"""
Generate a random email using the Faker library.
Expand All @@ -202,4 +202,4 @@ def _get_faker_value(self, entity_type):
function = getattr(self,"_generate_random_{}".format(entity_type.lower()))
return function()
else:
raise ValueError(f"Unsupported entity type: {entity_type}")
raise ValueError(f"Unsupported entity type: {entity_type}")
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='hexanonyme',
version='0.1.1',
version='0.1.2',
description='A Python package for PII data anonymization',
long_description = long_description,
long_description_content_type = "text/markdown",
Expand Down

0 comments on commit 12210f7

Please sign in to comment.