Skip to content

Commit

Permalink
Merge pull request #18 from eriknovak/feature/code-docs
Browse files Browse the repository at this point in the history
Add code documentation
  • Loading branch information
eriknovak authored Jun 18, 2024
2 parents 183a0b8 + bb48068 commit d0af713
Show file tree
Hide file tree
Showing 24 changed files with 1,127 additions and 43 deletions.
18 changes: 13 additions & 5 deletions anonipy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
"""
anonipy
=========
Provides
1. Label extractors
2. Label generators
3. Anonymization strategies
The anonipy package provides utilities for data anonymization.
Submodules
----------
anonymize :
The package containing anonymization classes and functions.
utils :
The package containing utility classes and functions.
definitions :
The object definitions used within the package.
constants :
The constant values used to help with data anonymization.
How to use the documentation
----------------------------
Expand Down
22 changes: 22 additions & 0 deletions anonipy/anonymize/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
"""
anonymize
The module provides a set of anonymization utilities.
Submodules
----------
extractors :
The module containing the extractor classes
generators :
The module containing the generator classes
strategies :
The module containing the strategy classes
regex :
The module containing the regex patterns
Methods
-------
anonymize()
"""

from . import extractors
from . import generators
from . import strategies
Expand Down
14 changes: 14 additions & 0 deletions anonipy/anonymize/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
"""
extractors
The module provides a set of extractors used in the library.
Classes
-------
ExtractorInterface :
The class representing the extractor interface
EntityExtractor :
The class representing the entity extractor
"""

from .interface import ExtractorInterface
from .entity_extractor import EntityExtractor

Expand Down
116 changes: 112 additions & 4 deletions anonipy/anonymize/extractors/entity_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,34 +16,111 @@


class EntityExtractor(ExtractorInterface):
"""The class representing the entity extractor
Attributes
----------
labels : List[dict]
The list of labels to extract
lang : LANGUAGES
The language of the text to extract
score_th : float
The score threshold
use_gpu : bool
Whether to use GPU
pipeline : spacy pipeline
The spacy pipeline
Methods
-------
__call__(self, text: str)
Extract the entities from the text
display(self, doc: Doc)
Display the entities in the text
"""

def __init__(
    self,
    labels: List[dict],
    lang: LANGUAGES = LANGUAGES.ENGLISH,
    score_th: float = 0.5,
    use_gpu: bool = False,
    *args,
    **kwargs,
):
    """Initialize the entity extractor.

    Parameters
    ----------
    labels : List[dict]
        The list of labels to extract
    lang : LANGUAGES
        The language of the text to extract. Default: LANGUAGES.ENGLISH
    score_th : float
        The score threshold. Entities with a score below this threshold
        will be ignored. Default: 0.5
    use_gpu : bool
        Whether to use GPU. Default: False
    *args, **kwargs
        Forwarded unchanged to the parent class constructor.

    """

    super().__init__(labels, *args, **kwargs)
    self.lang = lang
    self.score_th = score_th
    self.use_gpu = use_gpu
    # Keep the helpers after the plain attribute assignments: the private
    # preparation methods read attributes set above (e.g. self.lang).
    self.labels = self._prepare_labels(labels)
    self.pipeline = self._prepare_pipeline()

def __call__(self, text: str) -> Tuple[Doc, List[Entity]]:
def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]:
    """Run the pipeline on the text and collect the extracted entities.

    Parameters
    ----------
    text : str
        The text to extract entities from
    *args, **kwargs
        Accepted but not used by this method.

    Returns
    -------
    Tuple[Doc, List[Entity]]
        The spacy doc and the list of entities extracted

    """

    doc = self.pipeline(text)
    anoni_entities, spacy_entities = self._prepare_entities(doc)
    # overwrite the doc entities with the ones built by _prepare_entities
    doc.ents = spacy_entities
    return doc, anoni_entities

def display(self, doc: Doc):
    """Render the entities of a spacy doc with displacy.

    Parameters
    ----------
    doc : Doc
        The spacy doc to display

    """

    highlight = "#5C7AEA"
    # every configured label is rendered with the same highlight color
    label_colors = {entry["label"]: highlight for entry in self.labels}
    displacy.render(doc, style="ent", options={"colors": label_colors})

# ===========================================
# Private methods
# ===========================================

def _prepare_labels(self, labels):
def _prepare_labels(self, labels: List[dict]) -> List[dict]:
"""Prepare the labels for the extractor
Parameters
----------
labels : List[dict]
The list of labels to prepare
Returns
-------
List[dict]
The prepared labels
"""
for l in labels:
if "regex" in l:
continue
Expand All @@ -53,6 +130,15 @@ def _prepare_labels(self, labels):
return labels

def _create_gliner_config(self):
"""Create the config for the GLINER model
Returns
-------
dict
The config for the GLINER model
"""

map_location = "cpu"
if self.use_gpu and not torch.cuda.is_available():
return warnings.warn(
Expand All @@ -72,6 +158,15 @@ def _create_gliner_config(self):
}

def _prepare_pipeline(self):
"""Prepare the spacy pipeline
Returns
-------
spacy pipeline
The spacy pipeline
"""

# load the appropriate parser for the language
module_lang, class_lang = self.lang[0].lower(), self.lang[1].lower().title()
language_module = importlib.import_module(f"spacy.lang.{module_lang}")
Expand All @@ -83,8 +178,21 @@ def _prepare_pipeline(self):
nlp.add_pipe("gliner_spacy", config=gliner_config)
return nlp

def _prepare_entities(self, doc):
# prepares the anonymized and spacy entities
def _prepare_entities(self, doc: Doc):
"""Prepares the anonipy and spacy entities
Parameters
----------
doc : Doc
The spacy doc to prepare
Returns
-------
Tuple[List[Entity], List[Entity]]
The anonipy entities and the spacy entities
"""

# TODO: make this part more generic
anoni_entities = []
Expand Down
1 change: 1 addition & 0 deletions anonipy/anonymize/extractors/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@


class ExtractorInterface:
"""The class representing the extractor interface"""

def __init__(self, labels: List[dict], *args, **kwargs):
pass
Expand Down
20 changes: 20 additions & 0 deletions anonipy/anonymize/generators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
"""
generators
The module provides a set of generators used in the library.
Classes
-------
GeneratorInterface :
The class representing the generator interface
LLMLabelGenerator :
The class representing the LLM label generator
MaskLabelGenerator :
The class representing the mask label generator
NumberGenerator :
The class representing the number generator
DateGenerator :
The class representing the date generator
"""

from .interface import GeneratorInterface
from .llm_label_generator import LLMLabelGenerator
from .mask_label_generator import MaskLabelGenerator
Expand Down
Loading

0 comments on commit d0af713

Please sign in to comment.