Commit 838fe05

Merge pull request #11 from eriknovak/feature/gliner

Include new GLiNER and GLiNER-Spacy features to the package

eriknovak authored May 30, 2024
2 parents 1fa1c97 + 239c5ba commit 838fe05
Showing 13 changed files with 166 additions and 48 deletions.
13 changes: 13 additions & 0 deletions anonipy/anonymize/extractors/entity_extractor.py
@@ -1,7 +1,9 @@
import re
import importlib
from typing import List, Tuple
import warnings

import torch
from spacy import displacy
from spacy.tokens import Doc
from gliner_spacy.pipeline import GlinerSpacy
@@ -21,9 +23,11 @@ def __init__(
labels: List[dict],
lang: LANGUAGES = LANGUAGES.ENGLISH,
score_th=0.5,
use_gpu=False,
):
self.lang = lang
self.score_th = score_th
self.use_gpu = use_gpu
self.labels = self._prepare_labels(labels)
self.pipeline = self._prepare_pipeline()

@@ -50,13 +54,22 @@ def _prepare_labels(self, labels):
return labels

def _create_gliner_config(self):
map_location = "cpu"
if self.use_gpu and not torch.cuda.is_available():
    warnings.warn(
        "The user requested GPU use, but no available GPU was found. Falling back to CPU."
    )
if self.use_gpu and torch.cuda.is_available():
    map_location = "cuda"

return {
# the model is specialized for extracting PII data
"gliner_model": "urchade/gliner_multi_pii-v1",
"labels": [l["label"] for l in self.labels],
"threshold": self.score_th,
"chunk_size": 384,
"style": "ent",
"map_location": map_location,
}

def _prepare_pipeline(self):
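For context, a minimal usage sketch of the new `use_gpu` option (the import paths and the label schema are assumptions based on the package layout shown in this diff, not part of the change itself):

from anonipy.anonymize.extractors import EntityExtractor
from anonipy.constants import LANGUAGES

# illustrative labels; each entry names a label and the entity type it maps to
labels = [
    {"label": "name", "type": "string"},
    {"label": "social security number", "type": "custom", "regex": "[0-9]{3}-[0-9]{2}-[0-9]{4}"},
]

# use_gpu=True loads the GLiNER model on CUDA when available; otherwise the
# extractor warns and falls back to CPU (see _create_gliner_config above)
extractor = EntityExtractor(labels, lang=LANGUAGES.ENGLISH, score_th=0.5, use_gpu=True)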
10 changes: 8 additions & 2 deletions anonipy/anonymize/helpers.py
@@ -5,7 +5,13 @@
# =====================================


def convert_spacy_to_entity(entity, type, regex=".*", *args, **kwargs):
def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs):
return Entity(
entity.text, entity.label_, entity.start_char, entity.end_char, type, regex
entity.text,
entity.label_,
entity.start_char,
entity.end_char,
entity._.score,
type,
regex,
)
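A small illustration of the updated helper (a sketch: `doc` is assumed to come from a spaCy pipeline with the gliner_spacy component added, which attaches the prediction confidence to each span as `ent._.score`):

from anonipy.anonymize.helpers import convert_spacy_to_entity

# each GLiNER span carries its confidence in ent._.score; the helper now
# forwards that value into the Entity's new score field
entities = [convert_spacy_to_entity(ent, type="string") for ent in doc.ents]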
4 changes: 2 additions & 2 deletions anonipy/anonymize/strategies/pseudonymization.py
@@ -21,14 +21,14 @@ def __init__(self, mapping, *args, **kwargs):
def anonymize(self, text: str, entities: List[Entity], *args, **kwargs):
replacements = []
for ent in entities[::-1]:
r = self._create_replacement(text, ent, replacements)
r = self._create_replacement(ent, text, replacements)
text = (
text[: r["start_index"]] + r["anonymized_text"] + text[r["end_index"] :]
)
replacements.append(r)
return text, replacements[::-1]

def _create_replacement(self, text: str, entity: Entity, replacements: List[dict]):
def _create_replacement(self, entity: Entity, text: str, replacements: List[dict]):
# check if the replacement already exists
anonymized_text = self._check_replacement(entity, replacements)
# create a new replacement if it doesn't exist
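For reference, a usage sketch of the strategy above (the class name and the mapping signature are assumptions inferred from this diff; in practice a generator-backed mapping would be used instead of the placeholder lambda):

from anonipy.anonymize.strategies import PseudonymizationStrategy

# entities are processed right-to-left (entities[::-1]) so that earlier
# replacements do not shift the start/end indices of the remaining ones
strategy = PseudonymizationStrategy(mapping=lambda text, entity: f"[{entity.label}]")

# text and entities are assumed to come from the entity extraction step
anonymized_text, replacements = strategy.anonymize(text, entities)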
1 change: 1 addition & 0 deletions anonipy/definitions.py
@@ -19,5 +19,6 @@ class Entity:
label: str
start_index: int
end_index: int
score: float = 1.0
type: ENTITY_TYPES = None
regex: Union[str, re.Pattern] = ".*"
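A brief example of constructing the updated dataclass directly (values are illustrative; the default score=1.0 keeps manually built entities, e.g. from regex extractors, at full confidence):

from anonipy.definitions import Entity

# score defaults to 1.0, so hand-built or regex-matched entities are treated
# as fully confident; model-extracted entities carry the GLiNER score instead
ent = Entity(
    text="123-45-6789",
    label="social security number",
    start_index=121,
    end_index=132,
    type="custom",
    regex="[0-9]{3}-[0-9]{2}-[0-9]{4}",
)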
38 changes: 35 additions & 3 deletions docs/documentation/notebooks/00-overview.ipynb
@@ -218,6 +218,38 @@
"entity_extractor.display(doc)"
]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The extracted entities and their metadata are available in the `entities` variable:"
 ]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Entity(text='John Doe', label='name', start_index=30, end_index=38, score=0.9961156845092773, type='string', regex='.*'),\n",
" Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, score=0.9937193393707275, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=0.9867385625839233, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, score=0.9993416666984558, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
" Entity(text='John Doe', label='name', start_index=157, end_index=165, score=0.994924783706665, type='string', regex='.*'),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=0.8285622596740723, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"entities"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -265,7 +297,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7518a99a76554628b656baedb345d89c",
"model_id": "430ac015084e46d3a81281c330a56686",
"version_major": 2,
"version_minor": 0
},
@@ -360,7 +392,7 @@
"Patient Name: Ethan Thompson\n",
"Date of Birth: 01-07-1985\n",
"Date of Examination: 15-05-2024\n",
"Social Security Number: 635-28-4553\n",
"Social Security Number: 119-88-7014\n",
"\n",
"Examination Procedure:\n",
"Ethan Thompson underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n",
@@ -413,7 +445,7 @@
" 'label': 'social security number',\n",
" 'start_index': 121,\n",
" 'end_index': 132,\n",
" 'anonymized_text': '635-28-4553'},\n",
" 'anonymized_text': '119-88-7014'},\n",
" {'original_text': 'John Doe',\n",
" 'label': 'name',\n",
" 'start_index': 157,\n",
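As a small downstream illustration of the new `score` field shown in the output above (a sketch; the 0.9 threshold is arbitrary):

# keep only the entities the model is highly confident about
confident_entities = [e for e in entities if e.score >= 0.9]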
38 changes: 24 additions & 14 deletions docs/documentation/notebooks/01-extractors.ipynb
@@ -218,6 +218,7 @@
"- `labels`: A list of dictionaries containing the labels to be extracted. \n",
"- `lang`: The language of the text to be anonymized. Defaults to `LANGUAGES.ENGLISH`.\n",
"- `score_th`: The score threshold used to filter the extracted entities, i.e. an entity must have a score greater than `score_th` to be kept. Defaults to 0.5.\n",
"- `use_gpu`: Whether to use the GPU. Defaults to `False`.\n",
"\n",
"We must now define the labels to be extracted. In this example, we will extract the person names, the dates, and the social security numbers from the text."
]
@@ -356,6 +357,7 @@
"- `label`: The label of the entity.\n",
"- `start_index`: The start index of the entity in the text.\n",
"- `end_index`: The end index of the entity in the text.\n",
"- `score`: The score of the entity. It shows how certain the model is that the entity is relevant.\n",
"- `type`: The type of the entity (taken from the defined `labels` variable list).\n",
"- `regex`: The regular expression the entity must match."
]
@@ -368,12 +370,12 @@
{
"data": {
"text/plain": [
"[Entity(text='John Doe', label='name', start_index=30, end_index=38, type='string', regex='.*'),\n",
" Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
" Entity(text='John Doe', label='name', start_index=157, end_index=165, type='string', regex='.*'),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
"[Entity(text='John Doe', label='name', start_index=30, end_index=38, score=0.9961156845092773, type='string', regex='.*'),\n",
" Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, score=0.9937193393707275, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=0.9867385625839233, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, score=0.9993416666984558, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
" Entity(text='John Doe', label='name', start_index=157, end_index=165, score=0.994924783706665, type='string', regex='.*'),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=0.8285622596740723, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
]
},
"execution_count": 12,
@@ -602,7 +604,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -626,6 +628,7 @@
" label=\"date\",\n",
" start_index=match.start(),\n",
" end_index=match.end(),\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=self.regex_pattern,\n",
" )\n",
@@ -635,7 +638,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -652,26 +655,33 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Entity(text='15-01-1985', label='date', start_index=54, end_index=64, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='23-45-6789', label='date', start_index=122, end_index=132, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}'))]"
"[Entity(text='15-01-1985', label='date', start_index=54, end_index=64, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='23-45-6789', label='date', start_index=122, end_index=132, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}'))]"
]
},
"execution_count": 20,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"entities"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
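For completeness, a self-contained variant of the regex-based extractor edited in this notebook (the class name and return convention are simplified assumptions; only the changed lines appear in the diff above):

import re
from anonipy.definitions import Entity

class DateExtractor:
    """Illustrative custom extractor; regex matches get a fixed score of 1.0."""

    def __init__(self):
        self.regex_pattern = re.compile(r"\d{1,2}-\d{1,2}-\d{2,4}")

    def __call__(self, text: str):
        entities = []
        for match in self.regex_pattern.finditer(text):
            entities.append(
                Entity(
                    text=match.group(),
                    label="date",
                    start_index=match.start(),
                    end_index=match.end(),
                    score=1.0,  # regex matches are treated as fully confident
                    type="date",
                    regex=self.regex_pattern,
                )
            )
        return entities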
24 changes: 15 additions & 9 deletions docs/documentation/notebooks/02-generators.ipynb
@@ -92,6 +92,7 @@
" label=\"name\",\n",
" start_index=30,\n",
" end_index=38,\n",
" score=1.0,\n",
" type=\"string\",\n",
" regex=\".*\",\n",
" ),\n",
@@ -100,6 +101,7 @@
" label=\"date of birth\",\n",
" start_index=54,\n",
" end_index=64,\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n",
" ),\n",
@@ -108,6 +110,7 @@
" label=\"date\",\n",
" start_index=86,\n",
" end_index=96,\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n",
" ),\n",
@@ -116,6 +119,7 @@
" label=\"social security number\",\n",
" start_index=121,\n",
" end_index=132,\n",
" score=1.0,\n",
" type=\"custom\",\n",
" regex=\"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
" ),\n",
@@ -124,6 +128,7 @@
" label=\"name\",\n",
" start_index=157,\n",
" end_index=165,\n",
" score=1.0,\n",
" type=\"string\",\n",
" regex=\".*\",\n",
" ),\n",
@@ -132,6 +137,7 @@
" label=\"date\",\n",
" start_index=717,\n",
" end_index=727,\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n",
" ),\n",
@@ -188,7 +194,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "388fbae393fc44f6b9ad3ab1bea29afc",
"model_id": "9a6dfc4a4fd74bcc8351b6b01755f18a",
"version_major": 2,
"version_minor": 0
},
@@ -425,7 +431,7 @@
{
"data": {
"text/plain": [
"'Williams James'"
"'James Smith'"
]
},
"execution_count": 9,
@@ -445,7 +451,7 @@
{
"data": {
"text/plain": [
"'David Thomas'"
"'Michael Smith'"
]
},
"execution_count": 10,
@@ -465,7 +471,7 @@
{
"data": {
"text/plain": [
"'David Thomas'"
"'David Smith'"
]
},
"execution_count": 11,
@@ -486,11 +492,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"John Doe | name | Michael Smith\n",
"John Doe | name | Thomas David\n",
"15-01-1985 | date of birth | None\n",
"20-05-2024 | date | None\n",
"123-45-6789 | social security number | None\n",
"John Doe | name | Professor first\n",
"John Doe | name | Officer first\n",
"15-11-2024 | date | None\n"
]
}
@@ -579,7 +585,7 @@
{
"data": {
"text/plain": [
"'946-49-5488'"
"'143-46-4915'"
]
},
"execution_count": 14,
@@ -706,7 +712,7 @@
{
"data": {
"text/plain": [
"'01-05-2024'"
"'26-05-2024'"
]
},
"execution_count": 18,
@@ -873,7 +879,7 @@
{
"data": {
"text/plain": [
"'😢'"
"'😄'"
]
},
"execution_count": 25,
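For orientation, a sketch of how outputs like the ones above are typically produced (the generator class names, import path, and the generate(entity) signature are assumptions based on the package documentation rather than this diff):

from anonipy.anonymize.generators import NumberGenerator, DateGenerator

number_generator = NumberGenerator()
date_generator = DateGenerator()

# ssn_entity and date_entity are assumed to be Entity objects from the extractor
new_ssn = number_generator.generate(ssn_entity)    # e.g. '143-46-4915'
new_date = date_generator.generate(date_entity)    # e.g. '26-05-2024'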