Commit 838fe05

Merge pull request #11 from eriknovak/feature/gliner

Include new GLiNER and GLiNER-Spacy features to the package

eriknovak authored May 30, 2024
2 parents 1fa1c97 + 239c5ba commit 838fe05
Showing 13 changed files with 166 additions and 48 deletions.
13 changes: 13 additions & 0 deletions anonipy/anonymize/extractors/entity_extractor.py
@@ -1,7 +1,9 @@
import re
import importlib
from typing import List, Tuple
import warnings

import torch
from spacy import displacy
from spacy.tokens import Doc
from gliner_spacy.pipeline import GlinerSpacy
@@ -21,9 +23,11 @@ def __init__(
labels: List[dict],
lang: LANGUAGES = LANGUAGES.ENGLISH,
score_th=0.5,
use_gpu=False,
):
self.lang = lang
self.score_th = score_th
self.use_gpu = use_gpu
self.labels = self._prepare_labels(labels)
self.pipeline = self._prepare_pipeline()

@@ -50,13 +54,22 @@ def _prepare_labels(self, labels):
return labels

def _create_gliner_config(self):
map_location = "cpu"
if self.use_gpu and not torch.cuda.is_available():
    warnings.warn(
        "The user requested GPU use, but no available GPU was found. Falling back to CPU."
    )
if self.use_gpu and torch.cuda.is_available():
    map_location = "cuda"

return {
# the model is specialized for extracting PII data
"gliner_model": "urchade/gliner_multi_pii-v1",
"labels": [l["label"] for l in self.labels],
"threshold": self.score_th,
"chunk_size": 384,
"style": "ent",
"map_location": map_location,
}

def _prepare_pipeline(self):
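For context, a minimal usage sketch of the new `use_gpu` option (the import paths and the label schema are assumptions based on the package layout shown in this diff, not part of the change itself):

from anonipy.anonymize.extractors import EntityExtractor
from anonipy.constants import LANGUAGES

# illustrative labels; each entry names a label and the entity type it maps to
labels = [
    {"label": "name", "type": "string"},
    {"label": "social security number", "type": "custom", "regex": "[0-9]{3}-[0-9]{2}-[0-9]{4}"},
]

# use_gpu=True loads the GLiNER model on CUDA when available; otherwise the
# extractor warns and falls back to CPU (see _create_gliner_config above)
extractor = EntityExtractor(labels, lang=LANGUAGES.ENGLISH, score_th=0.5, use_gpu=True)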
10 changes: 8 additions & 2 deletions anonipy/anonymize/helpers.py
@@ -5,7 +5,13 @@
# =====================================


def convert_spacy_to_entity(entity, type, regex=".*", *args, **kwargs):
def convert_spacy_to_entity(entity, type=None, regex=".*", *args, **kwargs):
return Entity(
entity.text, entity.label_, entity.start_char, entity.end_char, type, regex
entity.text,
entity.label_,
entity.start_char,
entity.end_char,
entity._.score,
type,
regex,
)
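A small illustration of the updated helper (a sketch: `doc` is assumed to come from a spaCy pipeline with the gliner_spacy component added, which attaches the prediction confidence to each span as `ent._.score`):

from anonipy.anonymize.helpers import convert_spacy_to_entity

# each GLiNER span carries its confidence in ent._.score; the helper now
# forwards that value into the Entity's new score field
entities = [convert_spacy_to_entity(ent, type="string") for ent in doc.ents]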
4 changes: 2 additions & 2 deletions anonipy/anonymize/strategies/pseudonymization.py
@@ -21,14 +21,14 @@ def __init__(self, mapping, *args, **kwargs):
def anonymize(self, text: str, entities: List[Entity], *args, **kwargs):
replacements = []
for ent in entities[::-1]:
r = self._create_replacement(text, ent, replacements)
r = self._create_replacement(ent, text, replacements)
text = (
text[: r["start_index"]] + r["anonymized_text"] + text[r["end_index"] :]
)
replacements.append(r)
return text, replacements[::-1]

def _create_replacement(self, text: str, entity: Entity, replacements: List[dict]):
def _create_replacement(self, entity: Entity, text: str, replacements: List[dict]):
# check if the replacement already exists
anonymized_text = self._check_replacement(entity, replacements)
# create a new replacement if it doesn't exist
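For reference, a usage sketch of the strategy above (the class name and the mapping signature are assumptions inferred from this diff; in practice a generator-backed mapping would be used instead of the placeholder lambda):

from anonipy.anonymize.strategies import PseudonymizationStrategy

# entities are processed right-to-left (entities[::-1]) so that earlier
# replacements do not shift the start/end indices of the remaining ones
strategy = PseudonymizationStrategy(mapping=lambda text, entity: f"[{entity.label}]")

# text and entities are assumed to come from the entity extraction step
anonymized_text, replacements = strategy.anonymize(text, entities)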
1 change: 1 addition & 0 deletions anonipy/definitions.py
@@ -19,5 +19,6 @@ class Entity:
label: str
start_index: int
end_index: int
score: float = 1.0
type: ENTITY_TYPES = None
regex: Union[str, re.Pattern] = ".*"
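A brief example of constructing the updated dataclass directly (values are illustrative; the default score=1.0 keeps manually built entities, e.g. from regex extractors, at full confidence):

from anonipy.definitions import Entity

# score defaults to 1.0, so hand-built or regex-matched entities are treated
# as fully confident; model-extracted entities carry the GLiNER score instead
ent = Entity(
    text="123-45-6789",
    label="social security number",
    start_index=121,
    end_index=132,
    type="custom",
    regex="[0-9]{3}-[0-9]{2}-[0-9]{4}",
)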
38 changes: 35 additions & 3 deletions docs/documentation/notebooks/00-overview.ipynb
@@ -218,6 +218,38 @@
"entity_extractor.display(doc)"
]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The extracted entities and their metadata are available in the `entities` variable:"
 ]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Entity(text='John Doe', label='name', start_index=30, end_index=38, score=0.9961156845092773, type='string', regex='.*'),\n",
" Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, score=0.9937193393707275, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=0.9867385625839233, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, score=0.9993416666984558, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
" Entity(text='John Doe', label='name', start_index=157, end_index=165, score=0.994924783706665, type='string', regex='.*'),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=0.8285622596740723, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"entities"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -265,7 +297,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7518a99a76554628b656baedb345d89c",
"model_id": "430ac015084e46d3a81281c330a56686",
"version_major": 2,
"version_minor": 0
},
@@ -360,7 +392,7 @@
"Patient Name: Ethan Thompson\n",
"Date of Birth: 01-07-1985\n",
"Date of Examination: 15-05-2024\n",
"Social Security Number: 635-28-4553\n",
"Social Security Number: 119-88-7014\n",
"\n",
"Examination Procedure:\n",
"Ethan Thompson underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n",
@@ -413,7 +445,7 @@
" 'label': 'social security number',\n",
" 'start_index': 121,\n",
" 'end_index': 132,\n",
" 'anonymized_text': '635-28-4553'},\n",
" 'anonymized_text': '119-88-7014'},\n",
" {'original_text': 'John Doe',\n",
" 'label': 'name',\n",
" 'start_index': 157,\n",
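As a small downstream illustration of the new `score` field shown in the output above (a sketch; the 0.9 threshold is arbitrary):

# keep only the entities the model is highly confident about
confident_entities = [e for e in entities if e.score >= 0.9]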
38 changes: 24 additions & 14 deletions docs/documentation/notebooks/01-extractors.ipynb
@@ -218,6 +218,7 @@
"- `labels`: A list of dictionaries containing the labels to be extracted. \n",
"- `lang`: The language of the text to be anonymized. Defaults to `LANGUAGES.ENGLISH`.\n",
"- `score_th`: The score threshold used to filter the extracted entities, i.e. an entity must have a score greater than `score_th` to be kept. Defaults to 0.5.\n",
"- `use_gpu`: Whether to use the GPU. Defaults to `False`.\n",
"\n",
"We must now define the labels to be extracted. In this example, we will extract the person names, the dates, and the social security numbers from the text."
]
@@ -356,6 +357,7 @@
"- `label`: The label of the entity.\n",
"- `start_index`: The start index of the entity in the text.\n",
"- `end_index`: The end index of the entity in the text.\n",
"- `score`: The score of the entity. It shows how certain the model is that the entity is relevant.\n",
"- `type`: The type of the entity (taken from the defined `labels` variable list).\n",
"- `regex`: The regular expression the entity must match."
]
@@ -368,12 +370,12 @@
{
"data": {
"text/plain": [
"[Entity(text='John Doe', label='name', start_index=30, end_index=38, type='string', regex='.*'),\n",
" Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
" Entity(text='John Doe', label='name', start_index=157, end_index=165, type='string', regex='.*'),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
"[Entity(text='John Doe', label='name', start_index=30, end_index=38, score=0.9961156845092773, type='string', regex='.*'),\n",
" Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, score=0.9937193393707275, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=0.9867385625839233, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, score=0.9993416666984558, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
" Entity(text='John Doe', label='name', start_index=157, end_index=165, score=0.994924783706665, type='string', regex='.*'),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=0.8285622596740723, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
]
},
"execution_count": 12,
@@ -602,7 +604,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -626,6 +628,7 @@
" label=\"date\",\n",
" start_index=match.start(),\n",
" end_index=match.end(),\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=self.regex_pattern,\n",
" )\n",
@@ -635,7 +638,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -652,26 +655,33 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Entity(text='15-01-1985', label='date', start_index=54, end_index=64, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='23-45-6789', label='date', start_index=122, end_index=132, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}'))]"
"[Entity(text='15-01-1985', label='date', start_index=54, end_index=64, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='23-45-6789', label='date', start_index=122, end_index=132, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}')),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, score=1.0, type='date', regex=re.compile('\\\\d{1,2}-\\\\d{1,2}-\\\\d{2,4}'))]"
]
},
"execution_count": 20,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"entities"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
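For completeness, a self-contained variant of the regex-based extractor edited in this notebook (the class name and return convention are simplified assumptions; only the changed lines appear in the diff above):

import re
from anonipy.definitions import Entity

class DateExtractor:
    """Illustrative custom extractor; regex matches get a fixed score of 1.0."""

    def __init__(self):
        self.regex_pattern = re.compile(r"\d{1,2}-\d{1,2}-\d{2,4}")

    def __call__(self, text: str):
        entities = []
        for match in self.regex_pattern.finditer(text):
            entities.append(
                Entity(
                    text=match.group(),
                    label="date",
                    start_index=match.start(),
                    end_index=match.end(),
                    score=1.0,  # regex matches are treated as fully confident
                    type="date",
                    regex=self.regex_pattern,
                )
            )
        return entities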
24 changes: 15 additions & 9 deletions docs/documentation/notebooks/02-generators.ipynb
@@ -92,6 +92,7 @@
" label=\"name\",\n",
" start_index=30,\n",
" end_index=38,\n",
" score=1.0,\n",
" type=\"string\",\n",
" regex=\".*\",\n",
" ),\n",
@@ -100,6 +101,7 @@
" label=\"date of birth\",\n",
" start_index=54,\n",
" end_index=64,\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n",
" ),\n",
@@ -108,6 +110,7 @@
" label=\"date\",\n",
" start_index=86,\n",
" end_index=96,\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n",
" ),\n",
@@ -116,6 +119,7 @@
" label=\"social security number\",\n",
" start_index=121,\n",
" end_index=132,\n",
" score=1.0,\n",
" type=\"custom\",\n",
" regex=\"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
" ),\n",
@@ -124,6 +128,7 @@
" label=\"name\",\n",
" start_index=157,\n",
" end_index=165,\n",
" score=1.0,\n",
" type=\"string\",\n",
" regex=\".*\",\n",
" ),\n",
@@ -132,6 +137,7 @@
" label=\"date\",\n",
" start_index=717,\n",
" end_index=727,\n",
" score=1.0,\n",
" type=\"date\",\n",
" regex=\"(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})\",\n",
" ),\n",
@@ -188,7 +194,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "388fbae393fc44f6b9ad3ab1bea29afc",
"model_id": "9a6dfc4a4fd74bcc8351b6b01755f18a",
"version_major": 2,
"version_minor": 0
},
@@ -425,7 +431,7 @@
{
"data": {
"text/plain": [
"'Williams James'"
"'James Smith'"
]
},
"execution_count": 9,
@@ -445,7 +451,7 @@
{
"data": {
"text/plain": [
"'David Thomas'"
"'Michael Smith'"
]
},
"execution_count": 10,
@@ -465,7 +471,7 @@
{
"data": {
"text/plain": [
"'David Thomas'"
"'David Smith'"
]
},
"execution_count": 11,
@@ -486,11 +492,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"John Doe | name | Michael Smith\n",
"John Doe | name | Thomas David\n",
"15-01-1985 | date of birth | None\n",
"20-05-2024 | date | None\n",
"123-45-6789 | social security number | None\n",
"John Doe | name | Professor first\n",
"John Doe | name | Officer first\n",
"15-11-2024 | date | None\n"
]
}
@@ -579,7 +585,7 @@
{
"data": {
"text/plain": [
"'946-49-5488'"
"'143-46-4915'"
]
},
"execution_count": 14,
@@ -706,7 +712,7 @@
{
"data": {
"text/plain": [
"'01-05-2024'"
"'26-05-2024'"
]
},
"execution_count": 18,
@@ -873,7 +879,7 @@
{
"data": {
"text/plain": [
"'😢'"
"'😄'"
]
},
"execution_count": 25,
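For orientation, a sketch of how outputs like the ones above are typically produced (the generator class names, import path, and the generate(entity) signature are assumptions based on the package documentation rather than this diff):

from anonipy.anonymize.generators import NumberGenerator, DateGenerator

number_generator = NumberGenerator()
date_generator = DateGenerator()

# ssn_entity and date_entity are assumed to be Entity objects from the extractor
new_ssn = number_generator.generate(ssn_entity)    # e.g. '143-46-4915'
new_date = date_generator.generate(date_entity)    # e.g. '26-05-2024'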