Merge pull request #5 from eriknovak/tests

Add unit tests to the project
eriknovak · May 27, 2024 · ff0aaea
2 parents dba04b9 + 8d5ba18
commit ff0aaea
Show file tree

Hide file tree

Showing 17 changed files with 915 additions and 51 deletions.
diff --git a/.github/workflows/unittests.yaml b/.github/workflows/unittests.yaml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.8", "3.9", "3.10"] # "3.11" and "3.12" removed - for some reason the models give different predictions on 3.11
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

diff --git a/anonipy/anonymize/extractors/entity_extractor.py b/anonipy/anonymize/extractors/entity_extractor.py
@@ -80,6 +80,6 @@ def _prepare_entities(self, doc):
         for e in doc.ents:
             label = list(filter(lambda x: x["label"] == e.label_, self.labels))[0]
             if re.match(label["regex"], e.text):
-                anoni_entities.append(convert_spacy_to_entity(e, label))
+                anoni_entities.append(convert_spacy_to_entity(e, **label))
                 spacy_entities.append(e)
         return anoni_entities, spacy_entities
diff --git a/anonipy/anonymize/helpers.py b/anonipy/anonymize/helpers.py
@@ -5,12 +5,7 @@
 # =====================================
 
 
-def convert_spacy_to_entity(entity, label):
+def convert_spacy_to_entity(entity, type, regex=".*", *args, **kwargs):
     return Entity(
-        entity.text,
-        entity.label_,
-        entity.start_char,
-        entity.end_char,
-        label["type"],
-        label["regex"] if "regex" in label else ".*",
+        entity.text, entity.label_, entity.start_char, entity.end_char, type, regex
     )
diff --git a/anonipy/anonymize/regex.py b/anonipy/anonymize/regex.py
@@ -4,6 +4,9 @@
 
 from collections import defaultdict
 
+from ..constants import ENTITY_TYPES
+
+
 # =====================================
 # Regex definitions
 # =====================================
@@ -33,13 +36,13 @@ class RegexMap:
     def __init__(self):
         self.regex_mapping = defaultdict(lambda: ".*")
         # Define the regex mappings
-        self.regex_mapping["string"] = REGEX_STRING
-        self.regex_mapping["integer"] = REGEX_INTEGER
-        self.regex_mapping["float"] = REGEX_FLOAT
-        self.regex_mapping["date"] = REGEX_DATE
-        self.regex_mapping["email"] = REGEX_EMAIL_ADDRESS
-        self.regex_mapping["phone_number"] = REGEX_PHONE_NUMBER
-        self.regex_mapping["website_url"] = REGEX_WEBSITE_URL
+        self.regex_mapping[ENTITY_TYPES.STRING] = REGEX_STRING
+        self.regex_mapping[ENTITY_TYPES.INTEGER] = REGEX_INTEGER
+        self.regex_mapping[ENTITY_TYPES.FLOAT] = REGEX_FLOAT
+        self.regex_mapping[ENTITY_TYPES.DATE] = REGEX_DATE
+        self.regex_mapping[ENTITY_TYPES.EMAIL] = REGEX_EMAIL_ADDRESS
+        self.regex_mapping[ENTITY_TYPES.PHONE_NUMBER] = REGEX_PHONE_NUMBER
+        self.regex_mapping[ENTITY_TYPES.WEBSITE_URL] = REGEX_WEBSITE_URL
 
     def __call__(self, type: str) -> str:
         return self.regex_mapping[type]

diff --git a/anonipy/constants.py b/anonipy/constants.py
@@ -16,6 +16,19 @@ class LANGUAGES:
     GERMAN = ("de", "German")
     GREEK = ("el", "Greek")
     ITALIAN = ("it", "Italian")
-    SLOVENE = ("sl", "Slovenian")
+    SLOVENE = ("sl", "Slovene")
     SPANISH = ("es", "Spanish")
-    UKRAINIAN = ("uk", "Ukranian")
+    UKRAINIAN = ("uk", "Ukrainian")
+
+
+class ENTITY_TYPES:
+    """Types of entities"""
+
+    CUSTOM = "custom"
+    STRING = "string"
+    INTEGER = "integer"
+    FLOAT = "float"
+    DATE = "date"
+    EMAIL = "email"
+    WEBSITE_URL = "website_url"
+    PHONE_NUMBER = "phone_number"
diff --git a/anonipy/definitions.py b/anonipy/definitions.py
@@ -2,8 +2,11 @@
 The definitions used within the package
 """
 
+import re
+from typing import Union
 from dataclasses import dataclass
 
+from .constants import ENTITY_TYPES
 
 # ================================================
 # Entity Definitions
@@ -14,7 +17,7 @@
 class Entity:
     text: str
     label: str
-    start_index: str
-    end_index: str
-    type: str = None
-    regex: str = ".*"
+    start_index: int
+    end_index: int
+    type: ENTITY_TYPES = None
+    regex: Union[str, re.Pattern] = ".*"
diff --git a/docs/documentation/notebooks/00-overview.ipynb b/docs/documentation/notebooks/00-overview.ipynb
@@ -84,6 +84,7 @@
    "outputs": [],
    "source": [
     "from anonipy.utils.language_detector import LanguageDetector\n",
+    "\n",
     "lang_detector = LanguageDetector()"
    ]
   },
@@ -134,7 +135,11 @@
     "# define the labels to be extracted and anonymized\n",
     "labels = [\n",
     "    {\"label\": \"name\", \"type\": \"string\"},\n",
-    "    {\"label\": \"social security number\", \"type\": \"custom\"},\n",
+    "    {\n",
+    "        \"label\": \"social security number\",\n",
+    "        \"type\": \"custom\",\n",
+    "        \"regex\": \"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
+    "    },\n",
     "    {\"label\": \"date of birth\", \"type\": \"date\"},\n",
     "    {\"label\": \"date\", \"type\": \"date\"},\n",
     "]"
@@ -260,7 +265,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9c9a40553ab34d05963638984cfde6ed",
+       "model_id": "7518a99a76554628b656baedb345d89c",
        "version_major": 2,
        "version_minor": 0
       },
@@ -355,7 +360,7 @@
       "Patient Name: Ethan Thompson\n",
       "Date of Birth: 01-07-1985\n",
       "Date of Examination: 15-05-2024\n",
-      "Social Security Number: 867-38-6549\n",
+      "Social Security Number: 635-28-4553\n",
       "\n",
       "Examination Procedure:\n",
       "Ethan Thompson underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n",
@@ -390,26 +395,32 @@
      "data": {
       "text/plain": [
        "[{'original_text': 'John Doe',\n",
+       "  'label': 'name',\n",
        "  'start_index': 30,\n",
        "  'end_index': 38,\n",
        "  'anonymized_text': 'Ethan Thompson'},\n",
        " {'original_text': '15-01-1985',\n",
+       "  'label': 'date of birth',\n",
        "  'start_index': 54,\n",
        "  'end_index': 64,\n",
        "  'anonymized_text': '01-07-1985'},\n",
        " {'original_text': '20-05-2024',\n",
+       "  'label': 'date',\n",
        "  'start_index': 86,\n",
        "  'end_index': 96,\n",
        "  'anonymized_text': '15-05-2024'},\n",
        " {'original_text': '123-45-6789',\n",
+       "  'label': 'social security number',\n",
        "  'start_index': 121,\n",
        "  'end_index': 132,\n",
-       "  'anonymized_text': '867-38-6549'},\n",
+       "  'anonymized_text': '635-28-4553'},\n",
        " {'original_text': 'John Doe',\n",
+       "  'label': 'name',\n",
        "  'start_index': 157,\n",
        "  'end_index': 165,\n",
        "  'anonymized_text': 'Ethan Thompson'},\n",
        " {'original_text': '15-11-2024',\n",
+       "  'label': 'date',\n",
        "  'start_index': 717,\n",
        "  'end_index': 727,\n",
        "  'anonymized_text': '15-11-2024'}]"

diff --git a/docs/documentation/notebooks/01-extractors.ipynb b/docs/documentation/notebooks/01-extractors.ipynb
@@ -230,7 +230,11 @@
    "source": [
     "labels = [\n",
     "    {\"label\": \"name\", \"type\": \"string\"},\n",
-    "    {\"label\": \"social security number\", \"type\": \"custom\"},\n",
+    "    {\n",
+    "        \"label\": \"social security number\",\n",
+    "        \"type\": \"custom\",\n",
+    "        \"regex\": \"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
+    "    },\n",
     "    {\"label\": \"date of birth\", \"type\": \"date\"},\n",
     "    {\"label\": \"date\", \"type\": \"date\"},\n",
     "]"
@@ -367,7 +371,7 @@
        "[Entity(text='John Doe', label='name', start_index=30, end_index=38, type='string', regex='.*'),\n",
        " Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
        " Entity(text='20-05-2024', label='date', start_index=86, end_index=96, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
-       " Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, type='custom', regex='.*'),\n",
+       " Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
        " Entity(text='John Doe', label='name', start_index=157, end_index=165, type='string', regex='.*'),\n",
        " Entity(text='15-11-2024', label='date', start_index=717, end_index=727, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
       ]

diff --git a/docs/documentation/notebooks/02-generators.ipynb b/docs/documentation/notebooks/02-generators.ipynb
@@ -117,7 +117,7 @@
     "        start_index=121,\n",
     "        end_index=132,\n",
     "        type=\"custom\",\n",
-    "        regex=\".*\",\n",
+    "        regex=\"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
     "    ),\n",
     "    Entity(\n",
     "        text=\"John Doe\",\n",
@@ -188,7 +188,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6aca0d79909b4da3beedf11f4fd78777",
+       "model_id": "388fbae393fc44f6b9ad3ab1bea29afc",
        "version_major": 2,
        "version_minor": 0
       },
@@ -266,7 +266,7 @@
     {
      "data": {
       "text/plain": [
-       "'Juan Martines'"
+       "'Juan Rodrigez'"
       ]
      },
      "execution_count": 6,
@@ -425,7 +425,7 @@
     {
      "data": {
       "text/plain": [
-       "'James Thomas'"
+       "'Williams James'"
       ]
      },
      "execution_count": 9,
@@ -445,7 +445,7 @@
     {
      "data": {
       "text/plain": [
-       "'Smith Paul'"
+       "'David Thomas'"
       ]
      },
      "execution_count": 10,
@@ -465,7 +465,7 @@
     {
      "data": {
       "text/plain": [
-       "'Michael Thomas'"
+       "'David Thomas'"
       ]
      },
      "execution_count": 11,
@@ -486,11 +486,11 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "John Doe     | name                   | Richard Roberts\n",
-      "15-01-1985   | date of birth          | 1964\n",
-      "20-05-2024   | date                   | 2017\n",
-      "123-45-6789  | social security number | 1\n",
-      "John Doe     | name                   | Van first\n",
+      "John Doe     | name                   | Michael Smith\n",
+      "15-01-1985   | date of birth          | None\n",
+      "20-05-2024   | date                   | None\n",
+      "123-45-6789  | social security number | None\n",
+      "John Doe     | name                   | Professor first\n",
       "15-11-2024   | date                   | None\n"
      ]
     }
@@ -579,7 +579,7 @@
     {
      "data": {
       "text/plain": [
-       "'491-93-2792'"
+       "'946-49-5488'"
       ]
      },
      "execution_count": 14,
@@ -706,7 +706,7 @@
     {
      "data": {
       "text/plain": [
-       "'10-05-2024'"
+       "'01-05-2024'"
       ]
      },
      "execution_count": 18,
@@ -873,7 +873,7 @@
     {
      "data": {
       "text/plain": [
-       "'🤗'"
+       "'😢'"
       ]
      },
      "execution_count": 25,