Skip to content

Commit

Permalink
Merge pull request #5 from eriknovak/tests
Browse files Browse the repository at this point in the history
Add unit tests to the project
  • Loading branch information
eriknovak authored May 27, 2024
2 parents dba04b9 + 8d5ba18 commit ff0aaea
Show file tree
Hide file tree
Showing 17 changed files with 915 additions and 51 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10"] # "3.11" and "3.12" are disabled for now - for some reason the models give different predictions on 3.11
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
Expand Down
2 changes: 1 addition & 1 deletion anonipy/anonymize/extractors/entity_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,6 @@ def _prepare_entities(self, doc):
for e in doc.ents:
label = list(filter(lambda x: x["label"] == e.label_, self.labels))[0]
if re.match(label["regex"], e.text):
anoni_entities.append(convert_spacy_to_entity(e, label))
anoni_entities.append(convert_spacy_to_entity(e, **label))
spacy_entities.append(e)
return anoni_entities, spacy_entities
9 changes: 2 additions & 7 deletions anonipy/anonymize/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,7 @@
# =====================================


def convert_spacy_to_entity(entity, label):
def convert_spacy_to_entity(entity, type, regex=".*", *args, **kwargs):
return Entity(
entity.text,
entity.label_,
entity.start_char,
entity.end_char,
label["type"],
label["regex"] if "regex" in label else ".*",
entity.text, entity.label_, entity.start_char, entity.end_char, type, regex
)
17 changes: 10 additions & 7 deletions anonipy/anonymize/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

from collections import defaultdict

from ..constants import ENTITY_TYPES


# =====================================
# Regex definitions
# =====================================
Expand Down Expand Up @@ -33,13 +36,13 @@ class RegexMap:
def __init__(self):
self.regex_mapping = defaultdict(lambda: ".*")
# Define the regex mappings
self.regex_mapping["string"] = REGEX_STRING
self.regex_mapping["integer"] = REGEX_INTEGER
self.regex_mapping["float"] = REGEX_FLOAT
self.regex_mapping["date"] = REGEX_DATE
self.regex_mapping["email"] = REGEX_EMAIL_ADDRESS
self.regex_mapping["phone_number"] = REGEX_PHONE_NUMBER
self.regex_mapping["website_url"] = REGEX_WEBSITE_URL
self.regex_mapping[ENTITY_TYPES.STRING] = REGEX_STRING
self.regex_mapping[ENTITY_TYPES.INTEGER] = REGEX_INTEGER
self.regex_mapping[ENTITY_TYPES.FLOAT] = REGEX_FLOAT
self.regex_mapping[ENTITY_TYPES.DATE] = REGEX_DATE
self.regex_mapping[ENTITY_TYPES.EMAIL] = REGEX_EMAIL_ADDRESS
self.regex_mapping[ENTITY_TYPES.PHONE_NUMBER] = REGEX_PHONE_NUMBER
self.regex_mapping[ENTITY_TYPES.WEBSITE_URL] = REGEX_WEBSITE_URL

def __call__(self, type: str) -> str:
return self.regex_mapping[type]
Expand Down
17 changes: 15 additions & 2 deletions anonipy/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,19 @@ class LANGUAGES:
GERMAN = ("de", "German")
GREEK = ("el", "Greek")
ITALIAN = ("it", "Italian")
SLOVENE = ("sl", "Slovenian")
SLOVENE = ("sl", "Slovene")
SPANISH = ("es", "Spanish")
UKRAINIAN = ("uk", "Ukranian")
UKRAINIAN = ("uk", "Ukrainian")


class ENTITY_TYPES:
"""Types of entities"""

CUSTOM = "custom"
STRING = "string"
INTEGER = "integer"
FLOAT = "float"
DATE = "date"
EMAIL = "email"
WEBSITE_URL = "website_url"
PHONE_NUMBER = "phone_number"
11 changes: 7 additions & 4 deletions anonipy/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
The definitions used within the package
"""

import re
from typing import Union
from dataclasses import dataclass

from .constants import ENTITY_TYPES

# ================================================
# Entity Definitions
Expand All @@ -14,7 +17,7 @@
class Entity:
text: str
label: str
start_index: str
end_index: str
type: str = None
regex: str = ".*"
start_index: int
end_index: int
type: ENTITY_TYPES = None
regex: Union[str, re.Pattern] = ".*"
19 changes: 15 additions & 4 deletions docs/documentation/notebooks/00-overview.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
"outputs": [],
"source": [
"from anonipy.utils.language_detector import LanguageDetector\n",
"\n",
"lang_detector = LanguageDetector()"
]
},
Expand Down Expand Up @@ -134,7 +135,11 @@
"# define the labels to be extracted and anonymized\n",
"labels = [\n",
" {\"label\": \"name\", \"type\": \"string\"},\n",
" {\"label\": \"social security number\", \"type\": \"custom\"},\n",
" {\n",
" \"label\": \"social security number\",\n",
" \"type\": \"custom\",\n",
" \"regex\": \"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
" },\n",
" {\"label\": \"date of birth\", \"type\": \"date\"},\n",
" {\"label\": \"date\", \"type\": \"date\"},\n",
"]"
Expand Down Expand Up @@ -260,7 +265,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9c9a40553ab34d05963638984cfde6ed",
"model_id": "7518a99a76554628b656baedb345d89c",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -355,7 +360,7 @@
"Patient Name: Ethan Thompson\n",
"Date of Birth: 01-07-1985\n",
"Date of Examination: 15-05-2024\n",
"Social Security Number: 867-38-6549\n",
"Social Security Number: 635-28-4553\n",
"\n",
"Examination Procedure:\n",
"Ethan Thompson underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.\n",
Expand Down Expand Up @@ -390,26 +395,32 @@
"data": {
"text/plain": [
"[{'original_text': 'John Doe',\n",
" 'label': 'name',\n",
" 'start_index': 30,\n",
" 'end_index': 38,\n",
" 'anonymized_text': 'Ethan Thompson'},\n",
" {'original_text': '15-01-1985',\n",
" 'label': 'date of birth',\n",
" 'start_index': 54,\n",
" 'end_index': 64,\n",
" 'anonymized_text': '01-07-1985'},\n",
" {'original_text': '20-05-2024',\n",
" 'label': 'date',\n",
" 'start_index': 86,\n",
" 'end_index': 96,\n",
" 'anonymized_text': '15-05-2024'},\n",
" {'original_text': '123-45-6789',\n",
" 'label': 'social security number',\n",
" 'start_index': 121,\n",
" 'end_index': 132,\n",
" 'anonymized_text': '867-38-6549'},\n",
" 'anonymized_text': '635-28-4553'},\n",
" {'original_text': 'John Doe',\n",
" 'label': 'name',\n",
" 'start_index': 157,\n",
" 'end_index': 165,\n",
" 'anonymized_text': 'Ethan Thompson'},\n",
" {'original_text': '15-11-2024',\n",
" 'label': 'date',\n",
" 'start_index': 717,\n",
" 'end_index': 727,\n",
" 'anonymized_text': '15-11-2024'}]"
Expand Down
8 changes: 6 additions & 2 deletions docs/documentation/notebooks/01-extractors.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,11 @@
"source": [
"labels = [\n",
" {\"label\": \"name\", \"type\": \"string\"},\n",
" {\"label\": \"social security number\", \"type\": \"custom\"},\n",
" {\n",
" \"label\": \"social security number\",\n",
" \"type\": \"custom\",\n",
" \"regex\": \"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
" },\n",
" {\"label\": \"date of birth\", \"type\": \"date\"},\n",
" {\"label\": \"date\", \"type\": \"date\"},\n",
"]"
Expand Down Expand Up @@ -367,7 +371,7 @@
"[Entity(text='John Doe', label='name', start_index=30, end_index=38, type='string', regex='.*'),\n",
" Entity(text='15-01-1985', label='date of birth', start_index=54, end_index=64, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='20-05-2024', label='date', start_index=86, end_index=96, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, type='custom', regex='.*'),\n",
" Entity(text='123-45-6789', label='social security number', start_index=121, end_index=132, type='custom', regex='[0-9]{3}-[0-9]{2}-[0-9]{4}'),\n",
" Entity(text='John Doe', label='name', start_index=157, end_index=165, type='string', regex='.*'),\n",
" Entity(text='15-11-2024', label='date', start_index=717, end_index=727, type='date', regex='(\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{2,4})|(\\\\d{2,4}[\\\\/\\\\-\\\\.]\\\\d{1,2}[\\\\/\\\\-\\\\.]\\\\d{1,2})')]"
]
Expand Down
28 changes: 14 additions & 14 deletions docs/documentation/notebooks/02-generators.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
" start_index=121,\n",
" end_index=132,\n",
" type=\"custom\",\n",
" regex=\".*\",\n",
" regex=\"[0-9]{3}-[0-9]{2}-[0-9]{4}\",\n",
" ),\n",
" Entity(\n",
" text=\"John Doe\",\n",
Expand Down Expand Up @@ -188,7 +188,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6aca0d79909b4da3beedf11f4fd78777",
"model_id": "388fbae393fc44f6b9ad3ab1bea29afc",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -266,7 +266,7 @@
{
"data": {
"text/plain": [
"'Juan Martines'"
"'Juan Rodrigez'"
]
},
"execution_count": 6,
Expand Down Expand Up @@ -425,7 +425,7 @@
{
"data": {
"text/plain": [
"'James Thomas'"
"'Williams James'"
]
},
"execution_count": 9,
Expand All @@ -445,7 +445,7 @@
{
"data": {
"text/plain": [
"'Smith Paul'"
"'David Thomas'"
]
},
"execution_count": 10,
Expand All @@ -465,7 +465,7 @@
{
"data": {
"text/plain": [
"'Michael Thomas'"
"'David Thomas'"
]
},
"execution_count": 11,
Expand All @@ -486,11 +486,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"John Doe | name | Richard Roberts\n",
"15-01-1985 | date of birth | 1964\n",
"20-05-2024 | date | 2017\n",
"123-45-6789 | social security number | 1\n",
"John Doe | name | Van first\n",
"John Doe | name | Michael Smith\n",
"15-01-1985 | date of birth | None\n",
"20-05-2024 | date | None\n",
"123-45-6789 | social security number | None\n",
"John Doe | name | Professor first\n",
"15-11-2024 | date | None\n"
]
}
Expand Down Expand Up @@ -579,7 +579,7 @@
{
"data": {
"text/plain": [
"'491-93-2792'"
"'946-49-5488'"
]
},
"execution_count": 14,
Expand Down Expand Up @@ -706,7 +706,7 @@
{
"data": {
"text/plain": [
"'10-05-2024'"
"'01-05-2024'"
]
},
"execution_count": 18,
Expand Down Expand Up @@ -873,7 +873,7 @@
{
"data": {
"text/plain": [
"'🤗'"
"'😢'"
]
},
"execution_count": 25,
Expand Down
Loading

0 comments on commit ff0aaea

Please sign in to comment.