Skip to content

Commit

Permalink
Merge pull request #17 from eriknovak/feature/auto-date
Browse files Browse the repository at this point in the history
Add automatic date format detection support to `DateGenerator`
  • Loading branch information
eriknovak authored Jun 17, 2024
2 parents 3bed942 + 64aacd6 commit 57a5f13
Show file tree
Hide file tree
Showing 7 changed files with 305 additions and 59 deletions.
20 changes: 14 additions & 6 deletions anonipy/anonymize/generators/date_generator.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import random
import warnings
import datetime

from ...utils.datetime import detect_datetime_format
from .interface import GeneratorInterface
from ...definitions import Entity

# =====================================
# Helper functions
# Operation functions
# =====================================


Expand Down Expand Up @@ -47,9 +49,7 @@ def random_date(day: datetime.datetime, sigma: int = 30, *args, **kwargs):

class DateGenerator(GeneratorInterface):

def __init__(
self, date_format: str = "%d-%m-%Y", day_sigma: int = 30, *args, **kwargs
):
def __init__(self, date_format="auto", day_sigma: int = 30, *args, **kwargs):
self.date_format = date_format
self.day_sigma = day_sigma

Expand All @@ -66,6 +66,14 @@ def generate(self, entity: Entity, output_gen: str = "random", *args, **kwargs):
f"The output_gen must be one of {', '.join(list(operations.keys()))} to generate dates."
)

entity_date = datetime.datetime.strptime(entity.text, self.date_format)
if self.date_format == "auto":
entity_date, date_format = detect_datetime_format(entity.text)
else:
entity_date = datetime.datetime.strptime(entity.text, self.date_format)
date_format = self.date_format
if entity_date is None:
raise ValueError(f"Entity `{entity.text}` is not a valid date.")
if date_format is None or date_format == ValueError("Unknown Format"):
raise ValueError(f"Entity `{entity.text}` is not a valid date.")
generate_date = operations[output_gen](entity_date, self.day_sigma)
return generate_date.strftime(self.date_format)
return generate_date.strftime(date_format)
21 changes: 20 additions & 1 deletion anonipy/anonymize/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,26 @@
# Raw strings keep the regex backslashes literal; the non-raw spellings relied
# on invalid string escapes (``\d``, ``\.``), which Python warns about.
REGEX_INTEGER = r"\d+"
REGEX_FLOAT = r"[\d\.,]+"
# Pattern for the date / date-time spellings recognised as "date" entities:
# numeric dates (year-first or year-last, separated by "-", "/", "." or
# whitespace) with an optional time part, and English month-name dates with
# an optional leading weekday.
#
# NOTE(review): `re` tries alternatives left to right and accepts the first
# one that matches, so with a non-anchored search the "HH:MM" alternatives
# stop before a trailing " AM"/" PM" and the date-only alternatives win over
# the later "[ ]?HH:MM:SS" ones. Full-string matching is unaffected — confirm
# how callers apply this pattern before relying on the longer alternatives.
REGEX_DATE = (
    r"("
    r"(\d{4}[-/.\s]\d{2}[-/.\s]\d{2}[ T]\d{2}:\d{2}:\d{2})|"
    r"(\d{2}[-/.\s]\d{2}[-/.\s]\d{4}[ T]\d{2}:\d{2}:\d{2})|"
    r"(\d{2}[-/.\s]\d{2}[-/.\s]\d{4}[ T]\d{2}:\d{2})|"
    r"(\d{4}[-/.\s]\d{2}[-/.\s]\d{2}[ T]\d{2}:\d{2})|"
    r"(\d{4}[-/.\s]\d{2}[-/.\s]\d{2}[ T]\d{2}:\d{2} [APap][mM])|"
    r"(\d{2}[-/.\s]\d{2}[-/.\s]\d{4}[ T]\d{2}:\d{2} [APap][mM])|"
    r"(\d{4}[-/.\s]\d{2}[-/.\s]\d{2})|"
    r"(\d{2}[-/.\s]\d{2}[-/.\s]\d{4})|"
    r"(\d{2}[-/.\s]\d{2}[-/.\s]\d{4}[ ]?\d{2}:\d{2}:\d{2})|"
    r"(\d{4}[-/.\s]\d{2}[-/.\s]\d{2}[ ]?\d{2}:\d{2}:\d{2})|"
    r"(\d{1,2}[ ](January|February|March|April|May|June|July|August|September|October|November|December)[ ]\d{4}[ ]?\d{2}:\d{2}:\d{2})|"
    r"(\d{1,2}[ ](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]\d{4}[ ]?\d{2}:\d{2}:\d{2})|"
    r"(\d{1,2}[ ](January|February|March|April|May|June|July|August|September|October|November|December)[ ]\d{4}[ ]?\d{2}:\d{2}[ ]?[APap][mM])|"
    r"(\d{1,2}[ ](Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[ ]\d{4}[ ]?\d{2}:\d{2}[ ]?[APap][mM])|"
    r"([A-Za-z]+,[ ]\d{1,2}[ ](January|February|March|April|May|June|July|August|September|October|November|December)[ ]\d{4}[ ]?\d{2}:\d{2}:\d{2})|"
    r"([A-Za-z]+,[ ](January|February|March|April|May|June|July|August|September|October|November|December)[ ]\d{1,2},[ ]\d{4}[ ]?\d{2}:\d{2}:\d{2})|"
    r"([A-Za-z]+,[ ]\d{1,2}[ ](January|February|March|April|May|June|July|August|September|October|November|December)[ ]\d{4}[ ]?\d{2}:\d{2}[ ]?[APap][mM])|"
    r"([A-Za-z]+,[ ](January|February|March|April|May|June|July|August|September|October|November|December)[ ]\d{1,2},[ ]\d{4}[ ]?\d{2}:\d{2}[ ]?[APap][mM])"
    r")"
)
REGEX_EMAIL_ADDRESS = (
"[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*"
Expand Down
97 changes: 97 additions & 0 deletions anonipy/utils/datetime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import dateutil.parser as parser

# =====================================
# Constants
# =====================================

# Candidate strftime patterns that `detect_datetime_format` compares against
# the input string. The scan returns the first pattern whose rendering equals
# the input exactly, so list order is the tie-break when several patterns
# render the same text (e.g. day-first is listed before month-first).
POSSIBLE_FORMATS = [
    # Numeric date + 24-hour time with seconds
    "%Y-%m-%d %H:%M:%S",
    "%d-%m-%Y %H:%M:%S",
    "%m-%d-%Y %H:%M:%S",
    "%Y/%m/%d %H:%M:%S",
    "%d/%m/%Y %H:%M:%S",
    "%m/%d/%Y %H:%M:%S",
    "%Y.%m.%d %H:%M:%S",
    "%d.%m.%Y %H:%M:%S",
    "%m.%d.%Y %H:%M:%S",
    "%Y %m %d %H:%M:%S",
    "%d %m %Y %H:%M:%S",
    "%m %d %Y %H:%M:%S",
    # Numeric date + 12-hour time with AM/PM
    "%Y-%m-%d %I:%M %p",
    "%d-%m-%Y %I:%M %p",
    "%m-%d-%Y %I:%M %p",
    "%Y/%m/%d %I:%M %p",
    "%d/%m/%Y %I:%M %p",
    "%m/%d/%Y %I:%M %p",
    "%Y.%m.%d %I:%M %p",
    "%d.%m.%Y %I:%M %p",
    "%m.%d.%Y %I:%M %p",
    "%Y %m %d %I:%M %p",
    "%d %m %Y %I:%M %p",
    "%m %d %Y %I:%M %p",
    # Numeric date + 24-hour time without seconds
    # (a second, redundant "%Y-%m-%d %H:%M" entry used to follow this group)
    "%Y-%m-%d %H:%M",
    "%d-%m-%Y %H:%M",
    "%m-%d-%Y %H:%M",
    "%Y/%m/%d %H:%M",
    "%d/%m/%Y %H:%M",
    "%m/%d/%Y %H:%M",
    "%Y.%m.%d %H:%M",
    "%d.%m.%Y %H:%M",
    "%m.%d.%Y %H:%M",
    "%Y %m %d %H:%M",
    "%d %m %Y %H:%M",
    "%m %d %Y %H:%M",
    # Weekday + month-name date + time
    "%A, %d %B %Y %H:%M:%S",
    "%A, %B %d, %Y %H:%M:%S",
    "%A, %d %B %Y %I:%M %p",
    "%A, %B %d, %Y %I:%M %p",
    # Month-name date + time
    "%B %d, %Y %H:%M:%S",
    "%d %B %Y %H:%M:%S",
    "%b %d, %Y %H:%M:%S",
    "%d %b %Y %H:%M:%S",
    "%B %d, %Y %I:%M %p",
    "%d %B %Y %I:%M %p",
    "%b %d, %Y %I:%M %p",
    "%d %b %Y %I:%M %p",
    # Numeric date only
    "%Y-%m-%d",
    "%d-%m-%Y",
    "%m-%d-%Y",
    "%Y/%m/%d",
    "%d/%m/%Y",
    "%m/%d/%Y",
    "%Y.%m.%d",
    "%d.%m.%Y",
    "%m.%d.%Y",
    "%Y %m %d",
    "%d %m %Y",
    "%m %d %Y",
    # Month-name date only
    "%B %d, %Y",
    "%d %B %Y",
    "%b %d, %Y",
    "%d %b %Y",
    # Weekday + month-name date only
    "%A, %d %B %Y",
    "%A, %B %d, %Y",
]


# =====================================
# Auto date format detector
# =====================================


def detect_datetime_format(datetime):
    """Parse a date/time string and find the strftime format it is written in.

    The text is parsed with ``dateutil`` (fuzzy mode). The parsed value is
    then rendered with every pattern in ``POSSIBLE_FORMATS``; the first
    pattern whose rendering equals the input exactly is reported.

    Args:
        datetime: The text to analyse. (The name shadows the stdlib module
            inside this function only; it is kept for keyword-argument
            compatibility.)

    Returns:
        A ``(parsed_datetime, format)`` tuple:

        * ``(datetime, str)`` when a pattern reproduces the input exactly;
        * ``(datetime, ValueError("Unknown Format"))`` when the text parses
          but no pattern reproduces it. The ``ValueError`` is *returned*,
          not raised. Exception instances compare by identity, so callers
          must test it with ``isinstance(fmt, ValueError)``; ``==`` against
          a fresh ``ValueError`` is always False;
        * ``(None, None)`` when the text cannot be parsed as a date.
    """
    # Only the parse call can raise here, so keep the try block tight.
    # dateutil documents OverflowError (in addition to ParserError) for
    # values that exceed the platform's integer range; treat both as
    # "not a date" instead of letting the overflow escape.
    try:
        parsed_datetime = parser.parse(datetime, fuzzy=True)
    except (parser.ParserError, OverflowError):
        return None, None

    for fmt in POSSIBLE_FORMATS:
        try:
            if parsed_datetime.strftime(fmt) == datetime:
                return parsed_datetime, fmt
        except ValueError:
            # strftime rejected this pattern for this value; try the next.
            continue

    # Kept for backward compatibility: the "no matching pattern" marker is a
    # ValueError instance in the format slot.
    return parsed_datetime, ValueError("Unknown Format")
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ transformers
bitsandbytes
lingua-language-detector
guidance==0.1.14
python-dateutil>=2.9.0
sentencepiece
# File readers
pypdf>=4.2.0
Expand Down
12 changes: 7 additions & 5 deletions test/test_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@

from anonipy.definitions import Entity
from anonipy.anonymize.extractors import EntityExtractor
from anonipy.anonymize.regex import regex_map
from anonipy.constants import LANGUAGES


# =====================================
# Helper functions
# =====================================
Expand Down Expand Up @@ -37,23 +39,23 @@
start_index=30,
end_index=38,
type="string",
regex=".*",
regex=regex_map("string"),
),
Entity(
text="15-01-1985",
label="date of birth",
start_index=54,
end_index=64,
type="date",
regex="(\\d{1,2}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{2,4})|(\\d{2,4}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{1,2})",
regex=regex_map("date"),
),
Entity(
text="20-05-2024",
label="date",
start_index=86,
end_index=96,
type="date",
regex="(\\d{1,2}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{2,4})|(\\d{2,4}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{1,2})",
regex=regex_map("date"),
),
Entity(
text="123-45-6789",
Expand All @@ -69,15 +71,15 @@
start_index=157,
end_index=165,
type="string",
regex=".*",
regex=regex_map("string"),
),
Entity(
text="15-11-2024",
label="date",
start_index=717,
end_index=727,
type="date",
regex="(\\d{1,2}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{2,4})|(\\d{2,4}[\\/\\-\\.]\\d{1,2}[\\/\\-\\.]\\d{1,2})",
regex=regex_map("date"),
),
]

Expand Down
Loading

0 comments on commit 57a5f13

Please sign in to comment.