Skip to content

Commit

Permalink
feat(pipeline): Set up deduplication as a new processings module
Browse files: browse the repository at this point in the history
(.venv) [victor@W]<~/dev/data-inclusion/pipeline/dbt> time dbt run -s models/intermediate/int__doublons_structures.sql
10:06:25  Running with dbt=1.8.6
10:06:25  Registered adapter: postgres=1.8.2
10:06:25  Unable to do partial parsing because of a version mismatch
10:06:28  Found 137 models, 12 snapshots, 10 seeds, 793 data tests, 52 sources, 554 macros, 5 unit tests
10:06:28
10:06:28  Concurrency: 1 threads (target='dev')
10:06:28
10:06:28  1 of 1 START sql table model public_intermediate.int__doublons_structures ...... [RUN]
10:06:49  1 of 1 OK created sql table model public_intermediate.int__doublons_structures . [SELECT 9457 in 20.66s]
10:06:49
10:06:49  Finished running 1 table model in 0 hours 0 minutes and 20.93 seconds (20.93s).
10:06:49
10:06:49  Completed successfully
10:06:49
10:06:49  Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1

real    0m25,923s
user    0m4,834s
sys     0m0,077s
  • Loading branch information
vperron committed Dec 11, 2024
1 parent 02dfec4 commit 72e8608
Show file tree
Hide file tree
Showing 17 changed files with 737 additions and 0 deletions.
2 changes: 2 additions & 0 deletions datawarehouse/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ RUN apt-get update \
postgresql-plpython3-17 \
python3-pip \
python3-venv \
python3-dev \
build-essential \
&& apt-get autoremove --purge -y \
&& apt-get clean -y \
&& rm -rf /var/lib/apt/lists/*
Expand Down
5 changes: 5 additions & 0 deletions datawarehouse/processings/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,15 @@ requires = ["setuptools", "wheel"]
name = "data-inclusion-processings"
version = "0.1.0"
dependencies = [
"dedupe",
"dedupe-variable-datetime",
"numpy~=2.0",
"pandas~=2.2",
"phonenumbers",
"requests~=2.31",
"unidecode",
"data-inclusion-schema",

]

[project.optional-dependencies]
Expand Down
79 changes: 79 additions & 0 deletions datawarehouse/processings/requirements/dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,49 +1,100 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml --extra=dev --output-file=requirements/dev-requirements.txt
affinegap==1.12
# via dedupe
annotated-types==0.7.0
# via pydantic
btrees==6.1
# via
# dedupe
# zope-index
categorical-distance==1.9
# via dedupe
certifi==2024.8.30
# via requests
cffi==1.17.1
# via persistent
cfgv==3.4.0
# via pre-commit
charset-normalizer==3.4.0
# via requests
data-inclusion-schema==0.19.0
# via data-inclusion-processings (pyproject.toml)
datetime-distance==0.1.3
# via dedupe-variable-datetime
dedupe==3.0.3
# via
# data-inclusion-processings (pyproject.toml)
# dedupe-variable-datetime
dedupe-levenshtein-search==1.4.5
# via dedupe
dedupe-variable-datetime==2.0.0
# via data-inclusion-processings (pyproject.toml)
distlib==0.3.9
# via virtualenv
dnspython==2.7.0
# via email-validator
doublemetaphone==1.1
# via dedupe
email-validator==2.2.0
# via pydantic
filelock==3.16.1
# via virtualenv
future==1.0.0
# via datetime-distance
haversine==2.9.0
# via dedupe
highered==0.2.1
# via dedupe
identify==2.6.1
# via pre-commit
idna==3.10
# via
# email-validator
# requests
joblib==1.4.2
# via scikit-learn
nodeenv==1.9.1
# via pre-commit
numpy==2.1.3
# via
# data-inclusion-processings (pyproject.toml)
# categorical-distance
# dedupe
# highered
# pandas
# pyhacrf-datamade
# pylbfgs
# scikit-learn
# scipy
# simplecosine
pandas==2.2.3
# via data-inclusion-processings (pyproject.toml)
pendulum==3.0.0
# via data-inclusion-schema
persistent==6.1
# via
# btrees
# zope-index
phonenumbers==8.13.51
# via data-inclusion-processings (pyproject.toml)
platformdirs==4.3.6
# via virtualenv
pre-commit==4.0.1
# via data-inclusion-processings (pyproject.toml)
pycparser==2.22
# via cffi
pydantic==2.9.2
# via data-inclusion-schema
pydantic-core==2.23.4
# via pydantic
pyhacrf-datamade==0.2.8
# via highered
pylbfgs==0.2.0.16
# via pyhacrf-datamade
python-dateutil==2.9.0.post0
# via
# datetime-distance
# pandas
# pendulum
# time-machine
Expand All @@ -55,8 +106,22 @@ requests==2.32.3
# via data-inclusion-processings (pyproject.toml)
ruff==0.7.2
# via data-inclusion-processings (pyproject.toml)
scikit-learn==1.6.0
# via dedupe
scipy==1.14.1
# via scikit-learn
setuptools==75.6.0
# via
# zope-deferredimport
# zope-index
# zope-interface
# zope-proxy
simplecosine==1.2
# via dedupe
six==1.16.0
# via python-dateutil
threadpoolctl==3.5.0
# via scikit-learn
time-machine==2.16.0
# via pendulum
typing-extensions==4.12.2
Expand All @@ -67,7 +132,21 @@ tzdata==2024.2
# via
# pandas
# pendulum
unidecode==1.3.8
# via data-inclusion-processings (pyproject.toml)
urllib3==2.2.3
# via requests
virtualenv==20.27.1
# via pre-commit
zope-deferredimport==5.0
# via persistent
zope-index==7.0
# via dedupe
zope-interface==7.2
# via
# btrees
# persistent
# zope-index
# zope-proxy
zope-proxy==6.1
# via zope-deferredimport
79 changes: 79 additions & 0 deletions datawarehouse/processings/requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,44 +1,109 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml --output-file=requirements/requirements.txt
affinegap==1.12
# via dedupe
annotated-types==0.7.0
# via pydantic
btrees==6.1
# via
# dedupe
# zope-index
categorical-distance==1.9
# via dedupe
certifi==2024.8.30
# via requests
cffi==1.17.1
# via persistent
charset-normalizer==3.4.0
# via requests
data-inclusion-schema==0.19.0
# via data-inclusion-processings (pyproject.toml)
datetime-distance==0.1.3
# via dedupe-variable-datetime
dedupe==3.0.3
# via
# data-inclusion-processings (pyproject.toml)
# dedupe-variable-datetime
dedupe-levenshtein-search==1.4.5
# via dedupe
dedupe-variable-datetime==2.0.0
# via data-inclusion-processings (pyproject.toml)
dnspython==2.7.0
# via email-validator
doublemetaphone==1.1
# via dedupe
email-validator==2.2.0
# via pydantic
future==1.0.0
# via datetime-distance
haversine==2.9.0
# via dedupe
highered==0.2.1
# via dedupe
idna==3.10
# via
# email-validator
# requests
joblib==1.4.2
# via scikit-learn
numpy==2.1.3
# via
# data-inclusion-processings (pyproject.toml)
# categorical-distance
# dedupe
# highered
# pandas
# pyhacrf-datamade
# pylbfgs
# scikit-learn
# scipy
# simplecosine
pandas==2.2.3
# via data-inclusion-processings (pyproject.toml)
pendulum==3.0.0
# via data-inclusion-schema
persistent==6.1
# via
# btrees
# zope-index
phonenumbers==8.13.51
# via data-inclusion-processings (pyproject.toml)
pycparser==2.22
# via cffi
pydantic==2.9.2
# via data-inclusion-schema
pydantic-core==2.23.4
# via pydantic
pyhacrf-datamade==0.2.8
# via highered
pylbfgs==0.2.0.16
# via pyhacrf-datamade
python-dateutil==2.9.0.post0
# via
# datetime-distance
# pandas
# pendulum
# time-machine
pytz==2024.2
# via pandas
requests==2.32.3
# via data-inclusion-processings (pyproject.toml)
scikit-learn==1.6.0
# via dedupe
scipy==1.14.1
# via scikit-learn
setuptools==75.6.0
# via
# zope-deferredimport
# zope-index
# zope-interface
# zope-proxy
simplecosine==1.2
# via dedupe
six==1.16.0
# via python-dateutil
threadpoolctl==3.5.0
# via scikit-learn
time-machine==2.16.0
# via pendulum
typing-extensions==4.12.2
Expand All @@ -49,5 +114,19 @@ tzdata==2024.2
# via
# pandas
# pendulum
unidecode==1.3.8
# via data-inclusion-processings (pyproject.toml)
urllib3==2.2.3
# via requests
zope-deferredimport==5.0
# via persistent
zope-index==7.0
# via dedupe
zope-interface==7.2
# via
# btrees
# persistent
# zope-index
# zope-proxy
zope-proxy==6.1
# via zope-deferredimport
Loading

0 comments on commit 72e8608

Please sign in to comment.