-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(data-homogenise): create service
- Loading branch information
Showing
22 changed files
with
638 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Ignore all files by default | ||
* | ||
|
||
# Allow-list only the required files: whatever is re-included here is what | ||
# the Dockerfile instruction COPY . /app/public/ puts into the image. | ||
!config.json | ||
!v1 | ||
!swagger.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# syntax=docker/dockerfile:1.2 | ||
# Build stage "dvc-files": pulls the DVC-tracked model (all-MiniLM-L6-v2) | ||
# from the WebDAV remote. Credentials come from BuildKit secret mounts. | ||
FROM python:3.9-slim-bullseye AS dvc-files | ||
WORKDIR /dvc | ||
# Use apt-get (apt has no stable CLI for scripts), skip recommended packages | ||
# and drop the package lists in the same layer to keep the stage small. | ||
RUN apt-get update && \ | ||
    apt-get install -y --no-install-recommends git && \ | ||
    rm -rf /var/lib/apt/lists/* | ||
# Quote the requirement so the shell never glob-expands "[webdav]", and do not | ||
# keep the pip download cache in the layer. | ||
RUN pip install --no-cache-dir "dvc[webdav]==3.39.0" | ||
# The remote URL and credentials are read from the secrets mounted under | ||
# /run/secrets, which are only available to this instruction. | ||
# NOTE(review): "remote modify --local" presumably persists the credentials in | ||
# the local DVC configuration of this stage; the stage is not the final image, | ||
# but confirm that its build cache is never pushed or shared. | ||
RUN --mount=type=secret,id=webdav_login \ | ||
    --mount=type=secret,id=webdav_password \ | ||
    --mount=type=secret,id=webdav_url \ | ||
    git init && \ | ||
    dvc init && \ | ||
    dvc remote add -d webdav-remote "$(cat /run/secrets/webdav_url)" && \ | ||
    dvc remote modify --local webdav-remote user "$(cat /run/secrets/webdav_login)" && \ | ||
    dvc remote modify --local webdav-remote password "$(cat /run/secrets/webdav_password)" | ||
# Print the DVC environment in the build log (diagnostic only). | ||
RUN dvc doctor | ||
# Only the .dvc pointer file is copied; dvc pull fetches the data it describes. | ||
COPY ./v1/all-MiniLM-L6-v2.dvc /dvc | ||
RUN dvc pull -v | ||
|
||
|
||
# Runtime stage: ezs server image with the Python dependencies of the service | ||
# and the model pulled by the dvc-files stage. | ||
FROM cnrsinist/ezs-python-pytorch-server:py3.9-no16-1.1.4 | ||
|
||
ENV HF_HOME=/app/.cache/huggingface | ||
ENV NUMBA_CACHE_DIR=/tmp/numba_cache | ||
|
||
# NOTE(review): the stage switches to root and never switches back; confirm | ||
# whether the base image entrypoint drops privileges, or add a final USER. | ||
USER root | ||
# Install all python dependencies (CPU-only torch wheels come from the PyTorch | ||
# index, everything else from PyPI); --no-cache-dir keeps the pip cache out of | ||
# the layer. | ||
RUN pip install --no-cache-dir --no-build-isolation \ | ||
    --index-url https://download.pytorch.org/whl/cpu \ | ||
    --extra-index-url https://pypi.org/simple \ | ||
    sentence-transformers==3.3.1 \ | ||
    huggingface_hub==0.23.2 | ||
|
||
# Declare files to copy in .dockerignore | ||
COPY --chown=daemon:daemon . /app/public/ | ||
# config.json is moved next to the server and made writable by every user. | ||
RUN mv /app/public/config.json /app && chmod a+w /app/config.json | ||
# -p: do not fail if the directory already exists in the base image. | ||
RUN mkdir -p /tmp/retrieve | ||
|
||
# Bring in the model fetched by the dvc-files stage. | ||
COPY --chown=daemon:daemon --from=dvc-files /dvc/all-MiniLM-L6-v2 /app/public/v1/all-MiniLM-L6-v2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# ws-data-homogenise@0.0.0 | ||
|
||
Homogénéise les valeurs d'un champ. | ||
|
||
Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
{ | ||
"environnement": { | ||
"EZS_TITLE": "Homogénéise les valeurs d'un champ.", | ||
"EZS_DESCRIPTION": "Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur.", | ||
"EZS_METRICS": true, | ||
"EZS_CONCURRENCY": 2, | ||
"EZS_CONTINUE_DELAY": 18000, | ||
"EZS_PIPELINE_DELAY": 10800, | ||
"EZS_NSHARDS": 32, | ||
"EZS_CACHE": true, | ||
"EZS_VERBOSE": false, | ||
"NODE_OPTIONS": "--max_old_space_size=1024", | ||
"NODE_ENV": "production" | ||
|
||
} | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# These examples can be used directly in VSCode, using HTTPYac extension (anweber.vscode-httpyac) | ||
# They are important, because used to generate the tests.hurl file. | ||
|
||
# Décommenter/commenter les lignes voulues pour tester localement | ||
@host=http://localhost:31976 | ||
# @host=https://data-homogenise.services.istex.fr | ||
|
||
### | ||
# @name v1Homogenise | ||
# Homogenise the values of a field (asynchronous processing of a tar.gz corpus) | ||
POST {{host}}/v1/homogenise?indent=true HTTP/1.1 | ||
Content-Type: application/x-tar | ||
X-Webhook-Success: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9 | ||
X-Webhook-Failure: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9 | ||
|
||
< ./example-json.tar.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
{ | ||
"private": true, | ||
"name": "ws-data-homogenise", | ||
"version": "0.0.0", | ||
"description": "Homogénéise les valeurs d'un champ.", | ||
"repository": { | ||
"type": "git", | ||
"url": "git+https://github.com/Inist-CNRS/web-services.git" | ||
}, | ||
"keywords": [ | ||
"ezmaster" | ||
], | ||
"author": "Léo Gaillard <leo.gaillard@cnrs.fr>", | ||
"license": "MIT", | ||
"bugs": { | ||
"url": "https://github.com/Inist-CNRS/web-services/issues" | ||
}, | ||
"homepage": "https://github.com/Inist-CNRS/web-services/#readme", | ||
"scripts": { | ||
"version:insert:readme": "sed -i \"s#\\(${npm_package_name}.\\)\\([\\.a-z0-9]\\+\\)#\\1${npm_package_version}#g\" README.md && git add README.md", | ||
"version:insert:swagger": "sed -i \"s/\\\"version\\\": \\\"[0-9]\\+.[0-9]\\+.[0-9]\\+\\\"/\\\"version\\\": \\\"${npm_package_version}\\\"/g\" swagger.json && git add swagger.json", | ||
"version:insert": "npm run version:insert:readme && npm run version:insert:swagger", | ||
"version:commit": "git commit -a -m \"release ${npm_package_name}@${npm_package_version}\"", | ||
"version:tag": "git tag \"${npm_package_name}@${npm_package_version}\" -m \"${npm_package_name}@${npm_package_version}\"", | ||
"version:push": "git push && git push --tags", | ||
"version": "npm run version:insert && npm run version:commit && npm run version:tag", | ||
"postversion": "npm run version:push", | ||
"build:dev": ". ./.env 2> /dev/null; DOCKER_BUILDKIT=1 docker build -t cnrsinist/${npm_package_name}:latest --secret id=webdav_login,env=WEBDAV_LOGIN --secret id=webdav_password,env=WEBDAV_PASSWORD --secret id=webdav_url,env=WEBDAV_URL .", | ||
"start:dev": "npm run build:dev && docker run --name dev --rm --detach -p 31976:31976 cnrsinist/${npm_package_name}:latest", | ||
"stop:dev": "docker stop dev", | ||
"build": ". ./.env 2> /dev/null; DOCKER_BUILDKIT=1 docker build -t cnrsinist/${npm_package_name}:${npm_package_version} --secret id=webdav_login,env=WEBDAV_LOGIN --secret id=webdav_password,env=WEBDAV_PASSWORD --secret id=webdav_url,env=WEBDAV_URL .", | ||
"start": "docker run --rm -p 31976:31976 cnrsinist/${npm_package_name}:${npm_package_version}", | ||
"publish": "docker push cnrsinist/${npm_package_name}:${npm_package_version}" | ||
}, | ||
"avoid-testing": false | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
{ | ||
"openapi": "3.0.0", | ||
"info": { | ||
"title": "data-homogenise - Homogénéise les valeurs d'un champ.", | ||
"description": "Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur.", | ||
"version": "0.0.0", | ||
"termsOfService": "https://services.istex.fr/", | ||
"contact": { | ||
"name": "Inist-CNRS", | ||
"url": "https://www.inist.fr/nous-contacter/" | ||
} | ||
}, | ||
"servers": [ | ||
{ | ||
"x-comment": "Will be automatically completed by the ezs server." | ||
}, | ||
{ | ||
"url": "http://vptdmjobs.intra.inist.fr:49206/", | ||
"description": "Latest version for production", | ||
"#DISABLED#x-profil": "Standard" | ||
} | ||
], | ||
"tags": [ | ||
{ | ||
"name": "data-homogenise", | ||
"description": "Homogénéise les valeurs d'un champ.", | ||
"externalDocs": { | ||
"description": "Plus de documentation", | ||
"url": "https://github.com/inist-cnrs/web-services/tree/main/services/data-homogenise" | ||
} | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# WARNING: This file was not generated, but manually written. | ||
# DON'T OVERWRITE IT | ||
# Use it to test: | ||
# npx hurl --test data-homogenise/tests.hurl | ||
|
||
################################################################## | ||
# v1/homogenise | ||
# Step 1: send the example corpus; the response carries a computing token. | ||
POST {{host}}/v1/homogenise | ||
content-type: application/x-tar | ||
x-hook: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9 | ||
file,example.tar.gz; | ||
|
||
HTTP 200 | ||
# Capture the computing token | ||
[Captures] | ||
computing_token: jsonpath "$[0].value" | ||
[Asserts] | ||
variable "computing_token" exists | ||
|
||
# Step 2: after a 30 s delay, retrieve the result with the captured token. | ||
POST {{host}}/v1/retrieve-json | ||
content-type: application/json | ||
[Options] | ||
delay: 30000 | ||
``` | ||
[ | ||
{ | ||
"value":"{{computing_token}}" | ||
} | ||
] | ||
``` | ||
|
||
HTTP 200 | ||
Content-Type: application/json | ||
# Explicit asserts must live in an [Asserts] section. | ||
[Asserts] | ||
jsonpath "$" count == 3 | ||
jsonpath "$[0].value[0]" == "Inist-CNRS, UAR76" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
outs: | ||
- md5: 25a76b5a8c779766c28a1b3b17139477.dir | ||
size: 273545812 | ||
nfiles: 16 | ||
hash: md5 | ||
path: all-MiniLM-L6-v2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
[use] | ||
plugin = basics | ||
|
||
# Incoming objects are saved to disk so that all of them can be accepted quickly | ||
# and the client can be told quickly that the asynchronous processing has started. | ||
# | ||
# The "fork" only detaches once all the objects have entered the fork. | ||
# If the processing is slower than saving to disk, | ||
# a temporary file has to be created. | ||
[pack] | ||
[FILESave] | ||
# The identifier comes from the environment variable of the same name | ||
identifier = env('identifier') | ||
location = /tmp/upload | ||
compress = true | ||
|
||
# Trace which generator received data, and for which identifier | ||
[debug] | ||
text = fix('Data received by', env('generator'), 'for', env('identifier')).join(' ') | ||
|
||
# Keep only the filename field of the current object | ||
[exchange] | ||
value = get('filename') | ||
|
||
# Read back from the same location, with the same compression setting | ||
[FILELoad] | ||
compress = true | ||
location = /tmp/upload | ||
[unpack] | ||
|
||
[metrics] | ||
bucket = buffer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
[use] | ||
plugin = basics | ||
|
||
# Step 0 (generic): read the standard tar.gz file | ||
[TARExtract] | ||
compress = true | ||
path = */*.json | ||
|
||
# Step 1 (generic): create a unique identifier for the received corpus | ||
[singleton] | ||
|
||
# Step 1.1: use a time-based field name, to avoid picking up an existing uri field | ||
[singleton/env] | ||
path = pid | ||
value = fix(`PID${Date.now()}`) | ||
|
||
# Step 1.2: generate a unique identifier, stored in the field named by pid | ||
[singleton/identify] | ||
path = env('pid') | ||
|
||
# Step 1.3: keep the generated identifier in the environment (simplified: without the uid:/ prefix) | ||
[singleton/env] | ||
path = identifier | ||
value = get(env('pid')).replace('uid:/', '') | ||
|
||
# Remove the temporary pid field from the object | ||
[singleton/exchange] | ||
value = self().omit([env('pid')]) | ||
|
||
[metrics] | ||
bucket = charger |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Entrypoint output format | ||
mimeType = application/json | ||
|
||
# OpenAPI Documentation - JSON format (dot notation) | ||
post.operationId = post-v1-homogenise | ||
post.summary = Homogénéise les valeurs d'un champ. | ||
post.description = Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur. | ||
# Same tag name as the one declared in swagger.json, so the route is grouped under it | ||
post.tags.0 = data-homogenise | ||
post.requestBody.content.application/x-gzip.schema.type = string | ||
post.requestBody.content.application/x-gzip.schema.format = binary | ||
post.requestBody.content.application/x-tar.schema.type = string | ||
post.requestBody.content.application/x-tar.schema.format = binary | ||
post.requestBody.required = true | ||
post.responses.default.description = Informations permettant de récupérer les données le moment venu | ||
post.parameters.0.description = Indenter le JSON résultant | ||
post.parameters.0.in = query | ||
post.parameters.0.name = indent | ||
post.parameters.0.schema.type = boolean | ||
post.parameters.1.description = URL pour signaler que le traitement est terminé | ||
post.parameters.1.in = header | ||
post.parameters.1.name = X-Webhook-Success | ||
post.parameters.1.schema.type = string | ||
post.parameters.1.schema.format = uri | ||
post.parameters.1.required = false | ||
post.parameters.2.description = URL pour signaler que le traitement a échoué | ||
post.parameters.2.in = header | ||
post.parameters.2.name = X-Webhook-Failure | ||
post.parameters.2.schema.type = string | ||
post.parameters.2.schema.format = uri | ||
post.parameters.2.required = false | ||
# NOTE(review): thereshold looks like a misspelling of threshold; it is kept because | ||
# the pipeline reads this exact name, so both places must be renamed together. | ||
post.parameters.3.in = query | ||
post.parameters.3.name = thereshold | ||
post.parameters.3.schema.type = number | ||
post.parameters.3.description = threshold of similarity to use to homogenise (in percent) | ||
#' | ||
|
||
[use] | ||
plugin = @ezs/spawn | ||
|
||
# Name of this route, stored in the environment under the generator key | ||
[env] | ||
path = generator | ||
value = homogenise | ||
|
||
# Step 1 (generic): load the corpus file | ||
[delegate] | ||
file = charger.cfg | ||
|
||
# Step 2 (generic): process the received items asynchronously | ||
[fork] | ||
standalone = true | ||
logger = logger.cfg | ||
|
||
# Step 2.1 (specific): run a computation on all the received items | ||
[fork/exec] | ||
# command should be executable ! | ||
command = ./v1/homogenise.py | ||
# The script is called with -p followed by the thereshold value (70 when not provided) | ||
# NOTE(review): thereshold looks like a misspelling of threshold, but it is the name | ||
# documented for the query parameter; rename both together or not at all. | ||
args = fix('-p') | ||
args = env('thereshold', "70") | ||
|
||
# Step 2.2 (generic): save the result and signal that the processing is finished | ||
[fork/delegate] | ||
file = recorder.cfg | ||
|
||
# Step 3: immediately return a single item telling how to retrieve the result once it is ready | ||
[delegate] | ||
file = recipient.cfg | ||
|
Oops, something went wrong.