Skip to content

Commit

Permalink
feat(data-homogenise): create service
Browse files Browse the repository at this point in the history
  • Loading branch information
leogail committed Dec 4, 2024
1 parent 2b4ee01 commit 3862e9c
Show file tree
Hide file tree
Showing 22 changed files with 638 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ All contributing instructions are in [CONTRIBUTING](CONTRIBUTING.md).
- [biblio-tools](./services/biblio-tools) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-biblio-tools.svg)](https://hub.docker.com/r/cnrsinist/ws-biblio-tools/)
- [chem-ner](./services/chem-ner) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-chem-ner.svg)](https://hub.docker.com/r/cnrsinist/ws-chem-ner/)
- [data-computer](./services/data-computer) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-computer.svg)](https://hub.docker.com/r/cnrsinist/ws-data-computer/)
- [data-homogenise](./services/data-homogenise) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-homogenise.svg)](https://hub.docker.com/r/cnrsinist/ws-data-homogenise/)
- [data-rapido](./services/data-rapido) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-rapido.svg)](https://hub.docker.com/r/cnrsinist/ws-data-rapido/)
- [data-termsuite](./services/data-termsuite) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-termsuite.svg)](https://hub.docker.com/r/cnrsinist/ws-data-termsuite/)
- [data-thesesul](./services/data-thesesul) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-thesesul.svg)](https://hub.docker.com/r/cnrsinist/ws-data-thesesul/)
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"services/biblio-tools",
"services/chem-ner",
"services/data-computer",
"services/data-homogenise",
"services/data-rapido",
"services/data-termsuite",
"services/data-thesesul",
Expand Down
7 changes: 7 additions & 0 deletions services/data-homogenise/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Ignore all files by default
*

# White list only the required files
!config.json
!v1
!swagger.json
37 changes: 37 additions & 0 deletions services/data-homogenise/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# syntax=docker/dockerfile:1.2
# Stage 1 ("dvc-files"): fetch the all-MiniLM-L6-v2 model tracked by DVC
# from a WebDAV remote. Credentials are passed as BuildKit secrets so they
# are never written into an image layer.
FROM python:3.9-slim-bullseye AS dvc-files
WORKDIR /dvc
# git is needed because DVC is initialised inside a git repository below
RUN apt update && apt -y install git
RUN pip install dvc[webdav]==3.39.0
# Configure the WebDAV remote from the three build secrets; --local keeps
# the credentials out of the committed DVC config.
RUN --mount=type=secret,id=webdav_login \
--mount=type=secret,id=webdav_password \
--mount=type=secret,id=webdav_url \
git init && \
dvc init && \
dvc remote add -d webdav-remote "$(cat /run/secrets/webdav_url)" && \
dvc remote modify --local webdav-remote user "$(cat /run/secrets/webdav_login)" && \
dvc remote modify --local webdav-remote password "$(cat /run/secrets/webdav_password)"
# Sanity-check the DVC installation/configuration before pulling
RUN dvc doctor
COPY ./v1/all-MiniLM-L6-v2.dvc /dvc
# Download the model files described by the .dvc pointer (~273 MB, 16 files)
RUN dvc pull -v


# Stage 2: the actual service image, based on the shared ezs python/pytorch
# server image.
FROM cnrsinist/ezs-python-pytorch-server:py3.9-no16-1.1.4

# Keep Hugging Face and Numba caches in writable, predictable locations
ENV HF_HOME=/app/.cache/huggingface
ENV NUMBA_CACHE_DIR=/tmp/numba_cache

USER root
# Install all python dependencies
RUN pip install --no-build-isolation \
--index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pypi.org/simple \
sentence-transformers==3.3.1 \
huggingface_hub==0.23.2

# Declare files to copy in .dockerignore
COPY --chown=daemon:daemon . /app/public/
# config.json must live at /app and be writable (ezmaster edits it at runtime)
RUN mv /app/public/config.json /app && chmod a+w /app/config.json
# NOTE(review): presumably used by the retrieve route's temporary files — confirm
RUN mkdir /tmp/retrieve

# Copy the model pulled in stage 1 next to the v1 routes
COPY --chown=daemon:daemon --from=dvc-files /dvc/all-MiniLM-L6-v2 /app/public/v1/all-MiniLM-L6-v2
5 changes: 5 additions & 0 deletions services/data-homogenise/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# ws-data-homogenise@0.0.0

Homogénéise les valeurs d'un champ.

Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur.
16 changes: 16 additions & 0 deletions services/data-homogenise/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"environnement": {
"EZS_TITLE": "Homogénéise les valeurs d'un champ.",
"EZS_DESCRIPTION": "Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur.",
"EZS_METRICS": true,
"EZS_CONCURRENCY": 2,
"EZS_CONTINUE_DELAY": 18000,
"EZS_PIPELINE_DELAY": 10800,
"EZS_NSHARDS": 32,
"EZS_CACHE": true,
"EZS_VERBOSE": false,
"NODE_OPTIONS": "--max_old_space_size=1024",
"NODE_ENV": "production"

}
}
Binary file added services/data-homogenise/example.tar.gz
Binary file not shown.
16 changes: 16 additions & 0 deletions services/data-homogenise/examples.http
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# These examples can be used directly in VSCode, using HTTPYac extension (anweber.vscode-httpyac)
# They are important, because used to generate the tests.hurl file.

# Décommenter/commenter les lignes voulues pour tester localement
@host=http://localhost:31976
# @host=https://data-homogenise.services.istex.fr

###
# @name v1Homogenise
# Homogenise the values of a field (actual route of this service)
POST {{host}}/v1/homogenise?indent=true HTTP/1.1
Content-Type: application/x-tar
X-Webhook-Success: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
X-Webhook-Failure: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9

< ./example.tar.gz
36 changes: 36 additions & 0 deletions services/data-homogenise/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"private": true,
"name": "ws-data-homogenise",
"version": "0.0.0",
"description": "Homogénéise les valeurs d'un champ.",
"repository": {
"type": "git",
"url": "git+https://github.com/Inist-CNRS/web-services.git"
},
"keywords": [
"ezmaster"
],
"author": "Léo Gaillard <leo.gaillard@cnrs.fr>",
"license": "MIT",
"bugs": {
"url": "https://github.com/Inist-CNRS/web-services/issues"
},
"homepage": "https://github.com/Inist-CNRS/web-services/#readme",
"scripts": {
"version:insert:readme": "sed -i \"s#\\(${npm_package_name}.\\)\\([\\.a-z0-9]\\+\\)#\\1${npm_package_version}#g\" README.md && git add README.md",
"version:insert:swagger": "sed -i \"s/\\\"version\\\": \\\"[0-9]\\+.[0-9]\\+.[0-9]\\+\\\"/\\\"version\\\": \\\"${npm_package_version}\\\"/g\" swagger.json && git add swagger.json",
"version:insert": "npm run version:insert:readme && npm run version:insert:swagger",
"version:commit": "git commit -a -m \"release ${npm_package_name}@${npm_package_version}\"",
"version:tag": "git tag \"${npm_package_name}@${npm_package_version}\" -m \"${npm_package_name}@${npm_package_version}\"",
"version:push": "git push && git push --tags",
"version": "npm run version:insert && npm run version:commit && npm run version:tag",
"postversion": "npm run version:push",
"build:dev": ". ./.env 2> /dev/null; DOCKER_BUILDKIT=1 docker build -t cnrsinist/${npm_package_name}:latest --secret id=webdav_login,env=WEBDAV_LOGIN --secret id=webdav_password,env=WEBDAV_PASSWORD --secret id=webdav_url,env=WEBDAV_URL .",
"start:dev": "npm run build:dev && docker run --name dev --rm --detach -p 31976:31976 cnrsinist/${npm_package_name}:latest",
"stop:dev": "docker stop dev",
"build": ". ./.env 2> /dev/null; DOCKER_BUILDKIT=1 docker build -t cnrsinist/${npm_package_name}:${npm_package_version} --secret id=webdav_login,env=WEBDAV_LOGIN --secret id=webdav_password,env=WEBDAV_PASSWORD --secret id=webdav_url,env=WEBDAV_URL .",
"start": "docker run --rm -p 31976:31976 cnrsinist/${npm_package_name}:${npm_package_version}",
"publish": "docker push cnrsinist/${npm_package_name}:${npm_package_version}"
},
"avoid-testing": false
}
33 changes: 33 additions & 0 deletions services/data-homogenise/swagger.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"openapi": "3.0.0",
"info": {
"title": "data-homogenise - Homogénéise les valeurs d'un champ.",
"description": "Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur.",
"version": "0.0.0",
"termsOfService": "https://services.istex.fr/",
"contact": {
"name": "Inist-CNRS",
"url": "https://www.inist.fr/nous-contacter/"
}
},
"servers": [
{
"x-comment": "Will be automatically completed by the ezs server."
},
{
"url": "http://vptdmjobs.intra.inist.fr:49206/",
"description": "Latest version for production",
"#DISABLED#x-profil": "Standard"
}
],
"tags": [
{
"name": "data-homogenise",
"description": "Homogénéise les valeurs d'un champ.",
"externalDocs": {
"description": "Plus de documentation",
"url": "https://github.com/inist-cnrs/web-services/tree/main/services/data-homogenise"
}
}
]
}
35 changes: 35 additions & 0 deletions services/data-homogenise/tests.hurl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# WARNING: This file was not generated, but manually written.
# DON'T OVERWRITE IT
# Use it to test:
# npx hurl --test data-homogenise/tests.hurl

##################################################################
# v1/homogenise
# Upload the example corpus; the service answers immediately with a
# computing token to retrieve the result later.
POST {{host}}/v1/homogenise
content-type: application/x-tar
x-hook: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
file,example.tar.gz;

HTTP 200
# Capture the computing token
[Captures]
computing_token: jsonpath "$[0].value"
[Asserts]
variable "computing_token" exists

# Exchange the token for the computed result (wait for the async job)
POST {{host}}/v1/retrieve-json
content-type: application/json
[Options]
delay: 30000
```
[
{
"value":"{{computing_token}}"
}
]
```

HTTP 200
Content-Type: application/json
# jsonpath asserts must live in an [Asserts] section
[Asserts]
jsonpath "$" count == 3
jsonpath "$[0].value[0]" == "Inist-CNRS, UAR76"
6 changes: 6 additions & 0 deletions services/data-homogenise/v1/all-MiniLM-L6-v2.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: 25a76b5a8c779766c28a1b3b17139477.dir
size: 273545812
nfiles: 16
hash: md5
path: all-MiniLM-L6-v2
28 changes: 28 additions & 0 deletions services/data-homogenise/v1/buffer.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
[use]
plugin = basics

# Save to disk so that every incoming object is accepted quickly and the
# client gets a fast answer that the asynchronous processing has started.
#
# The "fork" only detaches once all objects have entered the fork; if the
# processing is slower than saving to disk, a temporary file is required.
[pack]
[FILESave]
identifier = env('identifier')
location = /tmp/upload
compress = true

# Log which generator received data for which identifier
[debug]
text = fix('Data received by', env('generator'), 'for', env('identifier')).join(' ')

# Replace the stream with the name of the saved file, then reload and
# unpack it so downstream steps see the original objects again.
[exchange]
value = get('filename')

[FILELoad]
compress = true
location = /tmp/upload
[unpack]

[metrics]
bucket = buffer
30 changes: 30 additions & 0 deletions services/data-homogenise/v1/charger.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[use]
plugin = basics

# Step 0 (generic): read the standard tar.gz file
[TARExtract]
compress = true
path = */*.json

# Step 1 (generic): create a unique identifier for the received corpus
[singleton]

# Step 1.1: use a timestamped field name to avoid clobbering an existing field
[singleton/env]
path = pid
value = fix(`PID${Date.now()}`)

# Step 1.2: generate a unique identifier under that field
[singleton/identify]
path = env('pid')

# Step 1.3: keep the generated identifier in the environment (simplified,
# without its 'uid:/' prefix)
[singleton/env]
path = identifier
value = get(env('pid')).replace('uid:/', '')

# Drop the temporary identifier field from the objects themselves
[singleton/exchange]
value = self().omit([env('pid')])

[metrics]
bucket = charger
67 changes: 67 additions & 0 deletions services/data-homogenise/v1/homogenise.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Entrypoint output format
mimeType = application/json

# OpenAPI Documentation - JSON format (dot notation)
post.operationId = post-v1-homogenise
post.summary = Homogénéise les valeurs d'un champ.
post.description = Parcourt l'ensemble des valeurs d'un champ et regroupe les valeurs sémantiquement ou syntaxiquement très proches. La première valeur rencontrée considérée comme proche est prise comme référence et les prochaines occurrences seront remplacées par cette première valeur.
post.tags.0 = homogenise
post.requestBody.content.application/x-gzip.schema.type = string
post.requestBody.content.application/x-gzip.schema.format = binary
post.requestBody.content.application/x-tar.schema.type = string
post.requestBody.content.application/x-tar.schema.format = binary
post.requestBody.required = true
post.responses.default.description = Informations permettant de récupérer les données le moment venu
post.parameters.0.description = Indenter le JSON résultant
post.parameters.0.in = query
post.parameters.0.name = indent
post.parameters.0.schema.type = boolean
post.parameters.1.description = URL pour signaler que le traitement est terminé
post.parameters.1.in = header
post.parameters.1.name = X-Webhook-Success
post.parameters.1.schema.type = string
post.parameters.1.schema.format = uri
post.parameters.1.required = false
post.parameters.2.description = URL pour signaler que le traitement a échoué
post.parameters.2.in = header
post.parameters.2.name = X-Webhook-Failure
post.parameters.2.schema.type = string
post.parameters.2.schema.format = uri
post.parameters.2.required = false
post.parameters.3.in = query
# NOTE(review): "thereshold" is a misspelling of "threshold". It is kept
# as-is because the [fork/exec] step below reads env('thereshold'); the
# published query-parameter name and that env() lookup must be renamed
# together, in a single change, to avoid breaking callers.
post.parameters.3.name = thereshold
post.parameters.3.schema.type = number
post.parameters.3.description = threshold of similarity to use to homogenise (in percent)
#'

[use]
plugin = @ezs/spawn

# Record which generator produced this run (used in logs and notifications)
[env]
path = generator
value = homogenise

# Step 1 (generic): load the corpus file
[delegate]
file = charger.cfg

# Step 2 (generic): process the received items asynchronously
[fork]
standalone = true
logger = logger.cfg

# Step 2.1 (specific): run the computation on all received items
[fork/exec]
# command must be executable!
command = ./v1/homogenise.py
args = fix('-p')
args = env('thereshold', "70")

# Step 2.2 (generic): record the result and signal that processing is done
[fork/delegate]
file = recorder.cfg

# Step 3: immediately return a single item telling the client how to
# retrieve the result once it is ready
[delegate]
file = recipient.cfg

Loading

0 comments on commit 3862e9c

Please sign in to comment.