From 3862e9cfaec4e8574a7242b3bb1ee4957a4b2e98 Mon Sep 17 00:00:00 2001
From: leogail
Date: Wed, 4 Dec 2024 11:01:37 +0100
Subject: [PATCH] feat(data-homogenise): create service

---
 README.md                                      |   1 +
 package.json                                   |   1 +
 services/data-homogenise/.dockerignore         |   7 ++
 services/data-homogenise/Dockerfile            |  37 +++++++
 services/data-homogenise/README.md             |   5 +
 services/data-homogenise/config.json           |  16 +++
 services/data-homogenise/example.tar.gz        | Bin 0 -> 538 bytes
 services/data-homogenise/examples.http         |  16 +++
 services/data-homogenise/package.json          |  36 +++++++
 services/data-homogenise/swagger.json          |  33 ++++++
 services/data-homogenise/tests.hurl            |  35 ++++++
 .../data-homogenise/v1/all-MiniLM-L6-v2.dvc    |   6 ++
 services/data-homogenise/v1/buffer.cfg         |  28 +++++
 services/data-homogenise/v1/charger.cfg        |  30 ++++++
 services/data-homogenise/v1/homogenise.ini     |  67 ++++++++++++
 services/data-homogenise/v1/homogenise.py      | 100 ++++++++++++++++++
 services/data-homogenise/v1/logger.cfg         |  55 ++++++++++
 services/data-homogenise/v1/recipient.cfg      |  12 +++
 services/data-homogenise/v1/recorder.cfg       |  54 ++++++++++
 services/data-homogenise/v1/retrieve-csv.ini   |  35 ++++++
 services/data-homogenise/v1/retrieve-json.ini  |  36 +++++++
 services/data-homogenise/v1/retrieve.ini       |  28 +++++
 22 files changed, 638 insertions(+)
 create mode 100644 services/data-homogenise/.dockerignore
 create mode 100644 services/data-homogenise/Dockerfile
 create mode 100644 services/data-homogenise/README.md
 create mode 100644 services/data-homogenise/config.json
 create mode 100644 services/data-homogenise/example.tar.gz
 create mode 100644 services/data-homogenise/examples.http
 create mode 100644 services/data-homogenise/package.json
 create mode 100644 services/data-homogenise/swagger.json
 create mode 100644 services/data-homogenise/tests.hurl
 create mode 100644 services/data-homogenise/v1/all-MiniLM-L6-v2.dvc
 create mode 100644 services/data-homogenise/v1/buffer.cfg
 create mode 100644 services/data-homogenise/v1/charger.cfg
 create mode 100644 services/data-homogenise/v1/homogenise.ini
 create mode 100755 services/data-homogenise/v1/homogenise.py
 create mode 100644 services/data-homogenise/v1/logger.cfg
 create mode 100644 services/data-homogenise/v1/recipient.cfg
 create mode 100644 services/data-homogenise/v1/recorder.cfg
 create mode 100644 services/data-homogenise/v1/retrieve-csv.ini
 create mode 100644 services/data-homogenise/v1/retrieve-json.ini
 create mode 100644 services/data-homogenise/v1/retrieve.ini

diff --git a/README.md b/README.md
index f195e928..bde0052b 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ All contributing instructions are in [CONTRIBUTING](CONTRIBUTING.md).
 - [biblio-tools](./services/biblio-tools) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-biblio-tools.svg)](https://hub.docker.com/r/cnrsinist/ws-biblio-tools/)
 - [chem-ner](./services/chem-ner) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-chem-ner.svg)](https://hub.docker.com/r/cnrsinist/ws-chem-ner/)
 - [data-computer](./services/data-computer) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-computer.svg)](https://hub.docker.com/r/cnrsinist/ws-data-computer/)
+- [data-homogenise](./services/data-homogenise) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-homogenise.svg)](https://hub.docker.com/r/cnrsinist/ws-data-homogenise/)
 - [data-rapido](./services/data-rapido) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-rapido.svg)](https://hub.docker.com/r/cnrsinist/ws-data-rapido/)
 - [data-termsuite](./services/data-termsuite) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-termsuite.svg)](https://hub.docker.com/r/cnrsinist/ws-data-termsuite/)
 - [data-thesesul](./services/data-thesesul) [![Docker Pulls](https://img.shields.io/docker/pulls/cnrsinist/ws-data-thesesul.svg)](https://hub.docker.com/r/cnrsinist/ws-data-thesesul/)
diff --git a/package.json b/package.json
index ee9683d0..c1179f9c 100644
--- a/package.json
+++ b/package.json
@@ -49,6 +49,7 @@
         "services/biblio-tools",
         "services/chem-ner",
         "services/data-computer",
+        "services/data-homogenise",
         "services/data-rapido",
         "services/data-termsuite",
         "services/data-thesesul",
diff --git a/services/data-homogenise/.dockerignore b/services/data-homogenise/.dockerignore
new file mode 100644
index 00000000..e280cbb7
--- /dev/null
+++ b/services/data-homogenise/.dockerignore
@@ -0,0 +1,7 @@
+# Ignore all files by default
+*
+
+# Whitelist only the required files
+!config.json
+!v1
+!swagger.json
diff --git a/services/data-homogenise/Dockerfile b/services/data-homogenise/Dockerfile
new file mode 100644
index 00000000..34b4a0b9
--- /dev/null
+++ b/services/data-homogenise/Dockerfile
@@ -0,0 +1,37 @@
+# syntax=docker/dockerfile:1.2
+FROM python:3.9-slim-bullseye AS dvc-files
+WORKDIR /dvc
+RUN apt-get update && apt-get install -y git
+RUN pip install "dvc[webdav]==3.39.0"
+RUN --mount=type=secret,id=webdav_login \
+    --mount=type=secret,id=webdav_password \
+    --mount=type=secret,id=webdav_url \
+    git init && \
+    dvc init && \
+    dvc remote add -d webdav-remote "$(cat /run/secrets/webdav_url)" && \
+    dvc remote modify --local webdav-remote user "$(cat /run/secrets/webdav_login)" && \
+    dvc remote modify --local webdav-remote password "$(cat /run/secrets/webdav_password)"
+RUN dvc doctor
+COPY ./v1/all-MiniLM-L6-v2.dvc /dvc
+RUN dvc pull -v
+
+
+FROM cnrsinist/ezs-python-pytorch-server:py3.9-no16-1.1.4
+
+ENV HF_HOME=/app/.cache/huggingface
+ENV NUMBA_CACHE_DIR=/tmp/numba_cache
+
+USER root
+# Install all Python dependencies
+RUN pip install --no-build-isolation \
+    --index-url https://download.pytorch.org/whl/cpu \
+    --extra-index-url https://pypi.org/simple \
+    sentence-transformers==3.3.1 \
+    huggingface_hub==0.23.2
+
+# The files to copy are whitelisted in .dockerignore
+COPY --chown=daemon:daemon . /app/public/
+RUN mv /app/public/config.json /app && chmod a+w /app/config.json
+RUN mkdir /tmp/retrieve
+
+COPY --chown=daemon:daemon --from=dvc-files /dvc/all-MiniLM-L6-v2 /app/public/v1/all-MiniLM-L6-v2
diff --git a/services/data-homogenise/README.md b/services/data-homogenise/README.md
new file mode 100644
index 00000000..c8acd715
--- /dev/null
+++ b/services/data-homogenise/README.md
@@ -0,0 +1,5 @@
+# ws-data-homogenise@0.0.0
+
+Homogenises the values of a field.
+
+Goes through all the values of a field and groups together values that are semantically or syntactically very close. The first value encountered is used as the reference, and subsequent values considered close to it are replaced by that first value.
diff --git a/services/data-homogenise/config.json b/services/data-homogenise/config.json
new file mode 100644
index 00000000..00b6fbf5
--- /dev/null
+++ b/services/data-homogenise/config.json
@@ -0,0 +1,16 @@
+{
+    "environnement": {
+        "EZS_TITLE": "Homogenises the values of a field.",
+        "EZS_DESCRIPTION": "Goes through all the values of a field and groups together values that are semantically or syntactically very close. The first value encountered is used as the reference, and subsequent values considered close to it are replaced by that first value.",
+        "EZS_METRICS": true,
+        "EZS_CONCURRENCY": 2,
+        "EZS_CONTINUE_DELAY": 18000,
+        "EZS_PIPELINE_DELAY": 10800,
+        "EZS_NSHARDS": 32,
+        "EZS_CACHE": true,
+        "EZS_VERBOSE": false,
+        "NODE_OPTIONS": "--max_old_space_size=1024",
+        "NODE_ENV": "production"
+
+    }
+}
diff --git a/services/data-homogenise/example.tar.gz b/services/data-homogenise/example.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..2a0b7cc114e3f22a06dd2f6a9b72acecadfbd087
GIT binary patch
literal 538
zcmV+#0_FW5iwFP!000001MQXFZqqOn$9LVQIPz61rD>c`C|6Y{4cNMiwc8Lk$Rwwm
zS>i;tlXXmeAKrr(!)a4K3J6VXt19IeDRGW{j*k=nj}xZZVD4<7_9vB#gU7|h4SgRD
z)>CXh4SodFk6hoOLEwhqP}g+>)BnhM{E<>=CQT~;8)tiQd%`{82^kVfEK;$wMALpF
zxYGUc>DknR^U+xpl4WPtX|_BO$*=l%ccFhu!{7DqwfcV!8vVOJ>E8ueBD_T=;61yu
zAn*gTpbI9F9Lq{#KmGbe^-r0I@1XmmSoGR5v4?Ew-`kD;gI51LqS3$im-GJtwV;*%
zK9VuP=cx2k=zhb$^dCmfmi}qvg{}UdgZmBui6t`4b8*PDDF8q&F|ok$;SghR9oGlv
z<gaSG;R*#xrt{jJRCExlYTZ#V$ucx?mFcM$R8N=Gqs-k}zc3Vd1JoaxTD+-jhLRoQ>uWjlhF6_B%K83M=cbMd#d7838U^~!#N9ij#
z;i6o?;zjr&^m{NWiWD#L<1yERz>93N&%4L3XOj~PQhtMQgz-(@gYmV@GaOKl+D?7J
cl+BssjSJ~4cfD^~T3Ys>Z$2#u2mllS0D",
+    "license": "MIT",
+    "bugs": {
+        "url": "https://github.com/Inist-CNRS/web-services/issues"
+    },
+    "homepage": "https://github.com/Inist-CNRS/web-services/#readme",
+    "scripts": {
+        "version:insert:readme": "sed -i \"s#\\(${npm_package_name}.\\)\\([\\.a-z0-9]\\+\\)#\\1${npm_package_version}#g\" README.md && git add README.md",
+        "version:insert:swagger": "sed -i \"s/\\\"version\\\": \\\"[0-9]\\+.[0-9]\\+.[0-9]\\+\\\"/\\\"version\\\": \\\"${npm_package_version}\\\"/g\" swagger.json && git add swagger.json",
+        "version:insert": "npm run version:insert:readme && npm run version:insert:swagger",
+        "version:commit": "git commit -a -m \"release ${npm_package_name}@${npm_package_version}\"",
+        "version:tag": "git tag \"${npm_package_name}@${npm_package_version}\" -m \"${npm_package_name}@${npm_package_version}\"",
+        "version:push": "git push && git push --tags",
+        "version": "npm run version:insert && npm run version:commit && npm run version:tag",
+        "postversion": "npm run version:push",
+        "build:dev": ". ./.env 2> /dev/null; DOCKER_BUILDKIT=1 docker build -t cnrsinist/${npm_package_name}:latest --secret id=webdav_login,env=WEBDAV_LOGIN --secret id=webdav_password,env=WEBDAV_PASSWORD --secret id=webdav_url,env=WEBDAV_URL .",
+        "start:dev": "npm run build:dev && docker run --name dev --rm --detach -p 31976:31976 cnrsinist/${npm_package_name}:latest",
+        "stop:dev": "docker stop dev",
+        "build": ". ./.env 2> /dev/null; DOCKER_BUILDKIT=1 docker build -t cnrsinist/${npm_package_name}:${npm_package_version} --secret id=webdav_login,env=WEBDAV_LOGIN --secret id=webdav_password,env=WEBDAV_PASSWORD --secret id=webdav_url,env=WEBDAV_URL .",
+        "start": "docker run --rm -p 31976:31976 cnrsinist/${npm_package_name}:${npm_package_version}",
+        "publish": "docker push cnrsinist/${npm_package_name}:${npm_package_version}"
+    },
+    "avoid-testing": false
+}
diff --git a/services/data-homogenise/swagger.json b/services/data-homogenise/swagger.json
new file mode 100644
index 00000000..bf40faf4
--- /dev/null
+++ b/services/data-homogenise/swagger.json
@@ -0,0 +1,33 @@
+{
+    "openapi": "3.0.0",
+    "info": {
+        "title": "data-homogenise - Homogenises the values of a field.",
+        "description": "Goes through all the values of a field and groups together values that are semantically or syntactically very close. The first value encountered is used as the reference, and subsequent values considered close to it are replaced by that first value.",
+        "version": "0.0.0",
+        "termsOfService": "https://services.istex.fr/",
+        "contact": {
+            "name": "Inist-CNRS",
+            "url": "https://www.inist.fr/nous-contacter/"
+        }
+    },
+    "servers": [
+        {
+            "x-comment": "Will be automatically completed by the ezs server."
+        },
+        {
+            "url": "http://vptdmjobs.intra.inist.fr:49206/",
+            "description": "Latest version for production",
+            "#DISABLED#x-profil": "Standard"
+        }
+    ],
+    "tags": [
+        {
+            "name": "data-homogenise",
+            "description": "Homogenises the values of a field.",
+            "externalDocs": {
+                "description": "More documentation",
+                "url": "https://github.com/inist-cnrs/web-services/tree/main/services/data-homogenise"
+            }
+        }
+    ]
+}
diff --git a/services/data-homogenise/tests.hurl b/services/data-homogenise/tests.hurl
new file mode 100644
index 00000000..5b76932c
--- /dev/null
+++ b/services/data-homogenise/tests.hurl
@@ -0,0 +1,35 @@
+# WARNING: This file was not generated, but manually written.
+# DON'T OVERWRITE IT
+# Use it to test:
+#     npx hurl --test data-homogenise/tests.hurl
+
+##################################################################
+# v1/homogenise
+POST {{host}}/v1/homogenise
+content-type: application/x-tar
+x-hook: https://webhook.site/69300b22-a251-4c16-9905-f7ba218ae7e9
+file,example.tar.gz;
+
+HTTP 200
+# Capture the computing token
+[Captures]
+computing_token: jsonpath "$[0].value"
+[Asserts]
+variable "computing_token" exists
+
+POST {{host}}/v1/retrieve-json
+content-type: application/json
+[Options]
+delay: 30000
+```
+[
+    {
+        "value": "{{computing_token}}"
+    }
+]
+```
+
+HTTP 200
+Content-Type: application/json
+[Asserts]
+jsonpath "$" count == 3
+jsonpath "$[0].value[0]" == "Inist-CNRS, UAR76"
diff --git a/services/data-homogenise/v1/all-MiniLM-L6-v2.dvc b/services/data-homogenise/v1/all-MiniLM-L6-v2.dvc
new file mode 100644
index 00000000..baadf72b
--- /dev/null
+++ b/services/data-homogenise/v1/all-MiniLM-L6-v2.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 25a76b5a8c779766c28a1b3b17139477.dir
+  size: 273545812
+  nfiles: 16
+  hash: md5
+  path: all-MiniLM-L6-v2
diff --git a/services/data-homogenise/v1/buffer.cfg b/services/data-homogenise/v1/buffer.cfg
new file mode 100644
index 00000000..d1ccaa6a
--- /dev/null
+++ b/services/data-homogenise/v1/buffer.cfg
@@ -0,0 +1,28 @@
+[use]
+plugin = basics
+
+# Save to disk so that all incoming objects are accepted quickly
+# and the client is told quickly that the asynchronous processing has started.
+#
+# The "fork" only detaches once all objects have entered it.
+# If processing is slower than saving to disk,
+# a temporary file is needed.
+[pack]
+[FILESave]
+identifier = env('identifier')
+location = /tmp/upload
+compress = true
+
+[debug]
+text = fix('Data received by', env('generator'), 'for', env('identifier')).join(' ')
+
+[exchange]
+value = get('filename')
+
+[FILELoad]
+compress = true
+location = /tmp/upload
+[unpack]
+
+[metrics]
+bucket = buffer
diff --git a/services/data-homogenise/v1/charger.cfg b/services/data-homogenise/v1/charger.cfg
new file mode 100644
index 00000000..0d727663
--- /dev/null
+++ b/services/data-homogenise/v1/charger.cfg
@@ -0,0 +1,30 @@
+[use]
+plugin = basics
+
+# Step 0 (generic): read the standard tar.gz file
+[TARExtract]
+compress = true
+path = */*.json
+
+# Step 1 (generic): create a unique identifier for the received corpus
+[singleton]
+
+# Step 1.1: avoid picking up an existing uri field
+[singleton/env]
+path = pid
+value = fix(`PID${Date.now()}`)
+
+# Step 1.2: generate a unique identifier
+[singleton/identify]
+path = env('pid')
+
+# Step 1.3: keep the generated identifier in memory (in a simplified form)
+[singleton/env]
+path = identifier
+value = get(env('pid')).replace('uid:/', '')
+
+[singleton/exchange]
+value = self().omit([env('pid')])
+
+[metrics]
+bucket = charger
diff --git a/services/data-homogenise/v1/homogenise.ini b/services/data-homogenise/v1/homogenise.ini
new file mode 100644
index 00000000..9fe42b35
--- /dev/null
+++ b/services/data-homogenise/v1/homogenise.ini
@@ -0,0 +1,67 @@
+# Entrypoint output format
+mimeType = application/json
+
+# OpenAPI Documentation - JSON format (dot notation)
+post.operationId = post-v1-homogenise
+post.summary = Homogenises the values of a field.
+post.description = Goes through all the values of a field and groups together values that are semantically or syntactically very close. The first value encountered is used as the reference, and subsequent values considered close to it are replaced by that first value.
+post.tags.0 = homogenise
+post.requestBody.content.application/x-gzip.schema.type = string
+post.requestBody.content.application/x-gzip.schema.format = binary
+post.requestBody.content.application/x-tar.schema.type = string
+post.requestBody.content.application/x-tar.schema.format = binary
+post.requestBody.required = true
+post.responses.default.description = Information needed to retrieve the data when it is ready
+post.parameters.0.description = Indent the resulting JSON
+post.parameters.0.in = query
+post.parameters.0.name = indent
+post.parameters.0.schema.type = boolean
+post.parameters.1.description = URL called to signal that processing has finished
+post.parameters.1.in = header
+post.parameters.1.name = X-Webhook-Success
+post.parameters.1.schema.type = string
+post.parameters.1.schema.format = uri
+post.parameters.1.required = false
+post.parameters.2.description = URL called to signal that processing has failed
+post.parameters.2.in = header
+post.parameters.2.name = X-Webhook-Failure
+post.parameters.2.schema.type = string
+post.parameters.2.schema.format = uri
+post.parameters.2.required = false
+post.parameters.3.in = query
+post.parameters.3.name = threshold
+post.parameters.3.schema.type = number
+post.parameters.3.description = Similarity threshold used to homogenise (in percent)
+#'
+
+[use]
+plugin = @ezs/spawn
+
+[env]
+path = generator
+value = homogenise
+
+# Step 1 (generic): load the corpus file
+[delegate]
+file = charger.cfg
+
+# Step 2 (generic): process the received items asynchronously
+[fork]
+standalone = true
+logger = logger.cfg
+
+# Step 2.1 (specific): run a computation on all the received items
+[fork/exec]
+# the command must be executable!
+command = ./v1/homogenise.py
+args = fix('-p')
+args = env('threshold', "70")
+
+# Step 2.2 (generic): record the result and signal that processing is finished
+[fork/delegate]
+file = recorder.cfg
+
+# Step 3: immediately return a single element describing how to retrieve the result once it is ready
+[delegate]
+file = recipient.cfg
diff --git a/services/data-homogenise/v1/homogenise.py b/services/data-homogenise/v1/homogenise.py
new file mode 100755
index 00000000..e365bd89
--- /dev/null
+++ b/services/data-homogenise/v1/homogenise.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import json
+import sys
+
+from sentence_transformers import SentenceTransformer
+
+# Similarity threshold, given in percent with -p (default: 70)
+similarity_threshold = int(sys.argv[sys.argv.index('-p') + 1] if '-p' in sys.argv else 70) / 100
+if not 0 <= similarity_threshold <= 1:
+    similarity_threshold = 0.7
+model = SentenceTransformer('./v1/all-MiniLM-L6-v2')
+
+
+def homogenise(phrases, similarity_matrix, similarity_threshold=similarity_threshold):
+    """Replace each phrase by the first phrase it is similar to."""
+    output = []
+    already_homogenised = {}
+    for i in range(similarity_matrix.shape[0]):
+        for j in range(i + 1):
+            if i == j:
+                # No earlier phrase is similar enough: keep the phrase itself
+                output.append(phrases[i])
+                break
+            similarity_value = similarity_matrix[i, j].item()
+            if similarity_value < similarity_threshold:
+                continue
+            # Follow the chain of replacements. Without this step: suppose we
+            # have 3 documents a, b, c; if b is homogenised to a, and c to b,
+            # the output would be a a b, whereas we want a a a.
+            indice_is_homogen_to = i
+            already_homogenised[indice_is_homogen_to] = j
+            while indice_is_homogen_to in already_homogenised:
+                indice_is_homogen_to = already_homogenised[indice_is_homogen_to]
+            output.append(phrases[indice_is_homogen_to])
+            break
+    return output
+
+
+# Web service: read the corpus as JSON lines from stdin
+all_data = []
+for line in sys.stdin:
+    data = json.loads(line)
+    all_data.append(data)
+
+len_data = len(all_data)
+indice_noise = []
+texts = []
+for i in range(len_data):
+    try:
+        line = all_data[i]
+        if "value" in line:
+            value = line["value"]
+            if isinstance(value, list):
+                texts.append([elt for elt in value if isinstance(elt, str)])
+            elif isinstance(value, str):
+                texts.append([value])
+            else:
+                indice_noise.append(i)
+        else:
+            indice_noise.append(i)
+    except Exception:
+        indice_noise.append(i)
+
+# Flatten, keeping for each phrase the index of its (non-noise) row
+phrases = []
+indices_lignes = []
+for i, sous_liste in enumerate(texts):
+    for phrase in sous_liste:
+        phrases.append(phrase)
+        indices_lignes.append(i)
+
+embeddings = model.encode(phrases)
+similarity_matrix = model.similarity(embeddings, embeddings)
+
+output = homogenise(phrases, similarity_matrix)
+
+len_ligne = len(indices_lignes)
+indice_not_noise = 0
+for i in range(len_data):
+    if i in indice_noise:
+        all_data[i]["value"] = []
+    else:
+        all_data[i]["value"] = []
+        for j in range(len_ligne):
+            if indices_lignes[j] == indice_not_noise:
+                all_data[i]["value"].append(output[j])
+        indice_not_noise += 1  # increment only when the row isn't noise
+
+# Write the whole corpus at once
+for line in all_data:
+    sys.stdout.write(json.dumps(line))
+    sys.stdout.write("\n")
diff --git a/services/data-homogenise/v1/logger.cfg b/services/data-homogenise/v1/logger.cfg
new file mode 100644
index 00000000..5f980e58
--- /dev/null
+++ b/services/data-homogenise/v1/logger.cfg
@@ -0,0 +1,55 @@
+[use]
+plugin = basics
+plugin = analytics
+
+[metrics]
+bucket = logger
+
+# Keep only the first error raised
+[shift]
+
+[debug]
+text = Error trapped
+
+[assign]
+path = body.identifier
+value = env('identifier')
+
+path = body.generator
+value = env('generator')
+
+path = body.error.type
+value = get('type')
+
+path = body.error.scope
+value = get('scope')
+
+path = body.error.message
+value = get('message')
+
+path = env
+value = env()
+
+[swing]
+test = env('headers.x-webhook-failure').startsWith('http')
+
+[swing/URLFetch]
+url = env('headers.x-webhook-failure').trim()
+path = body
+headers = Content-Type:application/json
+target = result
+retries = 5
+timeout = 30000
+
+# Only record a few pieces of information (remove this to keep the full trace)
+[exchange]
+value = get('body')
+
+[FILESave]
+location = /tmp/retrieve
+identifier = env('identifier')
+jsonl = true
+compress = false
+
+[debug]
+text = Error was saved
diff --git a/services/data-homogenise/v1/recipient.cfg b/services/data-homogenise/v1/recipient.cfg
new file mode 100644
index 00000000..f7238303
--- /dev/null
+++ b/services/data-homogenise/v1/recipient.cfg
@@ -0,0 +1,12 @@
+[use]
+plugin = basics
+
+[shift]
+[replace]
+path = id
+value = env('generator')
+path = value
+value = env('identifier')
+
+[JSONString]
+indent = env('indent')
diff --git a/services/data-homogenise/v1/recorder.cfg b/services/data-homogenise/v1/recorder.cfg
new file mode 100644
index 00000000..449c46f2
--- /dev/null
+++ b/services/data-homogenise/v1/recorder.cfg
@@ -0,0 +1,54 @@
+[use]
+plugin = basics
+plugin = analytics
+
+[singleton]
+[singleton/debug]
+text = fix('First result received by', env('generator'), 'for', env('identifier')).join(' ')
+
+[metrics]
+bucket = recorder
+
+# Step 2.2 (generic): create a standard result file
+[TARDump]
+compress = true
+manifest = fix({version: '1'})
+manifest = fix({identifier: env('identifier')})
+manifest = fix({generator: env('generator')})
+
+# Step 2.3 (generic): save the result to disk
+[FILESave]
+location = /tmp/retrieve
+identifier = env('identifier')
+jsonl = false
+compress = false
+
+# Step 2.4 (generic): signal the end of processing by calling a webhook (if one was provided)
+[swing]
+test = env('headers.x-webhook-success').startsWith('http')
+
+# Step 2.4.1 (generic): select the information to send to the webhook
+[swing/replace]
+path = url
+value = env('headers.x-webhook-success')
+path = body
+value = self().pick(['size', 'atime', 'mtime', 'ctime']).set('identifier', env('identifier')).set('generator', env('generator')).set('state', 'ready')
+
+[swing/debug]
+text = fix('Result generated by', env('generator'), 'for', env('identifier')).join(' ')
+
+# Step 2.4.2 (generic): send the HTTP request
+[swing/URLFetch]
+url = env('headers.x-webhook-success').trim()
+path = body
+headers = Content-Type:application/json
+retries = 5
+timeout = 30000
+
+# Step 2.4.3 (optional): add a trace to the log
+[swing/debug]
+text = fix('WebHook triggered by', env('generator'), 'for', env('identifier')).join(' ')
+
+# Step 2.5 (optional): add a trace to the log
+[debug]
+text = fix('Process completed by', env('generator'), 'for', env('identifier')).join(' ')
diff --git a/services/data-homogenise/v1/retrieve-csv.ini b/services/data-homogenise/v1/retrieve-csv.ini
new file mode 100644
index 00000000..23d5bcb2
--- /dev/null
+++ b/services/data-homogenise/v1/retrieve-csv.ini
@@ -0,0 +1,35 @@
+# Entrypoint output format
+mimeType = text/csv
+
+# OpenAPI Documentation - JSON format (dot notation)
+post.operationId = post-v1-retrieve-csv
+post.summary = Retrieve a produced result as a CSV stream
+post.description = Processing is asynchronous, so the result, once created, must be retrieved through this route
+post.tags.0 = homogenise
+post.responses.default.description = Corpus file in CSV format
+post.requestBody.content.application/json.example.0.value = xMkWJX7GU
+post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream
+post.requestBody.required = true
+
+[use]
+plugin = basics
+
+[JSONParse]
+separator = *
+
+[exchange]
+value = get('value')
+
+[FILELoad]
+location = /tmp/retrieve
+
+[TARExtract]
+compress = true
+path = */*.json
+
+[exchange]
+value = self().mapValues(value => typeof value === 'object' ? JSON.stringify(value) : value)
+
+[CSVString]
+separator = fix(',')
+format = strict
diff --git a/services/data-homogenise/v1/retrieve-json.ini b/services/data-homogenise/v1/retrieve-json.ini
new file mode 100644
index 00000000..0ea9743e
--- /dev/null
+++ b/services/data-homogenise/v1/retrieve-json.ini
@@ -0,0 +1,36 @@
+# Entrypoint output format
+mimeType = application/json
+
+# OpenAPI Documentation - JSON format (dot notation)
+post.operationId = post-v1-retrieve-json
+post.summary = Retrieve a produced result as a JSON stream
+post.description = Processing is asynchronous, so the result, once created, must be retrieved through this route
+post.tags.0 = homogenise
+post.responses.default.description = Corpus file in JSON format
+post.requestBody.content.application/json.example.0.value = xMkWJX7GU
+post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream
+post.requestBody.required = true
+post.parameters.0.description = Indent the resulting JSON
+post.parameters.0.in = query
+post.parameters.0.name = indent
+post.parameters.0.schema.type = boolean
+
+[use]
+plugin = basics
+
+[JSONParse]
+separator = *
+
+[exchange]
+value = get('value')
+
+[FILELoad]
+location = /tmp/retrieve
+
+[TARExtract]
+compress = true
+path = */*.json
+
+[JSONString]
+indent = env('indent')
+
diff --git a/services/data-homogenise/v1/retrieve.ini b/services/data-homogenise/v1/retrieve.ini
new file mode 100644
index 00000000..db22225e
--- /dev/null
+++ b/services/data-homogenise/v1/retrieve.ini
@@ -0,0 +1,28 @@
+# Entrypoint output format
+mimeType = application/x-gzip
+extension = tar.gz
+
+# OpenAPI Documentation - JSON format (dot notation)
+post.operationId = post-v1-retrieve
+post.summary = Retrieve a produced result as a corpus file
+post.description = Processing is asynchronous, so the result, once created, must be retrieved through this route
+post.tags.0 = homogenise
+post.responses.default.description = Corpus file in tar.gz format
+post.responses.default.content.application/x-gzip.schema.type = string
+post.responses.default.content.application/x-gzip.schema.format = binary
+post.requestBody.content.application/json.example.0.value = xMkWJX7GU
+post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream
+post.requestBody.required = true
+
+[use]
+plugin = basics
+
+[JSONParse]
+separator = *
+
+[exchange]
+value = get('value')
+
+[FILELoad]
+location = /tmp/retrieve
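
For a quick manual check of the service, here is a minimal Python client sketch of the same asynchronous exchange that tests.hurl exercises: submit the corpus, capture the computing token, then trade the token for the homogenised corpus. The host URL and the fixed 30-second delay are assumptions taken from package.json and tests.hurl, and the `requests` package is assumed to be available; a real caller would rather pass an X-Webhook-Success header and skip the sleep.

#!/usr/bin/env python3
# Hypothetical client for a local instance started with `npm run start:dev`.
import time

import requests  # assumption: installed in the caller's environment

HOST = "http://localhost:31976"  # assumption: default dev port from package.json

# 1. Submit the corpus; the service answers at once with a computing token.
with open("example.tar.gz", "rb") as corpus:
    answer = requests.post(
        f"{HOST}/v1/homogenise",
        data=corpus,
        headers={"Content-Type": "application/x-tar"},
    )
answer.raise_for_status()
token = answer.json()[0]["value"]

# 2. The computation runs asynchronously in a fork: wait, then exchange the
#    token for the result (tests.hurl uses the same 30 s delay).
time.sleep(30)
result = requests.post(f"{HOST}/v1/retrieve-json", json=[{"value": token}])
result.raise_for_status()
print(result.json())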