From d58eac760806a6b0dd3355b26970bf73895c94be Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 4 Apr 2023 08:11:57 +0200 Subject: [PATCH 1/7] add timestamps to job info --- ocrd_lib.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_lib.sh b/ocrd_lib.sh index 9320169..b263912 100644 --- a/ocrd_lib.sh +++ b/ocrd_lib.sh @@ -56,6 +56,7 @@ init() { mkdir -p /run/lock/ocrd.jobs/ { echo PID=$PID + echo TIME_CREATED=$(date --rfc-3339=seconds) echo PROCESS_ID=$PROCESS_ID echo TASK_ID=$TASK_ID echo PROCESS_DIR=$PROCESS_DIR @@ -68,7 +69,8 @@ init() { } logret() { - sed -i 1s/.*/RETVAL=$?/ /run/lock/ocrd.jobs/$REMOTEDIR + sed -i "1s/.*/RETVAL=$?/" /run/lock/ocrd.jobs/$REMOTEDIR + sed -i "2a TIME_TERMINATED=$(date --rfc-3339=seconds)" /run/lock/ocrd.jobs/$REMOTEDIR } init_task() { From fe075651e81f634138ed228e5d2867906875e8b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 11 Apr 2023 15:44:18 +0200 Subject: [PATCH 2/7] make test: use 'run -i' to make interruptible --- Makefile | 4 ++-- ocr-workflow-default.sh => workflows/ocr-workflow-default.sh | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename ocr-workflow-default.sh => workflows/ocr-workflow-default.sh (100%) diff --git a/Makefile b/Makefile index 238e6ab..5e945d2 100644 --- a/Makefile +++ b/Makefile @@ -88,7 +88,7 @@ test-production: $(DATA)/testdata-production ifeq ($(NETWORK),bridge) ssh -i $(PRIVATE) -Tn -p $(PORT) ocrd@localhost $(SCRIPT) $( Date: Tue, 11 Apr 2023 15:45:21 +0200 Subject: [PATCH 3/7] provide PID file before ocrd-import already --- ocrd_lib.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_lib.sh b/ocrd_lib.sh index b263912..16eb086 100644 --- a/ocrd_lib.sh +++ b/ocrd_lib.sh @@ -69,7 +69,7 @@ init() { } logret() { - sed -i "1s/.*/RETVAL=$?/" /run/lock/ocrd.jobs/$REMOTEDIR + sed -i "1s/PID=.*/RETVAL=$?/" /run/lock/ocrd.jobs/$REMOTEDIR sed -i "2a TIME_TERMINATED=$(date --rfc-3339=seconds)" /run/lock/ocrd.jobs/$REMOTEDIR } @@ -85,15 +85,15 @@ ocrd_format_workflow() { # ocrd import from workdir ocrd_import_workdir() { + echo "echo \$\$ > $REMOTEDIR/ocrd.pid" echo "if test -f '$REMOTEDIR/mets.xml'; then OV=--overwrite; else OV=; ocrd-import -i '$REMOTEDIR'; fi" echo "cd '$REMOTEDIR'" - echo 'echo $$ > ocrd.pid' } ocrd_enter_workdir() { + echo "echo \$\$ > $REMOTEDIR/ocrd.pid" echo "if test -f '$REMOTEDIR/mets.xml'; then OV=--overwrite; else OV=; fi" echo "cd '$REMOTEDIR'" - echo 'echo $$ > ocrd.pid' } ocrd_process_workflow() { From 995ecc81e5f246d3c8bf17b24e652f03e670e3e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 11 Apr 2023 15:46:00 +0200 Subject: [PATCH 4/7] delegate kill signals to subprocesses --- ocrd_lib.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ocrd_lib.sh b/ocrd_lib.sh index 16eb086..be15d75 100644 --- a/ocrd_lib.sh +++ b/ocrd_lib.sh @@ -10,10 +10,17 @@ logerr() { logger -p user.info -t $TASK "terminating with error \$?=$? from ${BASH_COMMAND} on line $(caller)" } +stopbg() { + logger -p user.crit -t $TASK "passing SIGKILL to child $!" + # pass signal on to children + kill -KILL $! +} + # initialize variables, create ord-d work directory and exit if something is missing init() { trap logerr ERR - + trap stopbg INT TERM KILL + PID=$$ cd /data From 07227b5192d18a63afab299b33cc7e3b6b740095 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 11 Apr 2023 15:56:24 +0200 Subject: [PATCH 5/7] use /workflows volume, cp to WORKDIR/workflow.sh if workflow file originates elsewhere --- Dockerfile | 1 + Makefile | 4 ++++ README.md | 10 ++++++---- docker-compose.yml | 1 + ocrd_lib.sh | 20 ++++++++++++-------- process_images.sh | 2 +- process_mets.sh | 2 +- 7 files changed, 26 insertions(+), 14 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1051277..28849a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,6 +70,7 @@ EXPOSE 22 WORKDIR /data VOLUME /data +VOLUME /workflows # simulate a virtual env for the makefile, # coinciding with the Python system prefix diff --git a/Makefile b/Makefile index 5e945d2..733529c 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,8 @@ Variables: currently: "$(PRIVATE)" - DATA host directory to mount into `/data` currently: "$(DATA)" + - WORKFLOWS host directory to mount into `/workflows` + currently: "$(WORKFLOWS)" - UID user id to use in logins currently: $(UID) - GID group id to use in logins @@ -47,6 +49,7 @@ help: ; @eval "$$HELP" KEYS ?= $(firstword $(wildcard $(HOME)/.ssh/authorized_keys* $(HOME)/.ssh/id_*.pub)) PRIVATE ?= $(firstword $(filter-out %.pub,$(wildcard $(HOME)/.ssh/id_*))) DATA ?= $(CURDIR) +WORKFLOWS ?= $(CURDIR)/workflows UID ?= $(shell id -u) GID ?= $(shell id -g) UMASK ?= 0002 @@ -62,6 +65,7 @@ run: $(DATA) --name ocrd_manager \ --network=$(NETWORK) \ -v $(DATA):/data \ + -v $(WORKFLOWS):/workflows \ --mount type=bind,source=$(KEYS),target=/authorized_keys \ --mount type=bind,source=$(PRIVATE),target=/id_rsa \ -e UID=$(UID) -e GID=$(GID) -e UMASK=$(UMASK) \ diff --git a/README.md b/README.md index c6e6a34..5a65907 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,12 @@ Build or pull the Docker image: ### Starting and mounting -Then run the container – providing a **host-side directory** for the volume … +Then run the container – providing a **host-side directory** for the volumes … * `DATA`: directory for data processing (including images or existing workspaces), defaults to current working directory + * `WORKFLOWS`: directory for scripts (preconfigured workflows), + defaults to `./workflows` in current working directory … but also files … @@ -73,7 +75,7 @@ Then run the container – providing a **host-side directory** for the volume … thus, for **example**: - make run DATA=/mnt/workspaces MODELS=~/.local/share KEYS=~/.ssh/id_rsa.pub PORT=9022 PRIVATE=~/.ssh/id_rsa + make run DATA=/mnt/workspaces WORKFLOWS=/mnt/workflows KEYS=~/.ssh/id_rsa.pub PORT=9022 PRIVATE=~/.ssh/id_rsa (You can also run the service via `docker-compose` manually – just `cp .env.example .env` and edit to your needs.) @@ -140,7 +142,7 @@ which contains a trivial workflow: - preprocessing, layout analysis and text recognition with a single Tesseract processor call - format conversion of the result from PAGE-XML to ALTO-XML -It can be replaced with the (path) name of any workflow script mounted under `/data`. +It can be replaced with the (path) name of any workflow script mounted under `/workflows` or `/data`. For example (assuming `testdata` is a directory with image files mounted under `/data`): @@ -180,7 +182,7 @@ ENVIRONMENT VARIABLES: CONTROLLER: host name and port of OCR-D Controller for processing ``` -The same goes here for the `workflow parameter`. +For the `workflow` parameter, the same goes here as [above](#from-image-to-alto-files). For example (assuming `testdata` is a directory with image files mounted under `/data`): diff --git a/docker-compose.yml b/docker-compose.yml index eb9a297..d4f10bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,6 +33,7 @@ services: source: ${MANAGER_KEY} # Manager private key (for access to Controller) target: /id_rsa - ${MANAGER_DATA}:/data # metadata directory + - ${MANAGER_WORKFLOWS}:/workflows - shared:/run/lock/ocrd.jobs tty: true # docker run -t diff --git a/ocrd_lib.sh b/ocrd_lib.sh index be15d75..c250212 100644 --- a/ocrd_lib.sh +++ b/ocrd_lib.sh @@ -36,6 +36,14 @@ init() { exit 2 fi + WORKDIR=ocr-d/"$PROCESS_DIR" # use subdirectory of same volume so --reflink CoW still possible + if ! mkdir -p "$WORKDIR"; then + logger -p user.error -t $TASK "insufficient permissions on /data volume" + exit 5 + fi + # try to be unique here (to avoid clashes) + REMOTEDIR="KitodoJob_${PID}_$(basename $PROCESS_DIR)" + WORKFLOW=$(command -v "$WORKFLOW" || realpath "$WORKFLOW") if ! test -f "$WORKFLOW"; then logger -p user.error -t $TASK "invalid workflow '$WORKFLOW'" @@ -43,6 +51,10 @@ init() { fi logger -p user.notice -t $TASK "using workflow '$WORKFLOW':" ocrd_format_workflow | logger -p user.notice -t $TASK + if test "${WORKFLOW#/workflows/}" = "$WORKFLOW"; then + cp -p "$WORKFLOW" "$WORKDIR/workflow.sh" + WORKFLOW="$WORKDIR/workflow.sh" + fi if test -z "$CONTROLLER" -o "$CONTROLLER" = "${CONTROLLER#*:}"; then logger -p user.error -t $TASK "envvar CONTROLLER='$CONTROLLER' must contain host:port" @@ -51,14 +63,6 @@ init() { CONTROLLERHOST=${CONTROLLER%:*} CONTROLLERPORT=${CONTROLLER#*:} - WORKDIR=ocr-d/"$PROCESS_DIR" # will use other mount-point than /data soon - if ! mkdir -p "$WORKDIR"; then - logger -p user.error -t $TASK "insufficient permissions on /data volume" - exit 5 - fi - # try to be unique here (to avoid clashes) - REMOTEDIR="KitodoJob_${PID}_$(basename $PROCESS_DIR)" - # create stats for monitor mkdir -p /run/lock/ocrd.jobs/ { diff --git a/process_images.sh b/process_images.sh index bcf418c..dd81067 100755 --- a/process_images.sh +++ b/process_images.sh @@ -16,7 +16,7 @@ parse_args() { SCRIPT= PROCESS_ID= TASK_ID= - WORKFLOW=ocr-workflow-default.sh + WORKFLOW=/workflows/ocr-workflow-default.sh IMAGES_SUBDIR=images RESULT_SUBDIR=ocr/alto while (($#)); do diff --git a/process_mets.sh b/process_mets.sh index 90ddd8d..965cc26 100755 --- a/process_mets.sh +++ b/process_mets.sh @@ -15,7 +15,7 @@ parse_args() { SCRIPT= PROCESS_ID= TASK_ID= - WORKFLOW=ocr-workflow-default.sh + WORKFLOW=/workflows/ocr-workflow-default.sh PAGES= IMAGES_GRP=DEFAULT RESULT_GRP=FULLTEXT From eb96ea8739b7bf5641ff3036879c06f40e86cb6a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Apr 2023 16:02:32 +0200 Subject: [PATCH 6/7] only docker exec -i if on tty --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 733529c..62c8f9a 100644 --- a/Makefile +++ b/Makefile @@ -92,7 +92,8 @@ test-production: $(DATA)/testdata-production ifeq ($(NETWORK),bridge) ssh -i $(PRIVATE) -Tn -p $(PORT) ocrd@localhost $(SCRIPT) $( Date: Wed, 12 Apr 2023 18:30:55 +0200 Subject: [PATCH 7/7] add comment explaining custom workflow copy --- ocrd_lib.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocrd_lib.sh b/ocrd_lib.sh index c250212..713a93f 100644 --- a/ocrd_lib.sh +++ b/ocrd_lib.sh @@ -52,6 +52,9 @@ init() { logger -p user.notice -t $TASK "using workflow '$WORKFLOW':" ocrd_format_workflow | logger -p user.notice -t $TASK if test "${WORKFLOW#/workflows/}" = "$WORKFLOW"; then + # full path does not start with /workflows/ + # this is not a standard workflow - so make a copy + # in the workspace and use that path instead cp -p "$WORKFLOW" "$WORKDIR/workflow.sh" WORKFLOW="$WORKDIR/workflow.sh" fi