From ae8cea1bc6362a86fd835c0334d3aace780a8dd7 Mon Sep 17 00:00:00 2001 From: Sven Fischer Date: Mon, 3 Feb 2020 18:18:21 +0100 Subject: [PATCH 1/5] Added shell based watcher tool Improve handling of files due to inotifywait having an event CLOSE_WRITE after a file has been written. --- misc/watcher | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100755 misc/watcher diff --git a/misc/watcher b/misc/watcher new file mode 100755 index 000000000..8b0d2fe21 --- /dev/null +++ b/misc/watcher @@ -0,0 +1,46 @@ +#!/bin/sh + +usage() { + cat <<EOF >&2 +Usage: $0 [-i <inputdir>] [-o <outputdir>] [-r] [-- ocroptions]" +Automatically OCRmyPDF files in input directory and put the result into +output directory. Specify the options for OCRmyPDF after the double +hyphen (--). + +Arguments: + -i: Input directory + -o: Output directory + -r: Remove input file on successful conversion +EOF + exit 1; + } + +INPUTDIR=/input +OUTPUTDIR=/output +REMOVEONSUCCESS=true + +while getopts ":i:o:hr" o; do + case "${o}" in + i) + INPUTDIR="${OPTARG}" + ;; + o) + OUTPUTDIR="${OPTARG}" + ;; + r) + REMOVEONSUCCESS=rm + ;; + h) + usage + ;; + esac +done +shift $((OPTIND-1)) + +OCRMYPDFOPTS="$*" + +while true +do + F=$(inotifywait -e CLOSE_WRITE -q "$INPUTDIR" --format %f) + ocrmypdf $OCRMYPDFOPTS "$INPUTDIR/$F" "$OUTPUTDIR/$F" && $REMOVEONSUCCESS "$INPUTDIR/$F" +done From e64247d365ee1f05722b80d3a0845ef4fccbbdc3 Mon Sep 17 00:00:00 2001 From: Sven Fischer Date: Mon, 3 Feb 2020 18:41:02 +0100 Subject: [PATCH 2/5] Handle duplicate output file names --- misc/watcher | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/misc/watcher b/misc/watcher index 8b0d2fe21..8680c87b5 100755 --- a/misc/watcher +++ b/misc/watcher @@ -39,8 +39,19 @@ shift $((OPTIND-1)) OCRMYPDFOPTS="$*" +outfilename() { + OUTFILE="$1" + if [ -e "$OUTFILE" ]; then + name="${1%.*}" + ext="${1##*.}" + OUTFILE="$name".$NUM.$ext + fi +} + while true do F=$(inotifywait -e CLOSE_WRITE -q 
"$INPUTDIR" --format %f) - ocrmypdf $OCRMYPDFOPTS "$INPUTDIR/$F" "$OUTPUTDIR/$F" && $REMOVEONSUCCESS "$INPUTDIR/$F" -done + outfilename "$OUTPUTDIR/$F" + + ocrmypdf $OCRMYPDFOPTS "$INPUTDIR/$F" "$OUTFILE" && $REMOVEONSUCCESS "$INPUTDIR/$F" +done From 700ba06fdeebfc9fc0bf43a0a1bb4dd7ff089dfb Mon Sep 17 00:00:00 2001 From: Sven Fischer Date: Tue, 4 Feb 2020 08:11:40 +0100 Subject: [PATCH 3/5] Updated Dockerfile to include watcher script --- .docker/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.docker/Dockerfile b/.docker/Dockerfile index 527fed5d5..eaa9f24ca 100644 --- a/.docker/Dockerfile +++ b/.docker/Dockerfile @@ -55,6 +55,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pngquant \ python3 \ qpdf \ + inotify-tools \ tesseract-ocr \ tesseract-ocr-chi-sim \ tesseract-ocr-deu \ @@ -71,6 +72,7 @@ COPY --from=builder /usr/local/bin/ /usr/local/bin/ COPY --from=builder /app/misc/webservice.py /app/ COPY --from=builder /app/misc/watcher.py /app/ +COPY --from=builder /app/misc/watcher /app/ # Copy minimal project files to get the test suite. COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/ From 6c4d916c7b6013d94b0d5c66925d5aa16305f8c2 Mon Sep 17 00:00:00 2001 From: Sven Fischer Date: Tue, 4 Feb 2020 08:31:52 +0100 Subject: [PATCH 4/5] Added documentation about watcher --- docs/docker.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/docker.rst b/docs/docker.rst index 392b82d9f..e1e8a0fc6 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -168,3 +168,17 @@ also licensed in this way. In addition to the above, please read our :ref:`general remarks on using OCRmyPDF as a service `. + +Using OCRmyPDF for automatic file conversion +============================================ + +The OCRmyPDF Docker image includes a script to automatically OCR files put +into an input directory. The watcher script is started as follows: + +.. 
code-block:: bash + + docker run --name ocrwatcher --entrypoint ./watcher -v $PWD/input:/input -v $PWD/output:/output jbarlow83/ocrmypdf <watcher options> -- <ocrmypdf options> + +where ``<watcher options>`` may optionally include ``-r`` to remove correctly +processed input files from input folder, and after the two hyphens all command +line options for OCRmyPDF can be included. From 7a7ae5af829faa473f309f4898428ac8aa7c8960 Mon Sep 17 00:00:00 2001 From: Sven Fischer Date: Tue, 4 Feb 2020 22:00:29 +0100 Subject: [PATCH 5/5] Made inotifywait react on all changes Was only reacting on a single change, if another change occurred during the conversion the event was missed. --- misc/watcher | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/misc/watcher b/misc/watcher index 8680c87b5..0aceaeeaa 100755 --- a/misc/watcher +++ b/misc/watcher @@ -44,14 +44,19 @@ outfilename() { if [ -e "$OUTFILE" ]; then name="${1%.*}" ext="${1##*.}" + NUM=$(ls "$name"* | wc -l) OUTFILE="$name".$NUM.$ext fi } -while true -do - F=$(inotifywait -e CLOSE_WRITE -q "$INPUTDIR" --format %f) - outfilename "$OUTPUTDIR/$F" +convert_infile() +{ + outfilename "$OUTPUTDIR/$1" ocrmypdf $OCRMYPDFOPTS "$INPUTDIR/$F" "$OUTFILE" && $REMOVEONSUCCESS "$INPUTDIR/$F" -done +} + +inotifywait -m -e CLOSE_WRITE -q "$INPUTDIR" --format %f | + while read F; do + convert_infile $F + done