diff --git a/.docker/Dockerfile b/.docker/Dockerfile
index 527fed5d5..eaa9f24ca 100644
--- a/.docker/Dockerfile
+++ b/.docker/Dockerfile
@@ -55,6 +55,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
   pngquant \
   python3 \
   qpdf \
+  inotify-tools \
   tesseract-ocr \
   tesseract-ocr-chi-sim \
   tesseract-ocr-deu \
@@ -71,6 +72,7 @@ COPY --from=builder /usr/local/bin/ /usr/local/bin/
 
 COPY --from=builder /app/misc/webservice.py /app/
 COPY --from=builder /app/misc/watcher.py /app/
+COPY --from=builder /app/misc/watcher /app/
 
 # Copy minimal project files to get the test suite.
 COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/
diff --git a/docs/docker.rst b/docs/docker.rst
index 392b82d9f..e1e8a0fc6 100644
--- a/docs/docker.rst
+++ b/docs/docker.rst
@@ -168,3 +168,17 @@ also licensed in this way.
 
 In addition to the above, please read our
 :ref:`general remarks on using OCRmyPDF as a service <ocr-service>`.
+
+Using OCRmyPDF for automatic file conversion
+============================================
+
+The OCRmyPDF Docker image includes a script to automatically OCR files put
+into an input directory. The watcher script is started as follows:
+
+.. code-block:: bash
+
+    docker run --name ocrwatcher --entrypoint ./watcher -v $PWD/input:/input -v $PWD/output:/output jbarlow83/ocrmypdf <watcher options> -- <ocrmypdf options>
+
+where ``<watcher options>`` may optionally include ``-r`` to remove correctly
+processed input files from the input folder, and after the two hyphens all
+command line options for OCRmyPDF can be included.
diff --git a/misc/watcher b/misc/watcher
new file mode 100755
index 000000000..0aceaeeaa
--- /dev/null
+++ b/misc/watcher
@@ -0,0 +1,105 @@
+#!/bin/sh
+#
+# Watch a directory and run OCRmyPDF on every file written into it.
+
+# Print the command line help to stderr and exit with status 1.
+usage() {
+    cat <<EOF >&2
+Usage: $0 [-i <inputdir>] [-o <outputdir>] [-r] [-- ocroptions]
+Automatically OCRmyPDF files in input directory and put the result into
+output directory. Specify the options for OCRmyPDF after the double
+hyphen (--).
+
+Arguments:
+    -i: Input directory
+    -o: Output directory
+    -r: Remove input file on successful conversion
+EOF
+    exit 1
+}
+
+INPUTDIR=/input
+OUTPUTDIR=/output
+REMOVEONSUCCESS=false
+
+# The leading ":" selects silent error reporting, so invalid options and
+# missing arguments must be handled explicitly below.
+while getopts ":i:o:hr" o; do
+    case "${o}" in
+        i)
+            INPUTDIR="${OPTARG}"
+            ;;
+        o)
+            OUTPUTDIR="${OPTARG}"
+            ;;
+        r)
+            REMOVEONSUCCESS=true
+            ;;
+        h)
+            usage
+            ;;
+        :)
+            echo "$0: option -${OPTARG} requires an argument" >&2
+            usage
+            ;;
+        *)
+            echo "$0: unknown option -${OPTARG}" >&2
+            usage
+            ;;
+    esac
+done
+# Whatever is left in "$@" is passed to ocrmypdf unchanged.
+shift $((OPTIND-1))
+
+for directory in "$INPUTDIR" "$OUTPUTDIR"; do
+    if [ ! -d "$directory" ]; then
+        echo "$0: $directory is not a directory" >&2
+        exit 1
+    fi
+done
+
+# Set OUTFILE to $1, or to a numbered variant of $1 (name.N.ext) that does
+# not exist yet if $1 is already taken.
+outfilename() {
+    OUTFILE="$1"
+    [ -e "$OUTFILE" ] || return 0
+
+    # Split only the last path component so that dots in directory names
+    # are never mistaken for the start of the extension.
+    case "$1" in
+        */*) dir="${1%/*}/" ;;
+        *) dir= ;;
+    esac
+    base="${1##*/}"
+    case "$base" in
+        ?*.*) stem="${base%.*}"; ext=".${base##*.}" ;;
+        *) stem="$base"; ext= ;;
+    esac
+
+    num=1
+    while [ -e "$dir$stem.$num$ext" ]; do
+        num=$((num + 1))
+    done
+    OUTFILE="$dir$stem.$num$ext"
+}
+
+# OCR one file from the input directory into the output directory.
+#   $1:   file name relative to INPUTDIR
+#   $2..: options handed to ocrmypdf
+convert_infile() {
+    infile="$1"
+    shift
+    outfilename "$OUTPUTDIR/$infile"
+
+    # stdin is redirected so ocrmypdf cannot consume pending inotify events.
+    if ocrmypdf "$@" "$INPUTDIR/$infile" "$OUTFILE" </dev/null; then
+        if [ "$REMOVEONSUCCESS" = true ]; then
+            rm -- "$INPUTDIR/$infile"
+        fi
+    fi
+}
+
+inotifywait -m -q -e CLOSE_WRITE --format %f "$INPUTDIR" |
+    while IFS= read -r F; do
+        convert_infile "$F" "$@"
+    done