ocrmypdf · svenihoney · Feb 3, 2020 · Feb 3, 2020 · Feb 4, 2020 · Feb 4, 2020
diff --git a/.docker/Dockerfile b/.docker/Dockerfile
@@ -55,6 +55,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
   pngquant \
   python3 \
   qpdf \
+  inotify-tools \
   tesseract-ocr \
   tesseract-ocr-chi-sim \
   tesseract-ocr-deu \
@@ -71,6 +72,7 @@ COPY --from=builder /usr/local/bin/ /usr/local/bin/
 
 COPY --from=builder /app/misc/webservice.py /app/
 COPY --from=builder /app/misc/watcher.py /app/
+COPY --from=builder /app/misc/watcher /app/
 
 # Copy minimal project files to get the test suite.
 COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/

diff --git a/docs/docker.rst b/docs/docker.rst
@@ -168,3 +168,17 @@ also licensed in this way.
 
 In addition to the above, please read our
 :ref:`general remarks on using OCRmyPDF as a service <ocr-service>`.
+
+Using OCRmyPDF for automatic file conversion
+============================================
+
+The OCRmyPDF Docker image includes a script to automatically OCR files put
+into an input directory. The watcher script is started as follows:
+
+.. code-block:: bash
+
+   docker run --name ocrwatcher --entrypoint ./watcher -v $PWD/input:/input -v $PWD/output:/output jbarlow83/ocrmypdf <watcher-options> -- <ocrmypdf-options>
+
+where ``<watcher-options>`` may include ``-r`` to remove successfully
+processed files from the input folder. All arguments after the two hyphens
+(``--``) are passed on to OCRmyPDF as command line options.
diff --git a/misc/watcher b/misc/watcher
@@ -0,0 +1,62 @@
+#!/bin/sh
+
+usage() {	# Print the help text to stderr, then exit with status 1.
+	cat <<EOF 1>&2
+Usage: $0 [-i <inputdir>] [-o <outputdir>] [-r] [-- ocroptions]
+Automatically OCRmyPDF files in input directory and put the result into
+output directory. Specify the options for OCRmyPDF after the double
+hyphen (--).
+
+Arguments:
+	   -i: Input directory
+	   -o: Output directory
+	   -r: Remove input file on successful conversion
+EOF
+	exit 1
+}
+
+INPUTDIR=/input	# directory to watch (override with -i)
+OUTPUTDIR=/output	# directory receiving the results (override with -o)
+REMOVEONSUCCESS=true	# command run on the input file after success; a no-op unless -r
+
+while getopts ":i:o:hr" o; do	# leading ':' selects silent mode; errors are handled below
+	case "${o}" in
+		i)
+			INPUTDIR="${OPTARG}"
+			;;
+		o)
+			OUTPUTDIR="${OPTARG}"
+			;;
+		r)
+			REMOVEONSUCCESS=rm
+			;;
+		h | : | \?)	# help request, missing option argument, or unknown option
+			usage
+			;;
+	esac
+done
+shift $((OPTIND-1))	# drop the parsed options, including the "--" separator
+
+OCRMYPDFOPTS="$*"	# remaining arguments joined into one string; re-split on whitespace when used
+
+outfilename() {
+	OUTFILE="$1"
+	if [ -e "$OUTFILE" ]; then
+		name="${1%.*}"
+		ext="${1##*.}"
+		NUM=$(ls "$name"* | wc -l)
+		OUTFILE="$name".$NUM.$ext
+	fi
+}
+
+convert_infile()
+{
+	outfilename "$OUTPUTDIR/$1"
+
+	ocrmypdf $OCRMYPDFOPTS "$INPUTDIR/$F" "$OUTFILE" && $REMOVEONSUCCESS "$INPUTDIR/$F"
+}
+
+inotifywait -m -e CLOSE_WRITE -q "$INPUTDIR" --format %f |	# prints one file name per event
+	while IFS= read -r F; do	# IFS= and -r keep blanks and backslashes in the name intact
+		convert_infile "$F"
+	done