--all option to process everything, closes #10

simonw · Jun 30, 2022 · 0444883 · 0444883
1 parent 775c7c3
commit 0444883
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -15,19 +15,23 @@ Install this tool using `pip`:
 
 ## Starting OCR against PDFs in a bucket
 
-The `start` command loops through every PDF file in a bucket (every file ending in `.pdf`) and submits it to [Textract](https://aws.amazon.com/textract/) for OCR processing.
+The `start` command takes a list of keys and submits them to [Textract](https://aws.amazon.com/textract/) for OCR processing.
 
 You need to have AWS configured using environment variables or a credentials file in your home directory.
 
 You can start the process running like this:
 
-    s3-ocr start name-of-your-bucket
+    s3-ocr start name-of-your-bucket my-pdf-file.pdf
+
+The paths you specify should be paths within the bucket. If you stored your PDF files in folders inside the bucket, the command should look like this:
+
+    s3-ocr start name-of-your-bucket path/to/one.pdf path/to/two.pdf
 
 OCR can take some time. The results of the OCR will be stored in `textract-output` in your bucket.
 
-To start processing just one or more specific files, use the `--key` option one or more times:
+To process every file in the bucket with a `.pdf` extension, use `--all`:
 
-    s3-ocr start name-of-bucket --key path/to/one.pdf --key path/to/two.pdf
+    s3-ocr start name-of-bucket --all
 
 ### s3-ocr start --help
 
@@ -43,18 +47,18 @@ cog.out(
 )
 ]]] -->
 ```
-Usage: s3-ocr start [OPTIONS] BUCKET
+Usage: s3-ocr start [OPTIONS] BUCKET [KEYS]...
 
-  Start OCR tasks for all PDF files in an S3 bucket
+  Start OCR tasks for PDF files in an S3 bucket
 
-      s3-ocr start name-of-bucket
+      s3-ocr start name-of-bucket path/to/one.pdf path/to/two.pdf
 
-  To process specific keys:
+  To process every file with a .pdf extension:
 
-      s3-ocr start name-of-bucket -k path/to/key.pdf -k path/to/key2.pdf
+      s3-ocr start name-of-bucket --all
 
 Options:
-  -k, --key TEXT        Specific keys to process
+  --all                 Process all PDF files in the bucket
   --access-key TEXT     AWS access key ID
   --secret-key TEXT     AWS secret access key
   --session-token TEXT  AWS session token

diff --git a/s3_ocr/cli.py b/s3_ocr/cli.py
@@ -88,17 +88,18 @@ def cli():
 
 @cli.command
 @click.argument("bucket")
-@click.option("keys", "-k", "--key", multiple=True, help="Specific keys to process")
+@click.argument("keys", nargs=-1)
+@click.option("--all", is_flag=True, help="Process all PDF files in the bucket")
 @common_boto3_options
-def start(bucket, keys, **boto_options):
+def start(bucket, keys, all, **boto_options):
     """
-    Start OCR tasks for all PDF files in an S3 bucket
+    Start OCR tasks for PDF files in an S3 bucket
 
-        s3-ocr start name-of-bucket
+        s3-ocr start name-of-bucket path/to/one.pdf path/to/two.pdf
 
-    To process specific keys:
+    To process every file with a .pdf extension:
 
-        s3-ocr start name-of-bucket -k path/to/key.pdf -k path/to/key2.pdf
+        s3-ocr start name-of-bucket --all
     """
     s3 = make_client("s3", **boto_options)
     textract = make_client("textract", **boto_options)
@@ -113,7 +114,10 @@ def start(bucket, keys, **boto_options):
                 if match["Key"] in (key, key + S3_OCR_JSON):
                     items.append(match)
     else:
-        # Everything
+        if not all:
+            raise click.ClickException(
+                "Specify keys, or use --all to process all PDFs in the bucket"
+            )
         items = list(paginate(s3, "list_objects_v2", "Contents", Bucket=bucket))
     # Start any item that ends in .pdf for which a .s3-ocr.json file does not exist
     keys_with_s3_ocr_files = [

diff --git a/tests/test_s3_ocr.py b/tests/test_s3_ocr.py
@@ -8,10 +8,21 @@
 import sqlite_utils
 
 
-def test_start_creates_s3_ocr_json(s3, textract):
+def test_start_with_no_options_error():
     runner = CliRunner()
     with runner.isolated_filesystem():
         result = runner.invoke(cli, ["start", "my-bucket"])
+        assert result.exit_code == 1
+        assert (
+            "Specify keys, or use --all to process all PDFs in the bucket"
+            in result.output
+        )
+
+
+def test_start_all_creates_s3_ocr_json(s3, textract):
+    runner = CliRunner()
+    with runner.isolated_filesystem():
+        result = runner.invoke(cli, ["start", "my-bucket", "--all"])
         assert result.exit_code == 0
     bucket_contents = s3.list_objects_v2(Bucket="my-bucket")["Contents"]
     assert {b["Key"] for b in bucket_contents} == {"blah.pdf", "blah.pdf.s3-ocr.json"}
@@ -20,11 +31,11 @@ def test_start_creates_s3_ocr_json(s3, textract):
     assert set(decoded.keys()) == {"job_id", "etag"}
 
 
-def test_start_with_key_option(s3, textract):
+def test_start_with_specified_key(s3, textract):
     s3.put_object(Bucket="my-bucket", Key="blah2.pdf", Body=b"Fake PDF")
     runner = CliRunner()
     with runner.isolated_filesystem():
-        result = runner.invoke(cli, ["start", "my-bucket", "-k", "blah2.pdf"])
+        result = runner.invoke(cli, ["start", "my-bucket", "blah2.pdf"])
         assert result.exit_code == 0
     bucket_contents = s3.list_objects_v2(Bucket="my-bucket")["Contents"]
     assert {b["Key"] for b in bucket_contents} == {