Skip to content

Commit

Permalink
--all option to process everything, closes #10
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Jun 30, 2022
1 parent 775c7c3 commit 0444883
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 20 deletions.
24 changes: 14 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,23 @@ Install this tool using `pip`:

## Starting OCR against PDFs in a bucket

The `start` command loops through every PDF file in a bucket (every file ending in `.pdf`) and submits it to [Textract](https://aws.amazon.com/textract/) for OCR processing.
The `start` command takes a list of keys and submits them to [Textract](https://aws.amazon.com/textract/) for OCR processing.

You need to have AWS configured using environment variables or a credentials file in your home directory.

You can start the process running like this:

s3-ocr start name-of-your-bucket
s3-ocr start name-of-your-bucket my-pdf-file.pdf

The paths you specify should be paths within the bucket. If you stored your PDF files in folders inside the bucket, the command should look like this:

s3-ocr start name-of-your-bucket path/to/one.pdf path/to/two.pdf

OCR can take some time. The results of the OCR will be stored in `textract-output` in your bucket.

To start processing just one or more specific files, use the `--key` option one or more times:
To process every file in the bucket with a `.pdf` extension, use `--all`:

s3-ocr start name-of-bucket --key path/to/one.pdf --key path/to/two.pdf
s3-ocr start name-of-bucket --all

### s3-ocr start --help

Expand All @@ -43,18 +47,18 @@ cog.out(
)
]]] -->
```
Usage: s3-ocr start [OPTIONS] BUCKET
Usage: s3-ocr start [OPTIONS] BUCKET [KEYS]...
Start OCR tasks for all PDF files in an S3 bucket
Start OCR tasks for PDF files in an S3 bucket
s3-ocr start name-of-bucket
s3-ocr start name-of-bucket path/to/one.pdf path/to/two.pdf
To process specific keys:
To process every file with a .pdf extension:
s3-ocr start name-of-bucket -k path/to/key.pdf -k path/to/key2.pdf
s3-ocr start name-of-bucket --all
Options:
-k, --key TEXT Specific keys to process
--all Process all PDF files in the bucket
--access-key TEXT AWS access key ID
--secret-key TEXT AWS secret access key
--session-token TEXT AWS session token
Expand Down
18 changes: 11 additions & 7 deletions s3_ocr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,17 +88,18 @@ def cli():

@cli.command
@click.argument("bucket")
@click.option("keys", "-k", "--key", multiple=True, help="Specific keys to process")
@click.argument("keys", nargs=-1)
@click.option("--all", is_flag=True, help="Process all PDF files in the bucket")
@common_boto3_options
def start(bucket, keys, **boto_options):
def start(bucket, keys, all, **boto_options):
"""
Start OCR tasks for all PDF files in an S3 bucket
Start OCR tasks for PDF files in an S3 bucket
s3-ocr start name-of-bucket
s3-ocr start name-of-bucket path/to/one.pdf path/to/two.pdf
To process specific keys:
To process every file with a .pdf extension:
s3-ocr start name-of-bucket -k path/to/key.pdf -k path/to/key2.pdf
s3-ocr start name-of-bucket --all
"""
s3 = make_client("s3", **boto_options)
textract = make_client("textract", **boto_options)
Expand All @@ -113,7 +114,10 @@ def start(bucket, keys, **boto_options):
if match["Key"] in (key, key + S3_OCR_JSON):
items.append(match)
else:
# Everything
if not all:
raise click.ClickException(
"Specify keys, or use --all to process all PDFs in the bucket"
)
items = list(paginate(s3, "list_objects_v2", "Contents", Bucket=bucket))
# Start any item that ends in .pdf for which a .s3-ocr.json file does not exist
keys_with_s3_ocr_files = [
Expand Down
17 changes: 14 additions & 3 deletions tests/test_s3_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,21 @@
import sqlite_utils


def test_start_creates_s3_ocr_json(s3, textract):
def test_start_with_no_options_error():
runner = CliRunner()
with runner.isolated_filesystem():
result = runner.invoke(cli, ["start", "my-bucket"])
assert result.exit_code == 1
assert (
"Specify keys, or use --all to process all PDFs in the bucket"
in result.output
)


def test_start_all_creates_s3_ocr_json(s3, textract):
runner = CliRunner()
with runner.isolated_filesystem():
result = runner.invoke(cli, ["start", "my-bucket", "--all"])
assert result.exit_code == 0
bucket_contents = s3.list_objects_v2(Bucket="my-bucket")["Contents"]
assert {b["Key"] for b in bucket_contents} == {"blah.pdf", "blah.pdf.s3-ocr.json"}
Expand All @@ -20,11 +31,11 @@ def test_start_creates_s3_ocr_json(s3, textract):
assert set(decoded.keys()) == {"job_id", "etag"}


def test_start_with_key_option(s3, textract):
def test_start_with_specified_key(s3, textract):
s3.put_object(Bucket="my-bucket", Key="blah2.pdf", Body=b"Fake PDF")
runner = CliRunner()
with runner.isolated_filesystem():
result = runner.invoke(cli, ["start", "my-bucket", "-k", "blah2.pdf"])
result = runner.invoke(cli, ["start", "my-bucket", "blah2.pdf"])
assert result.exit_code == 0
bucket_contents = s3.list_objects_v2(Bucket="my-bucket")["Contents"]
assert {b["Key"] for b in bucket_contents} == {
Expand Down

0 comments on commit 0444883

Please sign in to comment.