Skip to content

Commit

Permalink
--prefix option, closes #20
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Jul 19, 2022
1 parent e31a368 commit 08fe2cb
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 5 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ To process every file in the bucket with a `.pdf` extension use `--all`:

s3-ocr start name-of-bucket --all

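To process every file with a `.pdf` extension within a specific folder, use `--prefix`. The value is matched as an S3 key prefix, so end it with `/` to restrict matching to that one folder: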

s3-ocr start name-of-bucket --prefix path/to/folder

### s3-ocr start --help

<!-- [[[cog
Expand Down
16 changes: 12 additions & 4 deletions s3_ocr/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ def cli():
@click.argument("bucket")
@click.argument("keys", nargs=-1)
@click.option("--all", is_flag=True, help="Process all PDF files in the bucket")
@click.option("--prefix", help="Process all PDF files within this prefix")
@common_boto3_options
def start(bucket, keys, all, **boto_options):
def start(bucket, keys, all, prefix, **boto_options):
"""
Start OCR tasks for PDF files in an S3 bucket
Expand All @@ -100,6 +101,10 @@ def start(bucket, keys, all, **boto_options):
To process every file with a .pdf extension:
s3-ocr start name-of-bucket --all
To process every .pdf in the PUBLIC/ folder:
s3-ocr start name-of-bucket --prefix PUBLIC/
"""
s3 = make_client("s3", **boto_options)
textract = make_client("textract", **boto_options)
Expand All @@ -114,11 +119,14 @@ def start(bucket, keys, all, **boto_options):
if match["Key"] in (key, key + S3_OCR_JSON):
items.append(match)
else:
if not all:
if not all and not prefix:
raise click.ClickException(
"Specify keys, or use --all to process all PDFs in the bucket"
"Specify keys, --prefix or use --all to process all PDFs in the bucket"
)
items = list(paginate(s3, "list_objects_v2", "Contents", Bucket=bucket))
kwargs = dict(Bucket=bucket)
if prefix:
kwargs["Prefix"] = prefix
items = list(paginate(s3, "list_objects_v2", "Contents", **kwargs))
# Start any item that ends in .pdf for which a .s3-ocr.json file does not exist
keys_with_s3_ocr_files = [
strip_ocr_json(item["Key"])
Expand Down
19 changes: 18 additions & 1 deletion tests/test_s3_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_start_with_no_options_error(s3):
result = runner.invoke(cli, ["start", "my-bucket"])
assert result.exit_code == 1
assert (
"Specify keys, or use --all to process all PDFs in the bucket"
"Specify keys, --prefix or use --all to process all PDFs in the bucket"
in result.output
)

Expand Down Expand Up @@ -45,6 +45,23 @@ def test_start_with_specified_key(s3, textract):
}


def test_start_with_prefix(s3, textract):
# Exercises `start --prefix pre/`: after the command runs, each PDF stored
# under pre/ is expected to have a ".s3-ocr.json" sibling key in the bucket.
# NOTE(review): indentation was lost in this view of the diff, so where the
# `with` body ends is not recoverable from here - check the real file.
# Seed two fake PDFs whose keys start with the "pre/" prefix.
s3.put_object(Bucket="my-bucket", Key="pre/blah1.pdf", Body=b"Fake PDF")
s3.put_object(Bucket="my-bucket", Key="pre/blah2.pdf", Body=b"Fake PDF")
runner = CliRunner()
with runner.isolated_filesystem():
result = runner.invoke(cli, ["start", "my-bucket", "--prefix", "pre/"])
# Pass the captured output as the assertion message so a failure shows the CLI output.
assert result.exit_code == 0, result.output
# List the whole bucket (no Prefix) so keys outside pre/ are checked too.
bucket_contents = s3.list_objects_v2(Bucket="my-bucket")["Contents"]
assert {b["Key"] for b in bucket_contents} == {
# Not created in this test - presumably supplied by the `s3` fixture (verify).
# No "blah.pdf.s3-ocr.json" is expected, i.e. keys outside the prefix are untouched.
"blah.pdf",
"pre/blah1.pdf",
"pre/blah1.pdf.s3-ocr.json",
"pre/blah2.pdf",
"pre/blah2.pdf.s3-ocr.json",
}


@pytest.mark.parametrize(
"files,expected",
(
Expand Down

0 comments on commit 08fe2cb

Please sign in to comment.