diff --git a/camelot/handlers.py b/camelot/handlers.py index 5f07e5d0..2b151c5d 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- +from contextlib import contextmanager +import io import os import sys @@ -8,12 +10,13 @@ from .core import TableList from .parsers import Stream, Lattice from .utils import ( + InvalidArguments, TemporaryDirectory, get_page_layout, get_text_objects, get_rotation, is_url, - download_url, + get_url_bytes, ) @@ -24,19 +27,33 @@ class PDFHandler(object): Parameters ---------- - filepath : str - Filepath or URL of the PDF file. + filepath : str | pathlib.Path, optional (default: None) + Filepath or URL of the PDF file. Required if file_bytes is not given pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. password : str, optional (default: None) Password for decryption. + file_bytes : io.IOBase, optional (default: None) + A file-like stream. Required if filepath is not given """ - def __init__(self, filepath, pages="1", password=None): + def __init__(self, filepath=None, pages="1", password=None, file_bytes=None): if is_url(filepath): - filepath = download_url(filepath) + file_bytes = get_url_bytes(filepath) + + if not filepath and not file_bytes: + raise InvalidArguments('Either `filepath` or `file_bytes` is required') + if not filepath: + # filepath must either be passed, or taken from the name attribute + filepath = getattr(file_bytes, 'name') + if not filepath: + msg = ('Either pass a `filepath`, or give the ' + '`file_bytes` argument a name attribute') + raise InvalidArguments(msg) + self.file_bytes = file_bytes # ok to be None + self.filepath = filepath if not filepath.lower().endswith(".pdf"): raise NotImplementedError("File format not supported") @@ -49,6 +66,28 @@ def __init__(self, filepath, pages="1", password=None): self.password = self.password.encode("ascii") self.pages = self._get_pages(pages) + @contextmanager + def 
managed_file_context(self): + """Reads from either the `filepath` or `file_bytes` + attribute of this instance, to return a file-like object. + Closes any open file handles on exit or error. + + Returns + ------- + file_bytes : io.IOBase + A readable, seekable, file-like object + """ + if self.file_bytes: + # if we can't seek, write to a BytesIO object that can, + # then seek to the beginning before yielding + if not hasattr(self.file_bytes, 'seek'): + self.file_bytes = io.BytesIO(self.file_bytes.read()) + self.file_bytes.seek(0) + yield self.file_bytes + else: + with open(self.filepath, "rb") as file_bytes: + yield file_bytes + def _get_pages(self, pages): """Converts pages string to list of ints. @@ -71,7 +110,7 @@ def _get_pages(self, pages): if pages == "1": page_numbers.append({"start": 1, "end": 1}) else: - with open(self.filepath, "rb") as f: + with self.managed_file_context() as f: infile = PdfReader(f, strict=False) if infile.is_encrypted: @@ -107,7 +146,7 @@ def _save_page(self, filepath, page, temp): Tmp directory. """ - with open(filepath, "rb") as fileobj: + with self.managed_file_context() as fileobj: infile = PdfReader(fileobj, strict=False) if infile.is_encrypted: infile.decrypt(self.password) diff --git a/camelot/io.py b/camelot/io.py index a27a7c66..406963b0 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -3,16 +3,21 @@ import warnings from .handlers import PDFHandler -from .utils import validate_input, remove_extra +from .utils import ( + InvalidArguments, + validate_input, + remove_extra, +) def read_pdf( - filepath, + filepath=None, pages="1", password=None, flavor="lattice", suppress_stdout=False, layout_kwargs={}, + file_bytes=None, **kwargs ): """Read PDF and return extracted tables. @@ -22,8 +27,8 @@ def read_pdf( Parameters ---------- - filepath : str - Filepath or URL of the PDF file. + filepath : str | pathlib.Path, optional (default: None) + Filepath or URL of the PDF file. 
Required if file_bytes is not given pages : str, optional (default: '1') Comma-separated page numbers. Example: '1,3,4' or '1,4-end' or 'all'. @@ -34,6 +39,8 @@ def read_pdf( Lattice is used by default. suppress_stdout : bool, optional (default: True) Print all logs and warnings. + file_bytes : io.IOBase, optional (default: None) + A file-like stream. Required if filepath is not given layout_kwargs : dict, optional (default: {}) A dict of `pdfminer.layout.LAParams `_ kwargs. table_areas : list, optional (default: None) @@ -103,12 +110,15 @@ def read_pdf( "Unknown flavor specified." " Use either 'lattice' or 'stream'" ) + if not filepath and not file_bytes: + raise InvalidArguments('Either `filepath` or `file_bytes` is required') + with warnings.catch_warnings(): if suppress_stdout: warnings.simplefilter("ignore") validate_input(kwargs, flavor=flavor) - p = PDFHandler(filepath, pages=pages, password=password) + p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes) kwargs = remove_extra(kwargs, flavor=flavor) tables = p.parse( flavor=flavor, diff --git a/camelot/utils.py b/camelot/utils.py index 404c00b2..354b654a 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -import os +import io import re import random import shutil @@ -36,6 +36,10 @@ _VALID_URLS.discard("") +class InvalidArguments(Exception): + pass + + # https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py def is_url(url): """Check to see if a URL has a valid protocol. @@ -66,8 +70,8 @@ def random_string(length): return ret -def download_url(url): - """Download file from specified URL. +def get_url_bytes(url): + """Get a stream of bytes for url Parameters ---------- @@ -75,22 +79,21 @@ def download_url(url): Returns ------- - filepath : str or unicode - Temporary filepath. 
+ file_bytes : io.BytesIO + a file-like object that can be read """ - filename = f"{random_string(6)}.pdf" - with tempfile.NamedTemporaryFile("wb", delete=False) as f: - headers = {"User-Agent": "Mozilla/5.0"} - request = Request(url, None, headers) - obj = urlopen(request) - content_type = obj.info().get_content_type() - if content_type != "application/pdf": - raise NotImplementedError("File format not supported") - f.write(obj.read()) - filepath = os.path.join(os.path.dirname(f.name), filename) - shutil.move(f.name, filepath) - return filepath + file_bytes = io.BytesIO() + file_bytes.name = url + headers = {"User-Agent": "Mozilla/5.0"} + request = Request(url, data=None, headers=headers) + obj = urlopen(request) + content_type = obj.info().get_content_type() + if content_type != "application/pdf": + raise NotImplementedError("File format not supported") + file_bytes.write(obj.read()) + file_bytes.seek(0) + return file_bytes stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] diff --git a/tests/test_common.py b/tests/test_common.py index 5d0054b8..2fbae1fb 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import io import os import sys @@ -172,3 +173,20 @@ def test_handler_pages_generator(): handler = PDFHandler(filename) assert handler._get_pages("1,2,5-10") == [1, 2, 5, 6, 7, 8, 9, 10] + + +def test_from_open(): + filename = os.path.join(testdir, "foo.pdf") + with open(filename, "rb") as file_bytes: + tables = camelot.read_pdf(file_bytes=file_bytes) + assert repr(tables) == "" + assert repr(tables[0]) == "" + +def test_from_bytes(): + filename = os.path.join(testdir, "foo.pdf") + file_bytes = io.BytesIO() + with open(filename, "rb") as f: + file_bytes.write(f.read()) # note that we didn't seek, done by PDFHandler + tables = camelot.read_pdf(file_bytes=file_bytes) + assert repr(tables) == "" + assert repr(tables[0]) == "
"