diff --git a/.github/workflows/pythontest.yml b/.github/workflows/pythontest.yml index 78b5c57..502b73b 100644 --- a/.github/workflows/pythontest.yml +++ b/.github/workflows/pythontest.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: [3.6, 3.7] + python-version: [3.6, 3.7, 3.8] steps: - uses: actions/checkout@v2 - name: Cache conda diff --git a/CHANGELOG.md b/CHANGELOG.md index f2e051e..487e597 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- [@HiromuHota][HiromuHota]: Support for Python 3.8. + ([#86](https://github.com/HazyResearch/pdftotree/pull/86)) + ### Changed - [@HiromuHota][HiromuHota]: Switch the output format from "HTML-like" to hOCR. ([#62](https://github.com/HazyResearch/pdftotree/pull/62)) @@ -13,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ([#68](https://github.com/HazyResearch/pdftotree/pull/68)) - [@HiromuHota][HiromuHota]: Greedily extract contents from PDF even if it looks scanned. ([#71](https://github.com/HazyResearch/pdftotree/pull/71)) +- [@HiromuHota][HiromuHota]: Upgrade Keras to 2.4.0 or later (and TensorFlow 2.2 or later). + ([#86](https://github.com/HazyResearch/pdftotree/pull/86)) ### Removed - [@HiromuHota][HiromuHota]: Remove "favor_figures" option and extract everything. diff --git a/pdftotree/visual/visual_utils.py b/pdftotree/visual/visual_utils.py index 162446e..789c748 100644 --- a/pdftotree/visual/visual_utils.py +++ b/pdftotree/visual/visual_utils.py @@ -1,14 +1,18 @@ import os +from typing import Tuple import keras.backend as K import numpy as np import selectivesearch from keras.preprocessing.image import img_to_array, load_img +from numpy import ndarray from wand.color import Color from wand.image import Image -def predict_heatmap(pdf_path, page_num, model, img_dim=448, img_dir="tmp/img"): +def predict_heatmap( + pdf_path, page_num, model, img_dim=448, img_dir="tmp/img" +) -> Tuple[ndarray, ndarray]: """ Return an image corresponding to the page of the pdf documents saved at pdf_path. If the image is not found in img_dir this diff --git a/setup.py b/setup.py index c0ea017..6c64336 100644 --- a/setup.py +++ b/setup.py @@ -5,13 +5,13 @@ setup( name="pdftotree", version=__version__, - description="Parse PDFs into HTML-like trees.", + description="Convert PDF into hOCR with text, tables, and figures being recognized and preserved.", long_description=open("README.rst").read(), packages=find_packages(), install_requires=[ "IPython", "beautifulsoup4", - "keras>=2.0.8,<2.4.0", # keras>=2.4.0 requires TensorFlow 2.2 or higher + "keras>=2.4.0", "numpy", "pandas", "pdfminer.six>=20191020", @@ -19,10 +19,10 @@ "selectivesearch", "sklearn", "tabula-py", - "tensorflow<2.0", + "tensorflow>=2.2", "wand", ], - keywords=["pdf", "parsing", "html"], + keywords=["pdf", "parsing", "html", "hocr"], setup_requires=["pytest-runner"], tests_require=["pytest"], url="https://github.com/HazyResearch/pdftotree", @@ -33,13 +33,15 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3 :: Only", ], project_urls={ "Tracker": "https://github.com/HazyResearch/pdftotree/issues", "Source": "https://github.com/HazyResearch/pdftotree", }, - python_requires=">3", + python_requires=">=3.6", author="Hazy Research", author_email="senwu@cs.stanford.edu", license="MIT", diff --git a/tests/test_basic.py b/tests/test_basic.py index 82afe25..8f9fd0d 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -108,4 +108,5 @@ def test_vision_completion(): model_type="vision", model_path="tests/input/paleo_visual_model.h5", ) - assert output is not None + soup = BeautifulSoup(output, "lxml") + assert len(soup.find_all("table")) == 2 diff --git a/tests/test_table_detection.py b/tests/test_table_detection.py new file mode 100644 index 0000000..912b195 --- /dev/null +++ b/tests/test_table_detection.py @@ -0,0 +1,20 @@ +"""Test table area detection.""" + +from pdftotree.core import load_model +from pdftotree.visual.visual_utils import predict_heatmap + + +def test_vision_model(): + """Check if the vision model runs and returns results in expected format.""" + pdf_file = "tests/input/paleo.pdf" + model_path = "tests/input/paleo_visual_model.h5" + model = load_model("vision", model_path) + page_num = 0 + image, pred = predict_heatmap( + pdf_file, page_num, model + ) # index start at 0 with wand + assert image.shape == (448, 448, 3) + assert pred.shape == (448, 448) + + +# TODO: add test_ml_model and test_heuristic_model