Upgrade Keras to 2.4.0 to be compatible with Python 3.8 (#86)

* Upgrade Keras to 2.4.0 or later (which implicitly requires TensorFlow 2.2 or later)
* Support for Python 3.8
HazyResearch · Oct 9, 2020 · f631efc · f631efc
1 parent fbe6a1a
commit f631efc
Show file tree

Hide file tree

Showing 6 changed files with 41 additions and 8 deletions.
diff --git a/.github/workflows/pythontest.yml b/.github/workflows/pythontest.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest]
-        python-version: [3.6, 3.7]
+        python-version: [3.6, 3.7, 3.8]
     steps:
     - uses: actions/checkout@v2
     - name: Cache conda

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,13 +6,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- [@HiromuHota][HiromuHota]: Support for Python 3.8.
+  ([#86](https://github.com/HazyResearch/pdftotree/pull/86))
+
 ### Changed
 - [@HiromuHota][HiromuHota]: Switch the output format from "HTML-like" to hOCR.
   ([#62](https://github.com/HazyResearch/pdftotree/pull/62))
 - [@HiromuHota][HiromuHota]: Loosen Keras' version restriction, which is now unnecessarily strict.
   ([#68](https://github.com/HazyResearch/pdftotree/pull/68))
 - [@HiromuHota][HiromuHota]: Greedily extract contents from PDF even if it looks scanned.
   ([#71](https://github.com/HazyResearch/pdftotree/pull/71))
+- [@HiromuHota][HiromuHota]: Upgrade Keras to 2.4.0 or later (and TensorFlow 2.2 or later).
+  ([#86](https://github.com/HazyResearch/pdftotree/pull/86))
 
 ### Removed
 - [@HiromuHota][HiromuHota]: Remove "favor_figures" option and extract everything.

diff --git a/pdftotree/visual/visual_utils.py b/pdftotree/visual/visual_utils.py
@@ -1,14 +1,18 @@
 import os
+from typing import Tuple
 
 import keras.backend as K
 import numpy as np
 import selectivesearch
 from keras.preprocessing.image import img_to_array, load_img
+from numpy import ndarray
 from wand.color import Color
 from wand.image import Image
 
 
-def predict_heatmap(pdf_path, page_num, model, img_dim=448, img_dir="tmp/img"):
+def predict_heatmap(
+    pdf_path, page_num, model, img_dim=448, img_dir="tmp/img"
+) -> Tuple[ndarray, ndarray]:
     """
     Return an image corresponding to the page of the pdf
     documents saved at pdf_path. If the image is not found in img_dir this

diff --git a/setup.py b/setup.py
@@ -5,24 +5,24 @@
 setup(
     name="pdftotree",
     version=__version__,
-    description="Parse PDFs into HTML-like trees.",
+    description="Convert PDF into hOCR with text, tables, and figures being recognized and preserved.",
     long_description=open("README.rst").read(),
     packages=find_packages(),
     install_requires=[
         "IPython",
         "beautifulsoup4",
-        "keras>=2.0.8,<2.4.0",  # keras>=2.4.0 requires TensorFlow 2.2 or higher
+        "keras>=2.4.0",
         "numpy",
         "pandas",
         "pdfminer.six>=20191020",
         "pillow",
         "selectivesearch",
         "sklearn",
         "tabula-py",
-        "tensorflow<2.0",
+        "tensorflow>=2.2",
         "wand",
     ],
-    keywords=["pdf", "parsing", "html"],
+    keywords=["pdf", "parsing", "html", "hocr"],
     setup_requires=["pytest-runner"],
     tests_require=["pytest"],
     url="https://github.com/HazyResearch/pdftotree",
@@ -33,13 +33,15 @@
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3 :: Only",
     ],
     project_urls={
         "Tracker": "https://github.com/HazyResearch/pdftotree/issues",
         "Source": "https://github.com/HazyResearch/pdftotree",
     },
-    python_requires=">3",
+    python_requires=">=3.6",
     author="Hazy Research",
     author_email="senwu@cs.stanford.edu",
     license="MIT",

diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -108,4 +108,5 @@ def test_vision_completion():
         model_type="vision",
         model_path="tests/input/paleo_visual_model.h5",
     )
-    assert output is not None
+    soup = BeautifulSoup(output, "lxml")
+    assert len(soup.find_all("table")) == 2
diff --git a/tests/test_table_detection.py b/tests/test_table_detection.py
@@ -0,0 +1,20 @@
+"""Test table area detection."""
+
+from pdftotree.core import load_model
+from pdftotree.visual.visual_utils import predict_heatmap
+
+
+def test_vision_model():
+    """Check if the vision model runs and returns results in expected format."""
+    pdf_file = "tests/input/paleo.pdf"
+    model_path = "tests/input/paleo_visual_model.h5"
+    model = load_model("vision", model_path)
+    page_num = 0
+    image, pred = predict_heatmap(
+        pdf_file, page_num, model
+    )  # index start at 0 with wand
+    assert image.shape == (448, 448, 3)
+    assert pred.shape == (448, 448)
+
+
+# TODO: add test_ml_model and test_heuristic_model