Skip to content

Commit

Permalink
update to tesserocr 2.7.0
Browse files Browse the repository at this point in the history
  • Loading branch information
simonflueckiger authored May 1, 2024
1 parent d7a5d47 commit 7911280
Show file tree
Hide file tree
Showing 7 changed files with 160 additions and 271 deletions.
47 changes: 29 additions & 18 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
version: 2.6.2.{build}
version: 2.7.0.{build}
image: Visual Studio 2022

# NOTE 2024-03-13: Tesseract 3.x and 4.x seem to be only compatible with leptonica .so version <= 5.x (<= v1.82)
Expand Down Expand Up @@ -243,8 +243,6 @@ for:
git submodule update --init --recursive
# copy files into newly pulled submodule
Copy-Item -Path "res\setup.py" -Destination "tesserocr"
Copy-Item -Path "res\tesserocr" -Destination "tesserocr" -Recurse
Copy-Item -Path "res\image_dataset" -Destination "tesserocr\tests\image_dataset" -Recurse
# apply tesserocr patches
Expand All @@ -254,9 +252,11 @@ for:
$patch_files = @(
# addresses this issue for tesseract 5.x https://github.com/sirfz/tesserocr/issues/295
# addresses this PR for Pillow not being installed https://github.com/sirfz/tesserocr/pull/341
# also tests whether leptonica can correctly use all image libraries
"test_api.patch"
"test_api.patch",
# adds functionality to copy dlls to the build directory
"setup.patch"
)
Foreach ($patch_file in $patch_files)
Expand All @@ -272,7 +272,7 @@ for:
cd tesserocr
# dumpbin.exe required for finding dependencies in setup.py
# dumpbin.exe required for finding dependencies in find_libraries_and_dependencies.py
$env:Path = "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin;" + $env:Path
# for each {64bit and 32bit} x {Python Versions}
Expand All @@ -286,18 +286,24 @@ for:
$env:BUILD_PLATFORM = "x64"
$python_path = "C:\Python$py_ver-x64"
$conda_path = "C:\Miniconda3-x64"
$env:INCLUDE_PATHS = "$dep_inc_64"
$env:LIB_PATHS = "$dep_lib_64"
$env:DLL_PATHS = "$dep_bin_64"
$dep_bin = "$dep_bin_64"
$env:INCLUDE = "$dep_inc_64"
$env:LIBPATH = "$dep_lib_64"
} else {
$env:BUILD_PLATFORM = "x86"
$python_path = "C:\Python$py_ver"
$conda_path = "C:\Miniconda3"
$env:INCLUDE_PATHS = "$dep_inc_32"
$env:LIB_PATHS = "$dep_lib_32"
$env:DLL_PATHS = "$dep_bin_32"
$dep_bin = "$dep_bin_32"
$env:INCLUDE = "$dep_inc_32"
$env:LIBPATH = "$dep_lib_32"
}
# collect dll path of tesseract and all other dlls it points to inside $dep_bin
$env:DLLPATHS = python ..\res\find_libraries_and_dependencies.py --libraries tesseract --search-paths "$dep_bin"
# tesseract.exe required for determining tesseract version in setup.py
$env:Path = "$dep_bin;" + $env:Path
# use miniconda if python version not found on agent
if (Test-Path $python_path) {
$env:Path = "$python_path;" + $env:Path
Expand Down Expand Up @@ -358,24 +364,29 @@ for:
# append python version and tesseract version string to file
python -c "import platform; print(f'------ Python {platform.python_version()} ({platform.architecture()[0]}) ------')" >> versions.txt
python -c "import tesserocr; print(tesserocr.tesseract_version()); print()" >> versions.txt
# https://stackoverflow.com/a/73984985/1786159
cd tesserocr\tests
# execute test suite
python tesserocr\tests\test_api.py
# execute test suite (verbose output)
python -m unittest -v test_api.py
# throw immediately if not all tests passed
if ($LastExitCode -ne 0) { exit $LastExitCode }
# uninstall pillow and execute test suite again for tests which require pillow to not be present
python -m pip uninstall --yes Pillow
# execute test suite
python tesserocr\tests\test_api.py
# execute test suite again (verbose output)
python -m unittest -v test_api.py
# throw immediately if not all tests passed
if ($LastExitCode -ne 0) { exit $LastExitCode }
# deactivate conda (not sure if really necessary)
conda deactivate
cd ..\..
}
}
Expand Down Expand Up @@ -435,11 +446,11 @@ for:
}
# create archive for convenience
# call "python upload_to_anaconda.py <path\to\conda_packages" to upload all packages to anaconda
# call "python upload_to_anaconda.py <path\to\conda_packages>" to upload all packages to anaconda
Compress-Archive -Path conda_packages/* -DestinationPath conda_packages.zip
artifacts:
- path: conda_build\win-*\tesserocr*.tar.bz2
- path: conda_packages\win-*\*.tar.bz2
name: conda_package
- path: conda_packages.zip
name: archive
68 changes: 68 additions & 0 deletions res/find_libraries_and_dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import re
import argparse
import subprocess

def find_libraries(library_stems, search_paths, extension):
    """Locate exactly one library file per stem inside the search paths.

    A file matches a stem when its name is an optional ``lib`` prefix, the
    stem, an optional version suffix (``-1.84.0.``, ``53.`` or just ``.``)
    and the extension, e.g. ``tesseract53.dll`` or ``libleptonica-6.dll``.

    Args:
        library_stems: Library base names to look for (e.g. ``["tesseract"]``).
        search_paths: Directories whose direct entries are inspected.
        extension: File extension to match, without the leading dot.

    Returns:
        A list with one full path per stem, in the order of ``library_stems``.

    Raises:
        AssertionError: If a stem matches no file, or more than one file,
            across all search paths.
    """
    # Local import: the module does not import sys at file level.
    import sys

    library_paths = []
    for library_stem in library_stems:
        # Escape the caller-supplied parts so characters such as "+" or "."
        # are matched literally, and compile once per stem rather than once
        # per search path.
        pattern = re.compile(
            r"^(?:lib)?{}(?:-(?:\d+\.)*|\d+\.|\.){}$".format(
                re.escape(library_stem), re.escape(extension)
            )
        )

        matches = []
        for search_path in search_paths:
            try:
                filenames = os.listdir(search_path)
            except FileNotFoundError:
                # Diagnostics go to stderr: stdout carries the machine-readable
                # path list that the calling build script captures.
                print(f"Warning: The path {search_path} does not exist.", file=sys.stderr)
                continue
            matches.extend(
                {"filename": filename, "base_path": search_path}
                for filename in filenames
                if pattern.search(filename)
            )

        if not matches:
            raise AssertionError(f"No libraries found in {search_paths} which match \"{library_stem}\" stem")
        if len(matches) != 1:
            raise AssertionError(f"Multiple libraries found which match \"{library_stem}\" stem:\n{matches}")
        library_paths.append(os.path.join(matches[0]["base_path"], matches[0]["filename"]))

    return library_paths

def _read_dll_imports(dll_path):
    """Return the DLL names that ``dumpbin /dependents`` reports for *dll_path*.

    Returns an empty list (after reporting on stderr) when dumpbin cannot be
    started or exits with an error.
    """
    # Local import: the module does not import sys at file level.
    import sys

    try:
        dumpbin = subprocess.run(
            ['dumpbin.exe', '/dependents', dll_path],
            capture_output=True,
            text=True,
            check=True,  # without this CalledProcessError is never raised
        )
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers dumpbin.exe not being on PATH.
        print(f"Error running dumpbin: {e}", file=sys.stderr)
        return []
    # Dependency names are the lines indented by exactly four whitespace chars.
    return re.findall(r'^\s{4}(\S*\.dll)$', dumpbin.stdout, re.MULTILINE)


def find_dll_dependencies_recursively(dll_path, search_paths):
    """Collect the transitive DLL dependencies of *dll_path*.

    Only dependencies that exist as files inside one of ``search_paths`` are
    reported and followed; for each dependency name the first search path
    containing it wins. Every DLL is inspected at most once, so shared or
    cyclic dependencies neither repeat work nor recurse without end.

    Args:
        dll_path: Path of the DLL whose dependencies are wanted.
        search_paths: Directories in which dependency files are looked up.

    Returns:
        A list of unique dependency paths in discovery order. ``dll_path``
        itself is not included.
    """
    def _key(path):
        # Normalise so the same file reached via differently spelled paths
        # is only visited once.
        return os.path.normcase(os.path.abspath(path))

    visited = {_key(dll_path)}
    dependencies = []
    pending = [dll_path]

    while pending:
        current = pending.pop()
        for dependency_name in _read_dll_imports(current):
            for search_path in search_paths:
                dependency_path = os.path.join(search_path, dependency_name)
                if not os.path.isfile(dependency_path):
                    continue
                key = _key(dependency_path)
                if key not in visited:
                    visited.add(key)
                    dependencies.append(dependency_path)
                    pending.append(dependency_path)
                break  # first search path that contains the file wins

    return dependencies

def main():
    """Command-line entry point.

    Resolves every requested library inside the search paths, gathers the
    DLL dependencies of each of them, and prints all resulting paths on
    stdout as a single ``;``-separated line.

    On a lookup failure the error is written to stderr and the process exits
    with status 1, so a caller capturing stdout never receives an error
    message in place of a path list.
    """
    parser = argparse.ArgumentParser(description="Find library files and their dependencies.")
    parser.add_argument('-l', '--libraries', nargs='+', required=True, help='Library base names to search for (e.g., tesseract)')
    parser.add_argument('-s', '--search-paths', nargs='+', required=True, help='Directories to search within')
    parser.add_argument('-e', '--extension', default='dll', help='File extension to match (default: dll)')
    args = parser.parse_args()

    try:
        runtime_library_paths = find_libraries(args.libraries, args.search_paths, args.extension)
    except AssertionError as e:
        # ArgumentParser.exit writes the message to stderr before exiting.
        parser.exit(status=1, message=f"Error: {e}\n")

    # Resolve dependencies for every requested library, not just the first.
    dependency_paths = []
    for library_path in runtime_library_paths:
        dependency_paths.extend(find_dll_dependencies_recursively(library_path, args.search_paths))

    # dict.fromkeys drops duplicates while keeping first-seen order.
    unique_paths = list(dict.fromkeys(runtime_library_paths + dependency_paths))
    print(";".join(unique_paths))


if __name__ == "__main__":
    main()
61 changes: 61 additions & 0 deletions res/patches/tesserocr/setup.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
diff --git a/setup.py b/setup.py
index f7a1c46..2945e59 100644
--- a/setup.py
+++ b/setup.py
@@ -7,6 +7,7 @@ import os
import re
import subprocess
import sys
+import shutil
from os.path import abspath, dirname
from os.path import join as pjoin
from os.path import split as psplit
@@ -24,6 +25,7 @@ _LOGGER.addHandler(logging.StreamHandler(sys.stderr))

_TESSERACT_MIN_VERSION = "3.04.00"
_CYTHON_COMPILE_TIME_ENV = None
+_DLL_PATHS = []

# find_version from pip https://github.com/pypa/pip/blob/1.5.6/setup.py#L33
here = abspath(dirname(__file__))
@@ -192,6 +194,12 @@ def get_tesseract_version():
"TESSERACT_VERSION": version_to_int(version),
}
if sys.platform == "win32":
+ dllpaths = os.getenv("DLLPATHS", None)
+ if dllpaths:
+ dllpaths = list(filter(None, dllpaths.split(";")))
+ else:
+ dllpaths = []
+
libpaths = os.getenv("LIBPATH", None)
if libpaths:
libpaths = list(filter(None, libpaths.split(";")))
@@ -225,6 +233,7 @@ def get_tesseract_version():
else:
includepaths = []

+ config["dll_paths"] = dllpaths
config["libraries"] = [tess_lib, lept_lib]
config["library_dirs"] = libpaths
config["include_dirs"] = includepaths
@@ -253,6 +262,7 @@ def make_extension():
global _CYTHON_COMPILE_TIME_ENV
build_args = get_build_args()
_CYTHON_COMPILE_TIME_ENV = build_args.pop("compile_time_env")
+ _DLL_PATHS.extend(build_args.pop("dll_paths"))
return Extension("tesserocr.tesserocr", sources=["tesserocr/tesserocr.pyx"], language="c++", **build_args)


@@ -275,6 +285,11 @@ class my_build_ext(build_ext, object):
_LOGGER.debug("tesseract >= 03.05.02 requires c++11 compiler support")
extension.extra_compile_args = extra_args

+ # copy dlls to the build directory
+ dll_dest_dir = os.path.dirname(self.get_ext_fullpath(extension.name))
+ for dll_path in _DLL_PATHS:
+ shutil.copy(dll_path, dll_dest_dir)
+
build_ext.build_extensions(self)

def finalize_options(self):
10 changes: 1 addition & 9 deletions res/patches/tesserocr/test_api.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/tests/test_api.py b/tests/test_api.py
index 766f59b..877cc95 100644
index 0953246..877cc95 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -50,6 +50,34 @@ class TestTessBaseApi(unittest.TestCase):
Expand Down Expand Up @@ -65,11 +65,3 @@ index 766f59b..877cc95 100644
def test_LSTM_choices(self):
"""Test GetBestLSTMSymbolChoices."""
self._api.SetVariable("lstm_choice_mode", "2")
@@ -279,6 +319,7 @@ class TestTessBaseApi(unittest.TestCase):
# Test if empty
self.assertFalse(result)

+ @unittest.skipIf(not pil_installed, "Pillow not installed")
def test_layout_getcomponents(self):
self._api.Init()
self._api.SetImageFile(self._image_file)
Loading

0 comments on commit 7911280

Please sign in to comment.