From 1cb57344443122d2dd4d52b6745ddb2298991d6d Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Tue, 8 Aug 2023 11:47:49 +0200
Subject: [PATCH] ImportDocuments - Replace pdfminer3k with pypdf
---
orangecontrib/text/import_documents.py | 30 +++--------------
.../text/tests/test_import_documents.py | 32 +++++++++++++++++++
requirements.txt | 2 +-
3 files changed, 37 insertions(+), 27 deletions(-)
diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py
index 0e10eba05..e18a9e754 100644
--- a/orangecontrib/text/import_documents.py
+++ b/orangecontrib/text/import_documents.py
@@ -25,10 +25,7 @@
from Orange.data.util import get_unique_names
from Orange.misc.utils.embedder_utils import get_proxies
from Orange.util import Registry, dummy_callback
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTTextBox, LTTextLine
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-from pdfminer.pdfparser import PDFDocument, PDFParser
+from pypdf import PdfReader as PyPDFReader
from requests.exceptions import ConnectionError
from orangecontrib.text.corpus import Corpus
@@ -130,28 +127,9 @@ class PdfReader(Reader):
ext = [".pdf"]
def read_file(self):
- with open(self.path, 'rb') as f:
- parser = PDFParser(f)
- doc = PDFDocument()
- parser.set_document(doc)
- doc.set_parser(parser)
- doc.initialize('')
- rsrcmgr = PDFResourceManager()
- laparams = LAParams()
- laparams.char_margin = 0.1
- laparams.word_margin = 1.0
- device = PDFPageAggregator(rsrcmgr, laparams=laparams)
- interpreter = PDFPageInterpreter(rsrcmgr, device)
- extracted_text = []
-
- for page in doc.get_pages():
- interpreter.process_page(page)
- layout = device.get_result()
- for lt_obj in layout:
- if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj,
- LTTextLine):
- extracted_text.append(lt_obj.get_text())
- self.content = ' '.join(extracted_text).replace('\x00', '')
+        # Join page texts; strip NULs like the pdfminer-based reader did.
+        texts = [page.extract_text() for page in PyPDFReader(self.path).pages]
+        self.content = " ".join(texts).replace("\x00", "")
class XmlReader(Reader):
diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py
index 4aaf96402..3032fe331 100644
--- a/orangecontrib/text/tests/test_import_documents.py
+++ b/orangecontrib/text/tests/test_import_documents.py
@@ -16,6 +16,7 @@
TxtReader,
TextData,
XmlReader,
+ PdfReader,
)
@@ -296,5 +297,36 @@ def test_error(self):
os.remove(fp.name)
+DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "documents")
+
+
+class TestPdfReader(unittest.TestCase):
+    def test_file(self):
+        # Well-formed PDF: the extracted text starts with the known passage.
+        path = os.path.join(DATA_PATH, "good", "minimal-document.pdf")
+        res = PdfReader(path).read()[0]
+        exp = (
+            "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam "
+            "nonumy eirmod"
+        )
+        self.assertTrue(res.content.startswith(exp))
+
+        # Second fixture: full text, then name, path, extension and category.
+        path = os.path.join(DATA_PATH, "good", "sample_pdf.pdf")
+        res = PdfReader(path).read()[0]
+        self.assertEqual("This is a test pdf file", res.content)
+        self.assertEqual("sample_pdf", res.name)
+        self.assertEqual(path, res.path)
+        self.assertListEqual([".pdf"], res.ext)
+        self.assertEqual("good", res.category)
+
+    def test_error(self):
+        # Corrupted PDF: no document is returned, only the failing file name.
+        path = os.path.join(DATA_PATH, "corrupted", "sample_pdf_corrupted.pdf")
+        res = PdfReader(path).read()
+        self.assertIsNone(res[0])
+        self.assertEqual("sample_pdf_corrupted.pdf", res[1])
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/requirements.txt b/requirements.txt
index 8522e6c28..ccdc4fa48 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ orange-widget-base >=4.20.0
orange-canvas-core
owlready2
pandas
-pdfminer3k>=1.3.1
+pypdf
pyqtgraph
pyyaml
requests