From 1cb57344443122d2dd4d52b6745ddb2298991d6d Mon Sep 17 00:00:00 2001 From: PrimozGodec Date: Tue, 8 Aug 2023 11:47:49 +0200 Subject: [PATCH] ImportDocuments - Replace pdfminer3k with pypdf --- orangecontrib/text/import_documents.py | 30 +++-------------- .../text/tests/test_import_documents.py | 32 +++++++++++++++++++ requirements.txt | 2 +- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 0e10eba05..e18a9e754 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -25,10 +25,7 @@ from Orange.data.util import get_unique_names from Orange.misc.utils.embedder_utils import get_proxies from Orange.util import Registry, dummy_callback -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTTextBox, LTTextLine -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdfminer.pdfparser import PDFDocument, PDFParser +from pypdf import PdfReader as PyPDFReader from requests.exceptions import ConnectionError from orangecontrib.text.corpus import Corpus @@ -130,28 +127,9 @@ class PdfReader(Reader): ext = [".pdf"] def read_file(self): - with open(self.path, 'rb') as f: - parser = PDFParser(f) - doc = PDFDocument() - parser.set_document(doc) - doc.set_parser(parser) - doc.initialize('') - rsrcmgr = PDFResourceManager() - laparams = LAParams() - laparams.char_margin = 0.1 - laparams.word_margin = 1.0 - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - extracted_text = [] - - for page in doc.get_pages(): - interpreter.process_page(page) - layout = device.get_result() - for lt_obj in layout: - if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, - LTTextLine): - extracted_text.append(lt_obj.get_text()) - self.content = ' '.join(extracted_text).replace('\x00', '') + reader = PyPDFReader(self.path) + texts = [page.extract_text() for page in reader.pages] + self.content = " ".join(texts) class XmlReader(Reader): diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 4aaf96402..3032fe331 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -16,6 +16,7 @@ TxtReader, TextData, XmlReader, + PdfReader, ) @@ -296,5 +297,36 @@ def test_error(self): os.remove(fp.name) +DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "documents") + + +class TestPdfReader(unittest.TestCase): + def test_file(self): + reader = PdfReader(os.path.join(DATA_PATH, "good", "minimal-document.pdf")) + res = reader.read()[0] + exp = ( + "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam " + "nonumy eirmod" + ) + self.assertTrue(res.content.startswith(exp)) + + path = os.path.join(DATA_PATH, "good", "sample_pdf.pdf") + reader = PdfReader(path) + res = reader.read()[0] + self.assertEqual("This is a test pdf file", res.content) + self.assertEqual("sample_pdf", res.name) + self.assertEqual(os.path.join(path), res.path) + self.assertListEqual([".pdf"], res.ext) + self.assertEqual("good", res.category) + + def test_error(self): + reader = PdfReader( + os.path.join(DATA_PATH, "corrupted", "sample_pdf_corrupted.pdf") + ) + res = reader.read() + self.assertIsNone(res[0]) + self.assertEqual("sample_pdf_corrupted.pdf", res[1]) + + if __name__ == "__main__": unittest.main() diff --git a/requirements.txt b/requirements.txt index 8522e6c28..ccdc4fa48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ orange-widget-base >=4.20.0 orange-canvas-core owlready2 pandas -pdfminer3k>=1.3.1 +pypdf pyqtgraph pyyaml requests