diff --git a/MANIFEST.in b/MANIFEST.in index 9469f3249..f055a3756 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,12 @@ recursive-include orangecontrib/text/datasets *.tab *.txt *.metadata recursive-include orangecontrib/text/models *.ftz recursive-include orangecontrib/text/sentiment *.txt -recursive-include orangecontrib/text/tests *.txt *.json *.pkl *.udpipe +recursive-include orangecontrib/text/tests *.txt *.json +recursive-include orangecontrib/text/tests/data * recursive-include orangecontrib/text/tutorials *.ows recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai recursive-include orangecontrib/text/widgets/resources *.js *.css *.html -recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu *.csv *.tab *.tab.metadata +recursive-include orangecontrib/text/widgets/tests/data *.txt *.conllu *.csv *.tab *.tab.metadata include orangecontrib/text/widgets/tests/bow-test recursive-include scripts *.sh *.py diff --git a/orangecontrib/text/import_documents.py b/orangecontrib/text/import_documents.py index 0e10eba05..e18a9e754 100644 --- a/orangecontrib/text/import_documents.py +++ b/orangecontrib/text/import_documents.py @@ -25,10 +25,7 @@ from Orange.data.util import get_unique_names from Orange.misc.utils.embedder_utils import get_proxies from Orange.util import Registry, dummy_callback -from pdfminer.converter import PDFPageAggregator -from pdfminer.layout import LAParams, LTTextBox, LTTextLine -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager -from pdfminer.pdfparser import PDFDocument, PDFParser +from pypdf import PdfReader as PyPDFReader from requests.exceptions import ConnectionError from orangecontrib.text.corpus import Corpus @@ -130,28 +127,9 @@ class PdfReader(Reader): ext = [".pdf"] def read_file(self): - with open(self.path, 'rb') as f: - parser = PDFParser(f) - doc = PDFDocument() - parser.set_document(doc) - doc.set_parser(parser) - doc.initialize('') - rsrcmgr = PDFResourceManager() - laparams = LAParams() - laparams.char_margin = 0.1 - laparams.word_margin = 1.0 - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - interpreter = PDFPageInterpreter(rsrcmgr, device) - extracted_text = [] - - for page in doc.get_pages(): - interpreter.process_page(page) - layout = device.get_result() - for lt_obj in layout: - if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, - LTTextLine): - extracted_text.append(lt_obj.get_text()) - self.content = ' '.join(extracted_text).replace('\x00', '') + reader = PyPDFReader(self.path) + texts = [page.extract_text() for page in reader.pages] + self.content = " ".join(texts) class XmlReader(Reader): diff --git a/orangecontrib/text/widgets/tests/data/documents/corrupted/sample_pdf_corrupted.pdf b/orangecontrib/text/tests/data/documents/corrupted/sample_pdf_corrupted.pdf similarity index 62% rename from orangecontrib/text/widgets/tests/data/documents/corrupted/sample_pdf_corrupted.pdf rename to orangecontrib/text/tests/data/documents/corrupted/sample_pdf_corrupted.pdf index 1dfbf06e6..c0d513236 100644 Binary files a/orangecontrib/text/widgets/tests/data/documents/corrupted/sample_pdf_corrupted.pdf and b/orangecontrib/text/tests/data/documents/corrupted/sample_pdf_corrupted.pdf differ diff --git a/orangecontrib/text/tests/data/documents/good/minimal-document.pdf b/orangecontrib/text/tests/data/documents/good/minimal-document.pdf new file mode 100644 index 000000000..af5e73245 Binary files /dev/null and b/orangecontrib/text/tests/data/documents/good/minimal-document.pdf differ diff --git a/orangecontrib/text/widgets/tests/data/documents/good/sample_docx.docx b/orangecontrib/text/tests/data/documents/good/sample_docx.docx similarity index 100% rename from orangecontrib/text/widgets/tests/data/documents/good/sample_docx.docx rename to orangecontrib/text/tests/data/documents/good/sample_docx.docx diff --git a/orangecontrib/text/widgets/tests/data/documents/good/sample_odt.odt b/orangecontrib/text/tests/data/documents/good/sample_odt.odt similarity index 100% rename from orangecontrib/text/widgets/tests/data/documents/good/sample_odt.odt rename to orangecontrib/text/tests/data/documents/good/sample_odt.odt diff --git a/orangecontrib/text/widgets/tests/data/documents/good/sample_pdf.pdf b/orangecontrib/text/tests/data/documents/good/sample_pdf.pdf similarity index 100% rename from orangecontrib/text/widgets/tests/data/documents/good/sample_pdf.pdf rename to orangecontrib/text/tests/data/documents/good/sample_pdf.pdf diff --git "a/orangecontrib/text/widgets/tests/data/documents/good/sample_txt_\305\276.txt" "b/orangecontrib/text/tests/data/documents/good/sample_txt_\305\276.txt" similarity index 100% rename from "orangecontrib/text/widgets/tests/data/documents/good/sample_txt_\305\276.txt" rename to "orangecontrib/text/tests/data/documents/good/sample_txt_\305\276.txt" diff --git a/orangecontrib/text/tests/test_import_documents.py b/orangecontrib/text/tests/test_import_documents.py index 4aaf96402..3032fe331 100644 --- a/orangecontrib/text/tests/test_import_documents.py +++ b/orangecontrib/text/tests/test_import_documents.py @@ -16,6 +16,7 @@ TxtReader, TextData, XmlReader, + PdfReader, ) @@ -296,5 +297,36 @@ def test_error(self): os.remove(fp.name) +DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "documents") + + +class TestPdfReader(unittest.TestCase): + def test_file(self): + reader = PdfReader(os.path.join(DATA_PATH, "good", "minimal-document.pdf")) + res = reader.read()[0] + exp = ( + "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam " + "nonumy eirmod" + ) + self.assertTrue(res.content.startswith(exp)) + + path = os.path.join(DATA_PATH, "good", "sample_pdf.pdf") + reader = PdfReader(path) + res = reader.read()[0] + self.assertEqual("This is a test pdf file", res.content) + self.assertEqual("sample_pdf", res.name) + self.assertEqual(os.path.join(path), res.path) + self.assertListEqual([".pdf"], res.ext) + self.assertEqual("good", res.category) + + def test_error(self): + reader = PdfReader( + os.path.join(DATA_PATH, "corrupted", "sample_pdf_corrupted.pdf") + ) + res = reader.read() + self.assertIsNone(res[0]) + self.assertEqual("sample_pdf_corrupted.pdf", res[1]) + + if __name__ == "__main__": unittest.main() diff --git a/orangecontrib/text/widgets/tests/test_owimportdocuments.py b/orangecontrib/text/widgets/tests/test_owimportdocuments.py index 2930747cb..d4997039b 100644 --- a/orangecontrib/text/widgets/tests/test_owimportdocuments.py +++ b/orangecontrib/text/widgets/tests/test_owimportdocuments.py @@ -8,20 +8,25 @@ from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments +DATA_PATH = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "tests", "data", "documents") +) + + class TestOWImportDocuments(WidgetTest): def setUp(self) -> None: self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) - path = os.path.join(os.path.dirname(__file__), "data/documents") + path = os.path.join(os.path.dirname(__file__), DATA_PATH) self.widget.setCurrentPath(path) self.widget.reload() self.wait_until_finished() def test_current_path(self): - path = os.path.join(os.path.dirname(__file__), "data/documents") + path = os.path.join(os.path.dirname(__file__), DATA_PATH) self.assertEqual(path, self.widget.currentPath) def test_no_skipped(self): - path = os.path.join(os.path.dirname(__file__), "data/documents", "good") + path = os.path.join(DATA_PATH, "good") self.widget.setCurrentPath(path) self.widget.reload() self.wait_until_finished() @@ -29,16 +34,23 @@ def test_no_skipped(self): def test_output(self): output = self.get_output(self.widget.Outputs.data) - self.assertEqual(4, len(output)) + self.assertEqual(5, len(output)) self.assertEqual(3, len(output.domain.metas)) names = output.get_column("name") self.assertListEqual( # ž in sample_text_ž must be unicode char 0x17E not decomposed # 0x7A + 0x30C as it is in file name - ["sample_docx", "sample_odt", "sample_pdf", "sample_txt_ž"], + [ + "minimal-document", + "sample_docx", + "sample_odt", + "sample_pdf", + "sample_txt_ž", + ], sorted(names.tolist()), ) - texts = output.get_column("content") + # skip first document - it contains different text + texts = output.get_column("content")[1:] self.assertListEqual( # ž in sample_text_ž must be unicode char 0x17E not decomposed # 0x7A + 0x30C as it is in file name @@ -99,9 +111,7 @@ def test_conllu_cb(self): self.assertEqual(len(corpus.domain.metas), 4) def test_info_box(self): - self.assertEqual( - "4 documents, 1 skipped", self.widget.info_area.text() - ) + self.assertEqual("5 documents, 1 skipped", self.widget.info_area.text()) # empty widget self.widget: OWImportDocuments = self.create_widget(OWImportDocuments) @@ -124,7 +134,7 @@ def tests_context(self): # change default to something else to see if language is changed self.widget.language = "Slovenian" - path = os.path.join(os.path.dirname(__file__), "data/documents", "good") + path = os.path.join(DATA_PATH, "good") self.widget.setCurrentPath(path) self.widget.reload() self.wait_until_finished() diff --git a/requirements.txt b/requirements.txt index 8522e6c28..ccdc4fa48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ orange-widget-base >=4.20.0 orange-canvas-core owlready2 pandas -pdfminer3k>=1.3.1 +pypdf pyqtgraph pyyaml requests