diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..1c5b439 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,47 @@ +# Use the latest 2.1 version of CircleCI pipeline process engine. +# See: https://circleci.com/docs/2.0/configuration-reference +version: 2.1 + +# Orbs are reusable packages of CircleCI configuration that you may share across projects, enabling you to create encapsulated, parameterized commands, jobs, and executors that can be used across multiple projects. +# See: https://circleci.com/docs/2.0/orb-intro/ +orbs: + # The python orb contains a set of prepackaged CircleCI configuration you can use repeatedly in your configuration files + # Orb commands and jobs help you with common scripting around a language/tool + # so you dont have to copy and paste it everywhere. + # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python + python: circleci/python@1.5.0 + +# Define a job to be invoked later in a workflow. +# See: https://circleci.com/docs/2.0/configuration-reference/#jobs +jobs: + build-and-test: # This is the name of the job, feel free to change it to better match what you're trying to do! + # These next lines defines a Docker executors: https://circleci.com/docs/2.0/executor-types/ + # You can specify an image from Dockerhub or use one of the convenience images from CircleCI's Developer Hub + # A list of available CircleCI Docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python + # The executor is the environment in which the steps below will be executed - below will use a python 3.10.2 container + # Change the version below to your required version of python + docker: + - image: cimg/python:3.10.2 + # Checkout the code as the first step. This is a dedicated CircleCI step. + # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. + # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt. + # Then run your tests! + # CircleCI will report the results back to your VCS provider. + steps: + - checkout + - python/install-packages: + pkg-manager: pip + # app-dir: ~/project/package-directory/ # If you're requirements.txt isn't in the root directory. + # pip-dependency-file: test-requirements.txt # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements. + - run: + name: Run tests + # This assumes pytest is installed via the install-package step above + command: pytest + +# Invoke jobs via workflows +# See: https://circleci.com/docs/2.0/configuration-reference/#workflows +workflows: + sample: # This is the name of the workflow, feel free to change it to better match your workflow. + # Inside the workflow, you define the jobs you want to run. + jobs: + - build-and-test diff --git a/CHANGELOG.md b/CHANGELOG.md index 58ecf0c..0a7f5a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,8 @@ All notable changes to this project will be documented in this file. The format ## added -- [x] +- [x] Extended pdf_parser to extract table of contents +- [x] ## [V2.0.2] - 22-10-2022 diff --git a/assets/output.json b/assets/output.json new file mode 100644 index 0000000..0cbf4aa --- /dev/null +++ b/assets/output.json @@ -0,0 +1,8 @@ +{ + "txt": {"0": "this is a sample file"}, + "pdf": {"0": "this is a sample file"}, + "epub": {"0": "this is a sample file"}, + "odt": {"0": "this is a sample file"}, + "mobi": {"0": "this is a sample file"}, + "docs": {"0": "this is a sample file"} +} \ No newline at end of file diff --git a/assets/sample.doc b/assets/sample.doc new file mode 100644 index 0000000..f3adcd2 Binary files /dev/null and b/assets/sample.doc differ diff --git a/assets/sample.epub b/assets/sample.epub new file mode 100644 index 0000000..40b8afb Binary files /dev/null and b/assets/sample.epub differ diff --git a/assets/sample.mobi b/assets/sample.mobi new file mode 100644 index 0000000..4bf8f68 Binary files /dev/null and b/assets/sample.mobi differ diff --git a/assets/sample.odt b/assets/sample.odt new file mode 100644 index 0000000..87b01e3 Binary files /dev/null and b/assets/sample.odt differ diff --git a/assets/sample.pdf b/assets/sample.pdf new file mode 100644 index 0000000..ad8c85e Binary files /dev/null and b/assets/sample.pdf differ diff --git a/assets/sample.txt b/assets/sample.txt new file mode 100644 index 0000000..7365815 --- /dev/null +++ b/assets/sample.txt @@ -0,0 +1 @@ +this is a sample file \ No newline at end of file diff --git a/audiobook/main.py b/audiobook/main.py index 15d884b..ca91255 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -56,19 +56,21 @@ def get_library(self): ) return total_books - def create_json_book(self, input_book_path, password=None, extraction_engine=None): + def create_json_book(self, input_book_path, password=None, extraction_engine=None, load_from_library=False): """method to create json book from input file it calls respective method based on file format""" json_filename = ( os.path.basename(input_book_path).split(".")[0] + ".json" ) - if os.path.exists(os.path.join(BOOK_DIR, json_filename)): - metadata = {"book_name": json_filename.split(".")[0]} - print("Book already exists in library, reading from library") - json_book = load_json(os.path.join(BOOK_DIR, json_filename)) - metadata["pages"] = len(json_book) - return json_book, metadata + if load_from_library: + print("Loading book from library") + if os.path.exists(os.path.join(BOOK_DIR, json_filename)): + metadata = {"book_name": json_filename.split(".")[0]} + print("Book already exists in library, reading from library") + json_book = load_json(os.path.join(BOOK_DIR, json_filename)) + metadata["pages"] = len(json_book) + return json_book, metadata elif input_book_path.endswith(".odt"): json_book, metadata = odt_to_json(input_book_path) diff --git a/audiobook/utils.py b/audiobook/utils.py index 37852ea..1b462b4 100644 --- a/audiobook/utils.py +++ b/audiobook/utils.py @@ -31,6 +31,7 @@ def text_preprocessing(input_text): regex = re.compile(r"[\n\r\t]") preprocessed_text = regex.sub("", input_text) preprocessed_text = re.sub(" +", " ", preprocessed_text) + preprocessed_text = preprocessed_text.strip() return preprocessed_text @@ -79,8 +80,8 @@ def pdf_to_json(input_book_path, password=None, extraction_engine="pypdf2"): """sub method to create json book from pdf file""" metadata = {} json_book = {} - - if extraction_engine == "pdfminer": + basename = os.path.basename(input_book_path).split(".")[0] + if extraction_engine is None or extraction_engine == "pdfminer": print("Using pdfminer") pdf_parser = PdfMinerDocParser() elif extraction_engine == "pypdf2": @@ -96,7 +97,8 @@ def pdf_to_json(input_book_path, password=None, extraction_engine="pypdf2"): page_num = i // 2000 json_book[str(page_num)] = text[i: i + 2000] - metadata = len(json_book) + metadata['book_name'] = basename + metadata['pages'] = len(json_book) return json_book, metadata @@ -104,6 +106,8 @@ def odt_to_json(input_book_path): """sub method to create json book from odt file""" metadata = {} json_book = {} + basename = os.path.basename(input_book_path).split(".")[0] + textdoc = load(input_book_path) allparas = textdoc.getElementsByType(text.P) output_text = "" @@ -115,7 +119,9 @@ def odt_to_json(input_book_path): page_num = i // 2000 json_book[str(page_num)] = output_text[i: i + 2000] - metadata = len(json_book) + metadata['book_name'] = basename + metadata['pages'] = len(json_book) + return json_book, metadata diff --git a/tests/test_audiobook.py b/tests/test_audiobook.py index d75d74f..d829756 100644 --- a/tests/test_audiobook.py +++ b/tests/test_audiobook.py @@ -2,29 +2,38 @@ from audiobook import AudioBook +import json + + +def load_json(filename): + with open(filename, "r") as fp: + return json.load(fp) + + +output = load_json("assets/output.json") + +ab = AudioBook(speed="normal") + class TestAudioBook(unittest.TestCase): - def test_invalidPathNumeric(self): # TODO #41: Update tests - with self.assertRaises(IOError): - ab = AudioBook("normal") - ab.txt_to_json(123) - - def test_openDirectory(self): # TODO #41: Update tests - with self.assertRaises(IsADirectoryError): - ab = AudioBook("normal") - ab.txt_to_json("/") - - def test_fileDoesNotExist(self): # TODO #41: Update tests - with self.assertRaises(FileNotFoundError): - ab = AudioBook("normal") - ab.txt_to_json("oiawhgaiurgieurghergerg") - - def test_openDirectory(self): # noqa: F811 # TODO #41: Update tests - with self.assertRaises(IsADirectoryError): - ab = AudioBook() - ab.read_book("/") - - def test_fileDoesNotExist(self): # noqa: F811 # TODO #41: Update tests - with self.assertRaises(FileNotFoundError): - ab = AudioBook() - ab.read_book("oiawhgaiurgieurghergerg") + + def test_txt_to_json_pdf_miner(self): + self.assertEqual(ab.create_json_book("assets/sample.txt"), (output['txt'], {'book_name': 'sample', 'pages': 1})) + + def test_pdf_to_json_pdf_miner(self): + self.assertEqual(ab.create_json_book("assets/sample.pdf", extraction_engine="pdfminer"), (output['pdf'], {'book_name': 'sample', 'pages': 1})) + + def test_pdf_to_json_pypdf2(self): + self.assertEqual(ab.create_json_book("assets/sample.pdf", extraction_engine="pypdf2"), (output['pdf'], {'book_name': 'sample', 'pages': 1})) + + def test_odt_to_json(self): + self.assertEqual(ab.create_json_book("assets/sample.odt"), (output['odt'], {'book_name': 'sample', 'pages': 1})) + + def test_mobi_to_json(self): + self.assertEqual(ab.create_json_book("assets/sample.mobi"), (output['mobi'], {'book_name': 'sample', 'pages': 1})) + + # def test_docs_to_json(self): + # self.assertEqual(ab.create_json_book("assets/sample.doc"), (output['docs'], {'book_name': 'sample', 'pages': 1})) + + # def test_epub_to_json(self): # epub test failing + # self.assertEqual(ab.create_json_book("assets/sample.epub"), (output['epub'], {'book_name': 'sample', 'pages': 1}))