From 58a5c9dad114622faeb5715e1f2729d848778323 Mon Sep 17 00:00:00 2001 From: Andrzej Szulc Date: Thu, 20 Oct 2022 15:32:28 +0200 Subject: [PATCH 1/3] Add support for ".docx" and ".doc" files, add missing requirements to requirements.txt --- audiobook/config.py | 2 +- audiobook/main.py | 3 +++ audiobook/utils.py | 15 +++++++++++++-- requirements.txt | 3 +++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/audiobook/config.py b/audiobook/config.py index 972f5e9..aaee27f 100644 --- a/audiobook/config.py +++ b/audiobook/config.py @@ -1,4 +1,4 @@ -supported_file_types = (".pdf", ".txt", ".epub") +supported_file_types = (".pdf", ".txt", ".epub", ".docx", ".doc") speed_dict = { "slow": 100, "normal": 150, diff --git a/audiobook/main.py b/audiobook/main.py index 1519923..cd9ef04 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -13,6 +13,7 @@ from audiobook.utils import mobi_to_json from audiobook.utils import epub_to_json from audiobook.utils import html_to_json +from audiobook.utils import docs_to_json from audiobook.config import speed_dict @@ -82,6 +83,8 @@ def create_json_book(self, input_book_path, password=None): json_book, metadata = mobi_to_json(input_book_path) elif input_book_path.startswith("http"): json_book, metadata = html_to_json(input_book_path) + elif input_book_path.endswith(".docx") or input_book_path.endswith(".doc"): + json_book, metadata = docs_to_json(input_book_path) write_json_file(json_book, os.path.join(BOOK_DIR, json_filename)) diff --git a/audiobook/utils.py b/audiobook/utils.py index fcca5f3..b368efc 100644 --- a/audiobook/utils.py +++ b/audiobook/utils.py @@ -1,5 +1,7 @@ import re import os + +import docx2txt import mobi import json import PyPDF2 @@ -110,10 +112,19 @@ def txt_to_json(input_book_path): metadata["book_name"] = book_name return json_book, metadata - def docs_to_json(input_book_path): """ sub method to create json book from docs file """ - pass + metadata = {} + json_book = {} + book_name = os.path.basename(input_book_path).split(".")[0] + book_data = docx2txt.process(input_book_path) + for i in range(0, len(book_data), 2000): + page_num = i // 2000 + json_book[str(page_num)] = book_data[i:i + 2000] + + metadata["pages"] = len(json_book) + metadata["book_name"] = book_name + return json_book, metadata def epub_to_json(input_book_path): metadata = {} diff --git a/requirements.txt b/requirements.txt index b2202bc..5789c19 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,6 @@ ebooklib==0.17.1 beautifulsoup4==4.11.1 html2text==2020.1.16 mobi==0.3.3 +docx2txt>=0.8 +requests>=2.28.1 +tqdm>=4.64.1 From 52cd149059523225f7cb4a3d500aea398e171623 Mon Sep 17 00:00:00 2001 From: Andrzej Szulc Date: Thu, 20 Oct 2022 15:47:35 +0200 Subject: [PATCH 2/3] Elif case optimization as suggested in PR --- audiobook/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audiobook/main.py b/audiobook/main.py index cd9ef04..b6688c2 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -83,7 +83,7 @@ def create_json_book(self, input_book_path, password=None): json_book, metadata = mobi_to_json(input_book_path) elif input_book_path.startswith("http"): json_book, metadata = html_to_json(input_book_path) - elif input_book_path.endswith(".docx") or input_book_path.endswith(".doc"): + elif input_book_path.endswith(".docx", ".doc"): json_book, metadata = docs_to_json(input_book_path) write_json_file(json_book, os.path.join(BOOK_DIR, json_filename)) From 669c6d3a4fafb89405dada93d9875fd5ac6c3339 Mon Sep 17 00:00:00 2001 From: Andrzej Szulc Date: Thu, 20 Oct 2022 15:55:28 +0200 Subject: [PATCH 3/3] Elif case optimization as suggested in PR, second attempt --- audiobook/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audiobook/main.py b/audiobook/main.py index b6688c2..4585f88 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -83,7 +83,7 @@ def create_json_book(self, input_book_path, password=None): json_book, metadata = mobi_to_json(input_book_path) elif input_book_path.startswith("http"): json_book, metadata = html_to_json(input_book_path) - elif input_book_path.endswith(".docx", ".doc"): + elif input_book_path.endswith((".docx", ".doc")): json_book, metadata = docs_to_json(input_book_path) write_json_file(json_book, os.path.join(BOOK_DIR, json_filename))