diff --git a/README.md b/README.md index 556939d..ce43e0f 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,6 @@

discord invite -pyttsx3 total contributors

diff --git a/audiobook/__init__.py b/audiobook/__init__.py index 86c95fc..b2a3a40 100644 --- a/audiobook/__init__.py +++ b/audiobook/__init__.py @@ -1 +1 @@ -from audiobook.main import AudioBook \ No newline at end of file +from audiobook.main import AudioBook diff --git a/audiobook/main.py b/audiobook/main.py index 3af7d55..1519923 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -1,21 +1,16 @@ import os -from tqdm import tqdm import pyttsx3 import logging -logger = logging.getLogger("PyPDF2") -logger.setLevel(logging.INFO) +from tqdm import tqdm -from audiobook.utils import response_to_text from audiobook.utils import speak_text -from audiobook.utils import text_preprocessing from audiobook.utils import load_json from audiobook.utils import write_json_file from audiobook.utils import pdf_to_json from audiobook.utils import txt_to_json from audiobook.utils import mobi_to_json -from audiobook.utils import docs_to_json from audiobook.utils import epub_to_json from audiobook.utils import html_to_json @@ -23,6 +18,8 @@ from audiobook.config import speed_dict from audiobook.config import supported_file_types +logger = logging.getLogger("PyPDF2") +logger.setLevel(logging.INFO) expand_usr = os.path.expanduser("~") BOOK_DIR = os.path.join(expand_usr, "audiobook/library") @@ -44,7 +41,7 @@ class AudioBook: save_json_to_audio: save .mp3 audios from a json file in a folder save_book_audio: saves audio files in folder read_book: reads the book - + sample usage: ab = AudioBook(speed="normal") ab.read_book(file_path, password="abcd") @@ -62,48 +59,49 @@ def get_library(self): return "You have no books in your library" print("You Have total {} books in your library".format(len(total_books))) return total_books - + def create_json_book(self, input_book_path, password=None): """ method to create json book from input file it calls respective method based on file format """ json_filename = os.path.basename(input_book_path).split(".")[0] + ".json" - + if os.path.exists(os.path.join(BOOK_DIR, json_filename)): + metadata = {} print("Book already exists in library, reading from library") json_book = load_json(os.path.join(BOOK_DIR, json_filename)) - pages = len(json_book) - return json_book, pages - + metadata["pages"] = len(json_book) + return json_book, metadata + elif input_book_path.endswith(".pdf"): - json_book, pages = pdf_to_json(input_book_path, password) + json_book, metadata = pdf_to_json(input_book_path, password) elif input_book_path.endswith(".txt"): - json_book, pages = txt_to_json(input_book_path) + json_book, metadata = txt_to_json(input_book_path) elif input_book_path.endswith(".epub"): - json_book, pages = epub_to_json(input_book_path) + json_book, metadata = epub_to_json(input_book_path) elif input_book_path.endswith(".mobi"): - json_book, pages = mobi_to_json(input_book_path) + json_book, metadata = mobi_to_json(input_book_path) elif input_book_path.startswith("http"): - json_book, pages = html_to_json(input_book_path) - + json_book, metadata = html_to_json(input_book_path) + write_json_file(json_book, os.path.join(BOOK_DIR, json_filename)) - return json_book, pages + return json_book, metadata def save_audio(self, input_book_path, password=None, save_page_wise=False): """ method to save audio files in folder """ - json_book, _ = self.create_json_book(input_book_path, password) - - book_name = os.path.basename(input_book_path).split(".")[0] + json_book, metadata = self.create_json_book(input_book_path, password) + + book_name = metadata['book_name'] os.makedirs(book_name, exist_ok=True) - + print('Saving audio files in folder: {}'.format(book_name)) - + if save_page_wise: for page_num, text in tqdm(json_book.items()): - self.engine.save_to_file(text, os.path.join(book_name, - book_name + - "_page_" + - (str(page_num)) + + self.engine.save_to_file(text, os.path.join(book_name, + book_name + + "_page_" + + (str(page_num)) + ".mp3")) self.engine.runAndWait() @@ -111,14 +109,16 @@ def save_audio(self, input_book_path, password=None, save_page_wise=False): all_text = " ".join([text for text in json_book.values()]) self.engine.save_to_file(all_text, os.path.join(book_name, book_name + ".mp3")) self.engine.runAndWait() - + def read_book(self, input_book_path, password=None): - """ method to read the book - + """ method to read the book + input_book_path: filepath, url path or book name """ - json_book, pages = self.create_json_book(input_book_path, password) - + json_book, metadata = self.create_json_book(input_book_path, password) + + pages = metadata["pages"] + speak_text(self.engine, f"The book has total {str(pages)} pages!") speak_text(self.engine, "Please enter the page number: ", display=False) start_page = int(input("Please enter the page number: ")) - 1 diff --git a/audiobook/utils.py b/audiobook/utils.py index 0be392c..fcca5f3 100644 --- a/audiobook/utils.py +++ b/audiobook/utils.py @@ -1,10 +1,12 @@ -from bs4 import BeautifulSoup import re +import os +import mobi import json import PyPDF2 import ebooklib from ebooklib import epub -import mobi +from bs4 import BeautifulSoup + import html2text regex = re.compile(r'[\n\r\t]') @@ -21,34 +23,92 @@ def write_json_file(json_data, filename): def text_preprocessing(input_text): """ function to preprocess text """ - preprocessed_text = regex.sub("", input_text) + preprocessed_text = regex.sub("", input_text) preprocessed_text = re.sub(' +', ' ', preprocessed_text) return preprocessed_text - + +def response_to_text(chapter): + """ fuction to convert response to text + + required for epub files + maybe required for html files + """ + soup = BeautifulSoup(chapter, 'html.parser') + extracted_text = [para.get_text() for para in soup.find_all('p')] + extracted_text = ' '.join(extracted_text) + preprocessed_text = text_preprocessing(extracted_text) + return preprocessed_text + +def speak_text(engine, text, display=True): + """ function to speak text and display it """ + if display: + print(text) + engine.say(text) + engine.runAndWait() + +def mobi_to_json(input_book_path): + """ sub method to create json book from mobi file """ + metadata = {} + json_book = {} + book_name = os.path.basename(input_book_path).split(".")[0] + tempdir, filepath = mobi.extract(input_book_path) + with open(filepath, "r", encoding='utf-8') as fp: + content = fp.read() + book_data = html2text.html2text(content) + book_data = text_preprocessing(book_data) + + for i in range(0, len(book_data), 2000): + page_num = i // 2000 + json_book[str(page_num)] = book_data[i:i + 2000] + + metadata["pages"] = len(json_book) + metadata["book_name"] = book_name + return json_book, metadata + def pdf_to_json(input_book_path, password=None): """ sub method to create json book from pdf file""" + metadata = {} json_book = {} + book_name = os.path.basename(input_book_path).split(".")[0] with open(input_book_path, "rb") as fp: pdfReader = PyPDF2.PdfFileReader(fp) if pdfReader.isEncrypted: pdfReader.decrypt(password) + + information = pdfReader.getDocumentInfo() + + metadata["author"] = information.author + metadata["creator"] = information.creator + metadata["producer"] = information.producer + metadata["subject"] = information.subject + metadata["title"] = information.title + metadata["pages"] = pdfReader.numPages + metadata["book_name"] = book_name + pages = pdfReader.numPages for page_num in range(0, pages): pageObj = pdfReader.getPage(page_num) extracted_text = pageObj.extractText() json_book[str(page_num)] = extracted_text - return json_book, pages + + return json_book, metadata def txt_to_json(input_book_path): """ sub method to create json book from txt file """ json_book = {} + metadata = {} + book_name = os.path.basename(input_book_path).split(".")[0] with open(input_book_path, "r") as fp: file_txt_data = fp.read() file_txt_data = text_preprocessing(file_txt_data) + for i in range(0, len(file_txt_data), 2000): page_num = i // 2000 json_book[str(page_num)] = file_txt_data[i:i + 2000] - return json_book, len(json_book) + + metadata["pages"] = len(json_book) + metadata["book_name"] = book_name + return json_book, metadata def docs_to_json(input_book_path): @@ -56,18 +116,24 @@ def docs_to_json(input_book_path): pass def epub_to_json(input_book_path): + metadata = {} json_book = {} + book_name = os.path.basename(input_book_path).split(".")[0] book = epub.read_epub(input_book_path) text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)]) for i in range(1, len(text) + 1, 2000): page_num = i // 2000 json_book[str(page_num)] = text[i:i + 2000] - return json_book, len(json_book) + metadata["pages"] = len(json_book) + metadata["book_name"] = book_name + return json_book, metadata def html_to_json(url): """ method to create json book from web article """ + metadata = {} json_book = {} + book_name = os.path.basename(url).split(".")[0] article_scraper = ArticleWebScraper(url) page_data = article_scraper.get_page_data() page_data = text_preprocessing(page_data) @@ -75,40 +141,8 @@ def html_to_json(url): page_num = i // 2000 json_book[str(page_num)] = page_data[i:i + 2000] - return json_book, len(json_book) - -def response_to_text(chapter): - """ fuction to convert response to text - - required for epub files - maybe required for html files - """ - soup = BeautifulSoup(chapter, 'html.parser') - extracted_text = [para.get_text() for para in soup.find_all('p')] - extracted_text = ' '.join(extracted_text) - preprocessed_text = text_preprocessing(extracted_text) - return preprocessed_text - + metadata["pages"] = len(json_book) + metadata["book_name"] = book_name + return json_book, metadata -def speak_text(engine, text, display=True): - """ function to speak text and display it """ - if display: - print(text) - engine.say(text) - engine.runAndWait() - - -def mobi_to_json(input_book_path): - """ sub method to create json book from mobi file """ - json_book = {} - tempdir, filepath = mobi.extract(input_book_path) - with open(filepath, "r", encoding='utf-8') as fp: - content = fp.read() - book_data = html2text.html2text(content) - book_data = text_preprocessing(book_data) - - for i in range(0, len(book_data), 2000): - page_num = i // 2000 - json_book[str(page_num)] = book_data[i:i + 2000] - return json_book, len(json_book)