From c239886736be6782d4817479c50e3e097d7a64e5 Mon Sep 17 00:00:00 2001 From: Deepak Raj <54245038+codePerfectPlus@users.noreply.github.com> Date: Sun, 16 Oct 2022 21:12:59 +0530 Subject: [PATCH] support: web article support added --- audiobook/article_web_scraper.py | 33 ++---- audiobook/main.py | 183 ++++++++----------------------- audiobook/utils.py | 65 +++++++++++ 3 files changed, 124 insertions(+), 157 deletions(-) diff --git a/audiobook/article_web_scraper.py b/audiobook/article_web_scraper.py index 13a990f..b87c9d0 100644 --- a/audiobook/article_web_scraper.py +++ b/audiobook/article_web_scraper.py @@ -26,25 +26,16 @@ def get_title_from_article (self): """ returns the tag from the html page """ return self.soup.title.text - def get_json_from_web_article (self): + def get_page_data(self): """ returns a json from a non-empty <article> tag """ - if hasattr(self.soup, 'article') and self.soup.article is not None: - article_text_tag_items = [ - self.soup.article.findChildren(text_formatting , recursive=True) - for text_formatting in html_text_formattings - ] - - json_article = {} - text_lines = [] - # list(dict.fromkeys(lines))) removes duplicate words in same tag type - for article_text_tag_item in article_text_tag_items: - for article_text_tag in article_text_tag_item: - text_line = list(dict.fromkeys([tag.string for tag in article_text_tag if tag.string is not None])) - text_lines += text_line - # list(dict.fromkeys(lines))) removes duplicate words among all tags - text_lines = list(dict.fromkeys(text_lines)) - for num in range(0, len(text_lines)): - json_article[num] = text_lines[num] - return json_article, len(json_article) - else: - raise ValueError(f"<article> tag not found in {self.article_url}") + json_book = {} + response = requests.get(self.article_url) + + if response.status_code != 200: + return None + + soup = BeautifulSoup(response.content, "html.parser") + + text_data = soup.getText().replace("\n","") + + return text_data diff --git a/audiobook/main.py b/audiobook/main.py index d6bb078..45bb1be 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -1,10 +1,8 @@ + import os +from re import I from tqdm import tqdm -import PyPDF2 import pyttsx3 -import ebooklib -from ebooklib import epub - import logging logger = logging.getLogger("PyPDF2") logger.setLevel(logging.INFO) @@ -15,7 +13,13 @@ from audiobook.utils import load_json from audiobook.utils import write_json_file -from audiobook.article_web_scraper import ArticleWebScraper +from audiobook.utils import pdf_to_json +from audiobook.utils import txt_to_json +from audiobook.utils import mobi_to_json +from audiobook.utils import docs_to_json +from audiobook.utils import epub_to_json +from audiobook.utils import html_to_json + from audiobook.config import speed_dict from audiobook.config import supported_file_types @@ -35,13 +39,13 @@ class AudioBook: methods: file_check: checks if file exists pdf_to_json: converts pdf to json format + web_page_to_json: converts web article to json create_json_book: Creates json book from input file by calling respective method read_json: reads a json file save_json_to_audio: save .mp3 audios from a json file in a folder save_book_audio: saves audio files in folder read_book: reads the book - read_web_article: read web article from a given url - save_web_article_audio: save web article to a .mp3 file from a given url + sample usage: ab = AudioBook(speed="normal") ab.read_book(file_path, password="abcd") @@ -51,9 +55,7 @@ def __init__(self, speed="normal", volume=1.0): self.engine = pyttsx3.init() self.engine.setProperty("rate", speed_dict[speed]) self.engine.setProperty("volume", volume) - # set escape key to stop pyttsx3 - # get all books in library def get_library(self): """ get all books in library """ total_books = os.listdir(BOOK_DIR) @@ -61,112 +63,53 @@ def get_library(self): return "You have no books in your library" print("You Have total {} books in your library".format(len(total_books))) return total_books - - def file_check(self, input_file_path): - """ checks file format and if file exists """ - if not os.path.exists(input_file_path): - raise FileNotFoundError("File not found!") - - if not input_file_path.endswith(supported_file_types): - raise IsADirectoryError("File format not supported!") - - def pdf_to_json(self, input_file_path, password=None): - """ sub method to create json book from pdf file""" - json_book = {} - with open(input_file_path, "rb") as fp: - pdfReader = PyPDF2.PdfFileReader(fp) - if pdfReader.isEncrypted: - logging.info("File is encrypted, trying to decrypt...") - pdfReader.decrypt(password) - pages = pdfReader.numPages - for page_num in range(0, pages): - pageObj = pdfReader.getPage(page_num) - extracted_text = pageObj.extractText() - json_book[str(page_num)] = extracted_text - return json_book, pages - - def txt_to_json(self, input_file_path): - """ sub method to create json book from txt file """ - json_book = {} - with open(input_file_path, "r") as fp: - file_txt_data = fp.read() - for i in range(0, len(file_txt_data), 2000): - page_num = i // 2000 - json_book[str(page_num)] = file_txt_data[i:i + 2000] - return json_book, len(json_book) - - def mobi_to_json(self, input_file_path): - """ sub method to create json book from mobi file """ - pass - - def docs_to_json(self, input_file_path): - """ sub method to create json book from docs file """ - pass - - def epub_to_json(self, input_file_path): - json_book = {} - book = epub.read_epub(input_file_path) - text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)]) - for i in range(1, len(text) + 1, 2000): - page_num = i // 2000 - json_book[str(page_num)] = text[i:i + 2000] - - return json_book, len(json_book) - - def create_json_book(self, input_file_path, password=None): + + def create_json_book(self, input_book_path, password=None): """ method to create json book from input file it calls respective method based on file format """ - self.file_check(input_file_path) - filename = os.path.basename(input_file_path).split(".")[0] + ".json" - - if input_file_path.endswith(".pdf"): - json_book, pages = self.pdf_to_json(input_file_path, password) - elif input_file_path.endswith(".txt"): - json_book, pages = self.txt_to_json(input_file_path) - elif input_file_path.endswith(".epub"): - json_book, pages = self.epub_to_json(input_file_path) - - write_json_file(json_book, os.path.join(BOOK_DIR, filename)) - - return json_book, pages - - def save_audio(self, input_file_path, password=None): - """ method to save audio files in folder """ - self.file_check(input_file_path) - - json_filename = os.path.basename(input_file_path).split(".")[0] + ".json" - book_name = os.path.basename(input_file_path).split(".")[0] - - # if json book already exists, load it from library + json_filename = os.path.basename(input_book_path).split(".")[0] + ".json" + if os.path.exists(os.path.join(BOOK_DIR, json_filename)): - print("Book already exists in library") - logging.info("Loading json book from {}".format(json_filename)) json_book = load_json(os.path.join(BOOK_DIR, json_filename)) pages = len(json_book) - else: - print("Creating your audiobook... Please wait...") - json_book, pages = self.create_json_book(input_file_path, password) + return json_book, pages + elif input_book_path.endswith(".pdf"): + json_book, pages = pdf_to_json(input_book_path, password) + elif input_book_path.endswith(".txt"): + json_book, pages = txt_to_json(input_book_path) + elif input_book_path.endswith(".epub"): + json_book, pages = epub_to_json(input_book_path) + elif input_book_path.startswith("http"): + json_book, pages = html_to_json(input_book_path) + + write_json_file(json_book, os.path.join(BOOK_DIR, json_filename)) + + return json_book, pages + def save_audio(self, input_book_path, password=None): + """ method to save audio files in folder """ + json_book, pages = self.create_json_book(input_book_path, password) + + book_name = os.path.basename(input_book_path).split(".")[0] os.makedirs(book_name, exist_ok=True) + print('Saving audio files in folder: {}'.format(book_name)) for page_num, text in tqdm(json_book.items()): - self.engine.save_to_file(text, os.path.join(book_name, book_name + "_page_" + (str(page_num)) + ".mp3")) + self.engine.save_to_file(text, os.path.join(book_name, + book_name + + "_page_" + + (str(page_num)) + + ".mp3")) self.engine.runAndWait() - def read_book(self, input_file_path, password=None): # argument to be added, save_audio=False, save_json_book=False - """ method to read the book """ - self.file_check(input_file_path) - json_filename = os.path.basename(input_file_path).split(".")[0] + ".json" - - # if json book already exists, load it from library - if os.path.exists(os.path.join(BOOK_DIR, json_filename)): - logging.info("Loading json book from {}".format(json_filename)) - json_book = load_json(os.path.join(BOOK_DIR, json_filename)) - pages = len(json_book) - else: - print("Creating your audiobook... Please wait...") - json_book, pages = self.create_json_book(input_file_path, password) - + def read_book(self, input_book_path, password=None): + """ method to read the book + + input_book_path: filepath, url path or book name + """ + + json_book, pages = self.create_json_book(input_book_path, password) + speak_text(self.engine, f"The book has total {str(pages)} pages!") speak_text(self.engine, "Please enter the page number: ", display=False) start_page = int(input("Please enter the page number: ")) - 1 @@ -202,35 +145,3 @@ def read_book(self, input_file_path, password=None): # argument to be added, sa else: user_input = input("Please Select an option: \n 1. Type 'r' to read again: \n 2. Type 'p' to read previous page\n 3. Type 'n' to read next page\n 4. Type 'q' to quit:\n 5. Type page number to read that page:\n") continue - - - def save_json_to_audio(self, json, audio_name): - """ save json to a list of file in a folder having audio_name, one for each page """ - os.makedirs(audio_name, exist_ok=True) - logger.info('Saving audio files in folder: {}'.format(audio_name)) - for page_num, text in json.items(): - self.engine.save_to_file(text, os.path.join(audio_name, audio_name + "_page_" + (str(page_num+1) + ".mp3"))) - self.engine.runAndWait() - - def read_web_article(self, article_url): - """ read web article from a article_url containing an <article> tag """ - ws = ArticleWebScraper(article_url) - json_article, pages = ws.get_json_from_web_article() - if len(json_article) > 0: - self.read_json(json_article, pages, "article") - else: - raise ValueError("<article> tag has no text.") - - def save_web_article_audio(self, article_url): - """ save web article from a article_url containing an <article> tag """ - ws = ArticleWebScraper(article_url) - json_article, _ = ws.get_json_from_web_article() - if len(json_article) > 0: - title = ws.get_title_from_article() - folder_name = input(f"Choose name for article \"{title}\". It will be stored in {os.getcwd()}\n") - self.save_json_to_audio(json_article, folder_name) - else: - raise ValueError("<article> tag is empty.") - - - diff --git a/audiobook/utils.py b/audiobook/utils.py index 2419e66..9a6aba6 100644 --- a/audiobook/utils.py +++ b/audiobook/utils.py @@ -1,9 +1,13 @@ from bs4 import BeautifulSoup import re import json +import PyPDF2 +import ebooklib +from ebooklib import epub regex = re.compile(r'[\n\r\t]') +from audiobook.article_web_scraper import ArticleWebScraper def load_json(filename): with open(filename, "r") as fp: @@ -20,7 +24,59 @@ def text_preprocessing(input_text): preprocessed_text = [re.sub(' +', ' ', t) for t in preprocessed_text] return preprocessed_text +def pdf_to_json(self, input_book_path, password=None): + """ sub method to create json book from pdf file""" + json_book = {} + with open(input_book_path, "rb") as fp: + pdfReader = PyPDF2.PdfFileReader(fp) + if pdfReader.isEncrypted: + pdfReader.decrypt(password) + pages = pdfReader.numPages + for page_num in range(0, pages): + pageObj = pdfReader.getPage(page_num) + extracted_text = pageObj.extractText() + json_book[str(page_num)] = extracted_text + return json_book, pages +def txt_to_json(self, input_book_path): + """ sub method to create json book from txt file """ + json_book = {} + with open(input_book_path, "r") as fp: + file_txt_data = fp.read() + for i in range(0, len(file_txt_data), 2000): + page_num = i // 2000 + json_book[str(page_num)] = file_txt_data[i:i + 2000] + return json_book, len(json_book) + +def mobi_to_json(self, input_book_path): + """ sub method to create json book from mobi file """ + pass + +def docs_to_json(self, input_book_path): + """ sub method to create json book from docs file """ + pass + +def epub_to_json(self, input_book_path): + json_book = {} + book = epub.read_epub(input_book_path) + text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)]) + for i in range(1, len(text) + 1, 2000): + page_num = i // 2000 + json_book[str(page_num)] = text[i:i + 2000] + + return json_book, len(json_book) + +def html_to_json(self, url): + """ method to create json book from web article """ + json_book = {} + article_scraper = ArticleWebScraper(url) + page_data = article_scraper.get_page_data() + for i in range(0, len(page_data), 2000): + page_num = i // 2000 + json_book[str(page_num)] = page_data[i:i + 2000] + + return json_book, len(json_book) + def response_to_text(chapter): """ fuction to convert response to text @@ -40,3 +96,12 @@ def speak_text(engine, text, display=True): print(text) engine.say(text) engine.runAndWait() + + +# def file_check(self, input_book_path): +# """ checks file format and if file exists """ +# if not os.path.exists(input_book_path): +# raise FileNotFoundError("File not found!") + +# if not input_book_path.endswith(supported_file_types): +# raise IsADirectoryError("File format not supported!") \ No newline at end of file