From c239886736be6782d4817479c50e3e097d7a64e5 Mon Sep 17 00:00:00 2001
From: Deepak Raj <54245038+codePerfectPlus@users.noreply.github.com>
Date: Sun, 16 Oct 2022 21:12:59 +0530
Subject: [PATCH] support: web article support added

---
 audiobook/article_web_scraper.py |  33 ++----
 audiobook/main.py                | 183 ++++++++-----------------------
 audiobook/utils.py               |  65 +++++++++++
 3 files changed, 124 insertions(+), 157 deletions(-)
diff --git a/audiobook/article_web_scraper.py b/audiobook/article_web_scraper.py
index 13a990f..b87c9d0 100644
--- a/audiobook/article_web_scraper.py
+++ b/audiobook/article_web_scraper.py
@@ -26,25 +26,16 @@ def get_title_from_article (self):
         """ returns the <title> tag from the html page """
         return self.soup.title.text
     
-    def get_json_from_web_article (self):
+    def get_page_data(self):
         """ returns a json from a non-empty <article> tag """
-        if hasattr(self.soup, 'article') and self.soup.article is not None: 
-            article_text_tag_items = [
-                self.soup.article.findChildren(text_formatting , recursive=True) 
-                for text_formatting in html_text_formattings
-            ]
-
-            json_article = {}
-            text_lines = []
-            # list(dict.fromkeys(lines))) removes duplicate words in same tag type
-            for article_text_tag_item in article_text_tag_items:
-                for article_text_tag in article_text_tag_item:
-                    text_line = list(dict.fromkeys([tag.string for tag in article_text_tag if tag.string is not None])) 
-                    text_lines += text_line
-            # list(dict.fromkeys(lines))) removes duplicate words among all tags
-            text_lines = list(dict.fromkeys(text_lines))
-            for num in range(0, len(text_lines)):
-                json_article[num] = text_lines[num]
-            return json_article, len(json_article)
-        else:
-            raise ValueError(f"<article> tag not found in {self.article_url}")
+        # Fetch the page; the timeout keeps a stalled server from hanging the caller.
+        response = requests.get(self.article_url, timeout=30)
+
+        # None signals that the page could not be fetched (any non-200 status).
+        if response.status_code != 200:
+            return None
+
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        # Join lines with a space so words on adjacent lines are not fused together.
+        return soup.get_text().replace("\n", " ")
diff --git a/audiobook/main.py b/audiobook/main.py
index d6bb078..45bb1be 100644
--- a/audiobook/main.py
+++ b/audiobook/main.py
@@ -1,10 +1,8 @@
+
 import os
+from re import I
 from tqdm import tqdm
-import PyPDF2
 import pyttsx3
-import ebooklib
-from ebooklib import epub
-
 import logging
 logger = logging.getLogger("PyPDF2")
 logger.setLevel(logging.INFO)
@@ -15,7 +13,13 @@
 from audiobook.utils import load_json
 from audiobook.utils import write_json_file
 
-from audiobook.article_web_scraper import ArticleWebScraper
+from audiobook.utils import pdf_to_json
+from audiobook.utils import txt_to_json
+from audiobook.utils import mobi_to_json
+from audiobook.utils import docs_to_json
+from audiobook.utils import epub_to_json
+from audiobook.utils import html_to_json
+
 
 from audiobook.config import speed_dict
 from audiobook.config import supported_file_types
@@ -35,13 +39,13 @@ class AudioBook:
     methods:
         file_check: checks if file exists
         pdf_to_json: converts pdf to json format
+        html_to_json (audiobook.utils): converts web article to json
         create_json_book: Creates json book from input file by calling respective method
         read_json: reads a json file
         save_json_to_audio: save .mp3 audios from a json file in a folder
         save_book_audio: saves audio files in folder
         read_book: reads the book
-        read_web_article: read web article from a given url
-        save_web_article_audio: save web article to a .mp3 file from a given url
+        
     sample usage:
         ab = AudioBook(speed="normal")
         ab.read_book(file_path, password="abcd")
@@ -51,9 +55,7 @@ def __init__(self, speed="normal", volume=1.0):
         self.engine = pyttsx3.init()
         self.engine.setProperty("rate", speed_dict[speed])
         self.engine.setProperty("volume", volume)
-        # set escape key to stop pyttsx3
 
-    # get all books in library
     def get_library(self):
         """ get all books in library """
         total_books = os.listdir(BOOK_DIR)
@@ -61,112 +63,53 @@ def get_library(self):
             return "You have no books in your library"
         print("You Have total {} books in your library".format(len(total_books)))
         return total_books
-
-    def file_check(self, input_file_path):
-        """ checks file format and if file exists """
-        if not os.path.exists(input_file_path):
-            raise FileNotFoundError("File not found!")
-
-        if not input_file_path.endswith(supported_file_types):
-            raise IsADirectoryError("File format not supported!")
-
-    def pdf_to_json(self, input_file_path, password=None):
-        """ sub method to create json book from pdf file"""
-        json_book = {}
-        with open(input_file_path, "rb") as fp:
-            pdfReader = PyPDF2.PdfFileReader(fp)
-            if pdfReader.isEncrypted:
-                logging.info("File is encrypted, trying to decrypt...")
-                pdfReader.decrypt(password)
-            pages = pdfReader.numPages
-            for page_num in range(0, pages):
-                pageObj = pdfReader.getPage(page_num)
-                extracted_text = pageObj.extractText()
-                json_book[str(page_num)] = extracted_text
-        return json_book, pages
-
-    def txt_to_json(self, input_file_path):
-        """ sub method to create json book from txt file """
-        json_book = {}
-        with open(input_file_path, "r") as fp:
-            file_txt_data = fp.read()
-        for i in range(0, len(file_txt_data), 2000):
-            page_num = i // 2000
-            json_book[str(page_num)] = file_txt_data[i:i + 2000]
-        return json_book, len(json_book)
-
-    def mobi_to_json(self, input_file_path):
-        """ sub method to create json book from mobi file """
-        pass
-
-    def docs_to_json(self, input_file_path):
-        """ sub method to create json book from docs file """
-        pass
-
-    def epub_to_json(self, input_file_path):
-        json_book = {}
-        book = epub.read_epub(input_file_path)
-        text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)])
-        for i in range(1, len(text) + 1, 2000):
-            page_num = i // 2000
-            json_book[str(page_num)] = text[i:i + 2000]
-
-        return json_book, len(json_book)
-
-    def create_json_book(self, input_file_path, password=None):
+    
+    def create_json_book(self, input_book_path, password=None):
         """ method to create json book from input file
             it calls respective method based on file format """
-        self.file_check(input_file_path)
-        filename = os.path.basename(input_file_path).split(".")[0] + ".json"
-
-        if input_file_path.endswith(".pdf"):
-            json_book, pages = self.pdf_to_json(input_file_path, password)
-        elif input_file_path.endswith(".txt"):
-            json_book, pages = self.txt_to_json(input_file_path)
-        elif input_file_path.endswith(".epub"):
-            json_book, pages = self.epub_to_json(input_file_path)
-
-        write_json_file(json_book, os.path.join(BOOK_DIR, filename))
-
-        return json_book, pages
-
-   def save_audio(self, input_file_path, password=None):
-        """ method to save audio files in folder """
-        self.file_check(input_file_path)
-
-        json_filename = os.path.basename(input_file_path).split(".")[0] + ".json"
-        book_name = os.path.basename(input_file_path).split(".")[0]
-
-        # if json book already exists, load it from library
+        json_filename = os.path.basename(input_book_path).split(".")[0] + ".json"
+        
         if os.path.exists(os.path.join(BOOK_DIR, json_filename)):
-            print("Book already exists in library")
-            logging.info("Loading json book from {}".format(json_filename))
             json_book = load_json(os.path.join(BOOK_DIR, json_filename))
             pages = len(json_book)
-        else:
-            print("Creating your audiobook... Please wait...")
-            json_book, pages = self.create_json_book(input_file_path, password)
+            return json_book, pages
+        elif input_book_path.startswith("http"):  # urls first: they may end in .pdf/.txt/.epub
+            json_book, pages = html_to_json(input_book_path)
+        elif input_book_path.endswith(".pdf"):
+            json_book, pages = pdf_to_json(input_book_path, password)
+        elif input_book_path.endswith(".txt"):
+            json_book, pages = txt_to_json(input_book_path)
+        elif input_book_path.endswith(".epub"):
+            json_book, pages = epub_to_json(input_book_path)
+        else:  # no converter for this input; fail clearly instead of UnboundLocalError
+            raise ValueError("File format not supported!")
+        write_json_file(json_book, os.path.join(BOOK_DIR, json_filename))
+        return json_book, pages
 
+    def save_audio(self, input_book_path, password=None):
+        """ method to save audio files in folder """
+        json_book, pages = self.create_json_book(input_book_path, password)
+        
+        book_name = os.path.basename(input_book_path).split(".")[0]
         os.makedirs(book_name, exist_ok=True)
+        
         print('Saving audio files in folder: {}'.format(book_name))
         for page_num, text in tqdm(json_book.items()):
-            self.engine.save_to_file(text, os.path.join(book_name, book_name + "_page_" + (str(page_num)) + ".mp3"))
+            self.engine.save_to_file(text, os.path.join(book_name, 
+                                                        book_name + 
+                                                        "_page_" + 
+                                                        (str(page_num)) + 
+                                                        ".mp3"))
             self.engine.runAndWait()
     
-    def read_book(self, input_file_path, password=None):  # argument to be added, save_audio=False, save_json_book=False
-        """ method to read the book """
-        self.file_check(input_file_path)
-        json_filename = os.path.basename(input_file_path).split(".")[0] + ".json"
-
-        # if json book already exists, load it from library
-        if os.path.exists(os.path.join(BOOK_DIR, json_filename)):
-            logging.info("Loading json book from {}".format(json_filename))
-            json_book = load_json(os.path.join(BOOK_DIR, json_filename))
-            pages = len(json_book)
-        else:
-            print("Creating your audiobook... Please wait...")
-            json_book, pages = self.create_json_book(input_file_path, password)
-
+    def read_book(self, input_book_path, password=None):
+        """ method to read the book 
+        
+        input_book_path: filepath, url path or book name
+        """
+        
+        json_book, pages = self.create_json_book(input_book_path, password)
+        
         speak_text(self.engine, f"The book has total {str(pages)} pages!")
         speak_text(self.engine, "Please enter the page number: ", display=False)
         start_page = int(input("Please enter the page number: ")) - 1
@@ -202,35 +145,3 @@ def read_book(self, input_file_path, password=None):  # argument to be added, sa
             else:
                 user_input = input("Please Select an option: \n 1. Type 'r' to read again: \n 2. Type 'p' to read previous page\n 3. Type 'n' to read next page\n 4. Type 'q' to quit:\n 5. Type page number to read that page:\n")
                 continue
-
-
-    def save_json_to_audio(self, json, audio_name):
-        """ save json to a list of file in a folder having audio_name, one for each page """
-        os.makedirs(audio_name, exist_ok=True)
-        logger.info('Saving audio files in folder: {}'.format(audio_name))
-        for page_num, text in json.items():
-            self.engine.save_to_file(text, os.path.join(audio_name, audio_name + "_page_" + (str(page_num+1) + ".mp3")))
-            self.engine.runAndWait()
-
-    def read_web_article(self, article_url):
-        """ read web article from a article_url containing an <article> tag """
-        ws = ArticleWebScraper(article_url)
-        json_article, pages = ws.get_json_from_web_article()
-        if len(json_article) > 0:
-            self.read_json(json_article, pages, "article")
-        else:
-            raise ValueError("<article> tag has no text.")
-    
-    def save_web_article_audio(self, article_url):
-        """ save web article from a article_url containing an <article> tag """
-        ws = ArticleWebScraper(article_url)
-        json_article, _ = ws.get_json_from_web_article()
-        if len(json_article) > 0:
-            title = ws.get_title_from_article()
-            folder_name = input(f"Choose name for article \"{title}\". It will be stored in {os.getcwd()}\n")
-            self.save_json_to_audio(json_article, folder_name)
-        else:
-            raise ValueError("<article> tag is empty.")
-        
-
-        
diff --git a/audiobook/utils.py b/audiobook/utils.py
index 2419e66..9a6aba6 100644
--- a/audiobook/utils.py
+++ b/audiobook/utils.py
@@ -1,9 +1,13 @@
 from bs4 import BeautifulSoup
 import re
 import json
+import PyPDF2
+import ebooklib
+from ebooklib import epub
 
 regex = re.compile(r'[\n\r\t]')
 
+from audiobook.article_web_scraper import ArticleWebScraper
 
 def load_json(filename):
     with open(filename, "r") as fp:
@@ -20,7 +24,59 @@ def text_preprocessing(input_text):
     preprocessed_text = [re.sub(' +', ' ', t) for t in preprocessed_text]
     return preprocessed_text
 
+def pdf_to_json(input_book_path, password=None):
+    """ create a json book ({page number: page text}) from a pdf file; returns (json_book, pages) """
+    json_book = {}
+    with open(input_book_path, "rb") as fp:
+        pdfReader = PyPDF2.PdfFileReader(fp)
+        if pdfReader.isEncrypted:  # password is only used for encrypted files
+            pdfReader.decrypt(password)
+        pages = pdfReader.numPages
+        for page_num in range(0, pages):
+            pageObj = pdfReader.getPage(page_num)
+            extracted_text = pageObj.extractText()
+            json_book[str(page_num)] = extracted_text
+    return json_book, pages
 
+def txt_to_json(input_book_path):
+    """ create a json book from a txt file, 2000 characters per page; returns (json_book, pages) """
+    json_book = {}
+    with open(input_book_path, "r") as fp:
+        file_txt_data = fp.read()
+    for i in range(0, len(file_txt_data), 2000):
+        page_num = i // 2000
+        json_book[str(page_num)] = file_txt_data[i:i + 2000]
+    return json_book, len(json_book)
+
+def mobi_to_json(input_book_path):
+    """ sub method to create json book from mobi file (not implemented yet) """
+    raise NotImplementedError("mobi files are not supported yet")
+
+def docs_to_json(input_book_path):
+    """ sub method to create json book from docs file (not implemented yet) """
+    raise NotImplementedError("docs files are not supported yet")
+
+def epub_to_json(input_book_path):
+    """ create a json book from an epub file, 2000 characters per page; returns (json_book, pages) """
+    json_book = {}
+    book = epub.read_epub(input_book_path)
+    text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)])
+    for i in range(0, len(text), 2000):  # start at 0 so the first character is not dropped
+        json_book[str(i // 2000)] = text[i:i + 2000]
+
+    return json_book, len(json_book)
+
+def html_to_json(url):
+    """ create a json book from the web page at url, 2000 characters per page; returns (json_book, pages) """
+    page_data = ArticleWebScraper(url).get_page_data()
+    if page_data is None:  # get_page_data() returns None on a non-200 response
+        raise ValueError(f"Could not fetch web article from {url}")
+    json_book = {}
+    for i in range(0, len(page_data), 2000):
+        json_book[str(i // 2000)] = page_data[i:i + 2000]
+
+    return json_book, len(json_book)
+    
 def response_to_text(chapter):
     """ fuction to convert response to text
 
@@ -40,3 +96,12 @@ def speak_text(engine, text, display=True):
         print(text)
     engine.say(text)
     engine.runAndWait()
+
+
+# def file_check(self, input_book_path):
+#     """ checks file format and if file exists """
+#     if not os.path.exists(input_book_path):
+#         raise FileNotFoundError("File not found!")
+
+#     if not input_book_path.endswith(supported_file_types):
+#         raise IsADirectoryError("File format not supported!")
\ No newline at end of file