Skip to content

Commit

Permalink
support: web article support added
Browse files Browse the repository at this point in the history
  • Loading branch information
codeperfectplus committed Oct 16, 2022
1 parent f8b85b0 commit c239886
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 157 deletions.
33 changes: 12 additions & 21 deletions audiobook/article_web_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,16 @@ def get_title_from_article (self):
""" returns the <title> tag from the html page """
return self.soup.title.text

def get_json_from_web_article (self):
def get_page_data(self):
""" returns a json from a non-empty <article> tag """
if hasattr(self.soup, 'article') and self.soup.article is not None:
article_text_tag_items = [
self.soup.article.findChildren(text_formatting , recursive=True)
for text_formatting in html_text_formattings
]

json_article = {}
text_lines = []
# list(dict.fromkeys(lines))) removes duplicate words in same tag type
for article_text_tag_item in article_text_tag_items:
for article_text_tag in article_text_tag_item:
text_line = list(dict.fromkeys([tag.string for tag in article_text_tag if tag.string is not None]))
text_lines += text_line
# list(dict.fromkeys(lines))) removes duplicate words among all tags
text_lines = list(dict.fromkeys(text_lines))
for num in range(0, len(text_lines)):
json_article[num] = text_lines[num]
return json_article, len(json_article)
else:
raise ValueError(f"<article> tag not found in {self.article_url}")
json_book = {}
response = requests.get(self.article_url)

if response.status_code != 200:
return None

soup = BeautifulSoup(response.content, "html.parser")

text_data = soup.getText().replace("\n","")

return text_data
183 changes: 47 additions & 136 deletions audiobook/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@

import os
from re import I
from tqdm import tqdm
import PyPDF2
import pyttsx3
import ebooklib
from ebooklib import epub

import logging
logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.INFO)
Expand All @@ -15,7 +13,13 @@
from audiobook.utils import load_json
from audiobook.utils import write_json_file

from audiobook.article_web_scraper import ArticleWebScraper
from audiobook.utils import pdf_to_json
from audiobook.utils import txt_to_json
from audiobook.utils import mobi_to_json
from audiobook.utils import docs_to_json
from audiobook.utils import epub_to_json
from audiobook.utils import html_to_json


from audiobook.config import speed_dict
from audiobook.config import supported_file_types
Expand All @@ -35,13 +39,13 @@ class AudioBook:
methods:
file_check: checks if file exists
pdf_to_json: converts pdf to json format
web_page_to_json: converts web article to json
create_json_book: Creates json book from input file by calling respective method
read_json: reads a json file
save_json_to_audio: save .mp3 audios from a json file in a folder
save_book_audio: saves audio files in folder
read_book: reads the book
read_web_article: read web article from a given url
save_web_article_audio: save web article to a .mp3 file from a given url
sample usage:
ab = AudioBook(speed="normal")
ab.read_book(file_path, password="abcd")
Expand All @@ -51,122 +55,61 @@ def __init__(self, speed="normal", volume=1.0):
self.engine = pyttsx3.init()
self.engine.setProperty("rate", speed_dict[speed])
self.engine.setProperty("volume", volume)
# set escape key to stop pyttsx3

# get all books in library
def get_library(self):
""" get all books in library """
total_books = os.listdir(BOOK_DIR)
if len(total_books) == 0:
return "You have no books in your library"
print("You Have total {} books in your library".format(len(total_books)))
return total_books

def file_check(self, input_file_path):
""" checks file format and if file exists """
if not os.path.exists(input_file_path):
raise FileNotFoundError("File not found!")

if not input_file_path.endswith(supported_file_types):
raise IsADirectoryError("File format not supported!")

def pdf_to_json(self, input_file_path, password=None):
""" sub method to create json book from pdf file"""
json_book = {}
with open(input_file_path, "rb") as fp:
pdfReader = PyPDF2.PdfFileReader(fp)
if pdfReader.isEncrypted:
logging.info("File is encrypted, trying to decrypt...")
pdfReader.decrypt(password)
pages = pdfReader.numPages
for page_num in range(0, pages):
pageObj = pdfReader.getPage(page_num)
extracted_text = pageObj.extractText()
json_book[str(page_num)] = extracted_text
return json_book, pages

def txt_to_json(self, input_file_path):
""" sub method to create json book from txt file """
json_book = {}
with open(input_file_path, "r") as fp:
file_txt_data = fp.read()
for i in range(0, len(file_txt_data), 2000):
page_num = i // 2000
json_book[str(page_num)] = file_txt_data[i:i + 2000]
return json_book, len(json_book)

def mobi_to_json(self, input_file_path):
""" sub method to create json book from mobi file """
pass

def docs_to_json(self, input_file_path):
""" sub method to create json book from docs file """
pass

def epub_to_json(self, input_file_path):
json_book = {}
book = epub.read_epub(input_file_path)
text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)])
for i in range(1, len(text) + 1, 2000):
page_num = i // 2000
json_book[str(page_num)] = text[i:i + 2000]

return json_book, len(json_book)

def create_json_book(self, input_file_path, password=None):

def create_json_book(self, input_book_path, password=None):
""" method to create json book from input file
it calls respective method based on file format """
self.file_check(input_file_path)
filename = os.path.basename(input_file_path).split(".")[0] + ".json"

if input_file_path.endswith(".pdf"):
json_book, pages = self.pdf_to_json(input_file_path, password)
elif input_file_path.endswith(".txt"):
json_book, pages = self.txt_to_json(input_file_path)
elif input_file_path.endswith(".epub"):
json_book, pages = self.epub_to_json(input_file_path)

write_json_file(json_book, os.path.join(BOOK_DIR, filename))

return json_book, pages

def save_audio(self, input_file_path, password=None):
""" method to save audio files in folder """
self.file_check(input_file_path)

json_filename = os.path.basename(input_file_path).split(".")[0] + ".json"
book_name = os.path.basename(input_file_path).split(".")[0]

# if json book already exists, load it from library
json_filename = os.path.basename(input_book_path).split(".")[0] + ".json"

if os.path.exists(os.path.join(BOOK_DIR, json_filename)):
print("Book already exists in library")
logging.info("Loading json book from {}".format(json_filename))
json_book = load_json(os.path.join(BOOK_DIR, json_filename))
pages = len(json_book)
else:
print("Creating your audiobook... Please wait...")
json_book, pages = self.create_json_book(input_file_path, password)
return json_book, pages
elif input_book_path.endswith(".pdf"):
json_book, pages = pdf_to_json(input_book_path, password)
elif input_book_path.endswith(".txt"):
json_book, pages = txt_to_json(input_book_path)
elif input_book_path.endswith(".epub"):
json_book, pages = epub_to_json(input_book_path)
elif input_book_path.startswith("http"):
json_book, pages = html_to_json(input_book_path)

write_json_file(json_book, os.path.join(BOOK_DIR, json_filename))

return json_book, pages

def save_audio(self, input_book_path, password=None):
""" method to save audio files in folder """
json_book, pages = self.create_json_book(input_book_path, password)

book_name = os.path.basename(input_book_path).split(".")[0]
os.makedirs(book_name, exist_ok=True)

print('Saving audio files in folder: {}'.format(book_name))
for page_num, text in tqdm(json_book.items()):
self.engine.save_to_file(text, os.path.join(book_name, book_name + "_page_" + (str(page_num)) + ".mp3"))
self.engine.save_to_file(text, os.path.join(book_name,
book_name +
"_page_" +
(str(page_num)) +
".mp3"))
self.engine.runAndWait()

def read_book(self, input_file_path, password=None): # argument to be added, save_audio=False, save_json_book=False
""" method to read the book """
self.file_check(input_file_path)
json_filename = os.path.basename(input_file_path).split(".")[0] + ".json"

# if json book already exists, load it from library
if os.path.exists(os.path.join(BOOK_DIR, json_filename)):
logging.info("Loading json book from {}".format(json_filename))
json_book = load_json(os.path.join(BOOK_DIR, json_filename))
pages = len(json_book)
else:
print("Creating your audiobook... Please wait...")
json_book, pages = self.create_json_book(input_file_path, password)

def read_book(self, input_book_path, password=None):
""" method to read the book
input_book_path: filepath, url path or book name
"""

json_book, pages = self.create_json_book(input_book_path, password)

speak_text(self.engine, f"The book has total {str(pages)} pages!")
speak_text(self.engine, "Please enter the page number: ", display=False)
start_page = int(input("Please enter the page number: ")) - 1
Expand Down Expand Up @@ -202,35 +145,3 @@ def read_book(self, input_file_path, password=None): # argument to be added, sa
else:
user_input = input("Please Select an option: \n 1. Type 'r' to read again: \n 2. Type 'p' to read previous page\n 3. Type 'n' to read next page\n 4. Type 'q' to quit:\n 5. Type page number to read that page:\n")
continue


def save_json_to_audio(self, json, audio_name):
""" save json to a list of file in a folder having audio_name, one for each page """
os.makedirs(audio_name, exist_ok=True)
logger.info('Saving audio files in folder: {}'.format(audio_name))
for page_num, text in json.items():
self.engine.save_to_file(text, os.path.join(audio_name, audio_name + "_page_" + (str(page_num+1) + ".mp3")))
self.engine.runAndWait()

def read_web_article(self, article_url):
""" read web article from a article_url containing an <article> tag """
ws = ArticleWebScraper(article_url)
json_article, pages = ws.get_json_from_web_article()
if len(json_article) > 0:
self.read_json(json_article, pages, "article")
else:
raise ValueError("<article> tag has no text.")

def save_web_article_audio(self, article_url):
""" save web article from a article_url containing an <article> tag """
ws = ArticleWebScraper(article_url)
json_article, _ = ws.get_json_from_web_article()
if len(json_article) > 0:
title = ws.get_title_from_article()
folder_name = input(f"Choose name for article \"{title}\". It will be stored in {os.getcwd()}\n")
self.save_json_to_audio(json_article, folder_name)
else:
raise ValueError("<article> tag is empty.")



65 changes: 65 additions & 0 deletions audiobook/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from bs4 import BeautifulSoup
import re
import json
import PyPDF2
import ebooklib
from ebooklib import epub

regex = re.compile(r'[\n\r\t]')

from audiobook.article_web_scraper import ArticleWebScraper

def load_json(filename):
with open(filename, "r") as fp:
Expand All @@ -20,7 +24,59 @@ def text_preprocessing(input_text):
preprocessed_text = [re.sub(' +', ' ', t) for t in preprocessed_text]
return preprocessed_text

def pdf_to_json(self, input_book_path, password=None):
""" sub method to create json book from pdf file"""
json_book = {}
with open(input_book_path, "rb") as fp:
pdfReader = PyPDF2.PdfFileReader(fp)
if pdfReader.isEncrypted:
pdfReader.decrypt(password)
pages = pdfReader.numPages
for page_num in range(0, pages):
pageObj = pdfReader.getPage(page_num)
extracted_text = pageObj.extractText()
json_book[str(page_num)] = extracted_text
return json_book, pages

def txt_to_json(self, input_book_path):
""" sub method to create json book from txt file """
json_book = {}
with open(input_book_path, "r") as fp:
file_txt_data = fp.read()
for i in range(0, len(file_txt_data), 2000):
page_num = i // 2000
json_book[str(page_num)] = file_txt_data[i:i + 2000]
return json_book, len(json_book)

def mobi_to_json(self, input_book_path):
""" sub method to create json book from mobi file """
pass

def docs_to_json(self, input_book_path):
""" sub method to create json book from docs file """
pass

def epub_to_json(self, input_book_path):
json_book = {}
book = epub.read_epub(input_book_path)
text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)])
for i in range(1, len(text) + 1, 2000):
page_num = i // 2000
json_book[str(page_num)] = text[i:i + 2000]

return json_book, len(json_book)

def html_to_json(self, url):
""" method to create json book from web article """
json_book = {}
article_scraper = ArticleWebScraper(url)
page_data = article_scraper.get_page_data()
for i in range(0, len(page_data), 2000):
page_num = i // 2000
json_book[str(page_num)] = page_data[i:i + 2000]

return json_book, len(json_book)

def response_to_text(chapter):
""" fuction to convert response to text
Expand All @@ -40,3 +96,12 @@ def speak_text(engine, text, display=True):
print(text)
engine.say(text)
engine.runAndWait()


# def file_check(self, input_book_path):
# """ checks file format and if file exists """
# if not os.path.exists(input_book_path):
# raise FileNotFoundError("File not found!")

# if not input_book_path.endswith(supported_file_types):
# raise IsADirectoryError("File format not supported!")

0 comments on commit c239886

Please sign in to comment.