Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support: mobi file format support added #27

Merged
merged 2 commits into from
Oct 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions audiobook/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ def create_json_book(self, input_book_path, password=None):
json_book, pages = txt_to_json(input_book_path)
elif input_book_path.endswith(".epub"):
json_book, pages = epub_to_json(input_book_path)
elif input_book_path.endswith(".mobi"):
json_book, pages = mobi_to_json(input_book_path)
elif input_book_path.startswith("http"):
json_book, pages = html_to_json(input_book_path)

Expand Down
32 changes: 24 additions & 8 deletions audiobook/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import PyPDF2
import ebooklib
from ebooklib import epub
import mobi

import html2text
regex = re.compile(r'[\n\r\t]')

from audiobook.article_web_scraper import ArticleWebScraper
Expand All @@ -18,13 +20,12 @@ def write_json_file(json_data, filename):
with open(filename, "w") as fp:
json.dump(json_data, fp)


def text_preprocessing(input_text):
    """Clean every string in ``input_text`` and return the results as a list.

    For each item, characters matched by the module-level ``regex`` pattern
    are deleted, and then every run of consecutive spaces is collapsed into
    a single space. The input iterable itself is not modified.
    """
    cleaned = []
    for chunk in input_text:
        without_controls = regex.sub("", chunk)
        cleaned.append(re.sub(' +', ' ', without_controls))
    return cleaned

def pdf_to_json(self, input_book_path, password=None):
def pdf_to_json(input_book_path, password=None):
""" sub method to create json book from pdf file"""
json_book = {}
with open(input_book_path, "rb") as fp:
Expand All @@ -38,7 +39,7 @@ def pdf_to_json(self, input_book_path, password=None):
json_book[str(page_num)] = extracted_text
return json_book, pages

def txt_to_json(self, input_book_path):
def txt_to_json(input_book_path):
""" sub method to create json book from txt file """
json_book = {}
with open(input_book_path, "r") as fp:
Expand All @@ -48,15 +49,15 @@ def txt_to_json(self, input_book_path):
json_book[str(page_num)] = file_txt_data[i:i + 2000]
return json_book, len(json_book)

def mobi_to_json(self, input_book_path):
def mobi_to_json(input_book_path):
    """Placeholder for building a json book from a mobi file.

    Performs no work and returns ``None``.

    NOTE(review): another ``mobi_to_json`` is defined further down in this
    module; if both remain, the later one rebinds the name and this
    placeholder is never called.
    """
    return None

def docs_to_json(self, input_book_path):
def docs_to_json(input_book_path):
    """Create a json book from a docs file (not implemented yet).

    The sibling converters in this module return a ``(json_book, pages)``
    tuple. Silently returning ``None`` here would make a caller that unpacks
    the result fail with an unrelated ``TypeError``, so the missing support
    is reported explicitly instead.

    Args:
        input_book_path: Path of the document that was requested.

    Raises:
        NotImplementedError: Always, until docs conversion is implemented.
    """
    raise NotImplementedError(
        "docs_to_json is not implemented yet; cannot convert "
        f"{input_book_path!r}"
    )

def epub_to_json(self, input_book_path):
def epub_to_json(input_book_path):
json_book = {}
book = epub.read_epub(input_book_path)
text = " ".join([response_to_text(chapter.get_body_content()) for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)])
Expand All @@ -66,7 +67,7 @@ def epub_to_json(self, input_book_path):

return json_book, len(json_book)

def html_to_json(self, url):
def html_to_json(url):
""" method to create json book from web article """
json_book = {}
article_scraper = ArticleWebScraper(url)
Expand Down Expand Up @@ -98,10 +99,25 @@ def speak_text(engine, text, display=True):
engine.runAndWait()


def mobi_to_json(input_book_path):
    """Create a json book from a mobi file.

    The mobi file is unpacked with ``mobi.extract``, the extracted file is
    read as UTF-8 text and converted to plain text with ``html2text``, and
    the text is split into 2000-character pages.

    Args:
        input_book_path: Path to the ``.mobi`` file.

    Returns:
        A ``(json_book, pages)`` tuple. ``json_book`` maps the page number
        (as a string, starting at ``"0"``) to that page's text, and
        ``pages`` is the number of pages.
    """
    import shutil  # local import: only needed for the temp-dir cleanup below

    page_size = 2000

    tempdir, filepath = mobi.extract(input_book_path)
    try:
        # NOTE(review): this assumes the extracted file is HTML; the mobi
        # package may hand back an epub or pdf for some books -- confirm.
        with open(filepath, "r", encoding='utf-8') as fp:
            content = fp.read()
    finally:
        # mobi.extract leaves its unpack directory behind; remove it so that
        # each conversion does not leak a temporary directory, even when
        # reading the extracted file fails.
        shutil.rmtree(tempdir, ignore_errors=True)

    book_data = html2text.html2text(content)

    json_book = {
        str(start // page_size): book_data[start:start + page_size]
        for start in range(0, len(book_data), page_size)
    }
    return json_book, len(json_book)

#mobi_to_json(r"C:\Users\dr\Downloads\sample1.mobi")
# def file_check(self, input_book_path):
# """ checks file format and if file exists """
# if not os.path.exists(input_book_path):
# raise FileNotFoundError("File not found!")

# if not input_book_path.endswith(supported_file_types):
# raise IsADirectoryError("File format not supported!")
# raise IsADirectoryError("File format not supported!")