Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: metadata information from pdf #37

Merged
merged 5 commits into from
Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
</p>
<p align="center">
<a href="https://discord.gg/JfbK3bS"><img src="https://img.shields.io/discord/758030555005714512.svg?label=Discord&logo=Discord&colorB=7289da&style=for-the-badge" alt="discord invite"></a>
<img src="https://img.shields.io/github/pipenv/locked/dependency-version/py-contributors/audiobook/pyttsx3?style=for-the-badge" alt="pyttsx3">
<a href="https://api.github.com/repos/py-contributors/audiobook/contributors"><img src="https://img.shields.io/github/contributors/py-contributors/audiobook?style=for-the-badge" alt="total contributors"></a>
</p>

Expand Down
2 changes: 1 addition & 1 deletion audiobook/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from audiobook.main import AudioBook
from audiobook.main import AudioBook
66 changes: 33 additions & 33 deletions audiobook/main.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,25 @@

import os
from tqdm import tqdm
import pyttsx3
import logging
logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.INFO)
from tqdm import tqdm

from audiobook.utils import response_to_text
from audiobook.utils import speak_text
from audiobook.utils import text_preprocessing
from audiobook.utils import load_json
from audiobook.utils import write_json_file

from audiobook.utils import pdf_to_json
from audiobook.utils import txt_to_json
from audiobook.utils import mobi_to_json
from audiobook.utils import docs_to_json
from audiobook.utils import epub_to_json
from audiobook.utils import html_to_json


from audiobook.config import speed_dict
from audiobook.config import supported_file_types

logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.INFO)

expand_usr = os.path.expanduser("~")
BOOK_DIR = os.path.join(expand_usr, "audiobook/library")
Expand All @@ -44,7 +41,7 @@ class AudioBook:
save_json_to_audio: save .mp3 audios from a json file in a folder
save_book_audio: saves audio files in folder
read_book: reads the book

sample usage:
ab = AudioBook(speed="normal")
ab.read_book(file_path, password="abcd")
Expand All @@ -62,63 +59,66 @@ def get_library(self):
return "You have no books in your library"
print("You Have total {} books in your library".format(len(total_books)))
return total_books

def create_json_book(self, input_book_path, password=None):
    """Create (or load) the JSON representation of a book.

    The converter is chosen from the file extension, or from an
    ``http`` prefix for web articles. A book already stored in
    ``BOOK_DIR`` is loaded from there instead of being converted again.

    Args:
        input_book_path: file path or URL of the book.
        password: password forwarded to ``pdf_to_json`` for PDF input.

    Returns:
        Tuple ``(json_book, metadata)``. ``metadata`` always contains the
        keys ``"pages"`` and ``"book_name"``.

    Raises:
        ValueError: if the input is not one of the handled formats.
    """
    book_name = os.path.basename(input_book_path).split(".")[0]
    json_path = os.path.join(BOOK_DIR, book_name + ".json")

    if os.path.exists(json_path):
        print("Book already exists in library, reading from library")
        json_book = load_json(json_path)
        # save_audio() reads metadata['book_name'], so the cached path must
        # provide it just like the converters do.
        metadata = {"pages": len(json_book), "book_name": book_name}
        return json_book, metadata

    # Extension checks come before the URL check, as in the original order.
    if input_book_path.endswith(".pdf"):
        json_book, metadata = pdf_to_json(input_book_path, password)
    elif input_book_path.endswith(".txt"):
        json_book, metadata = txt_to_json(input_book_path)
    elif input_book_path.endswith(".epub"):
        json_book, metadata = epub_to_json(input_book_path)
    elif input_book_path.endswith(".mobi"):
        json_book, metadata = mobi_to_json(input_book_path)
    elif input_book_path.startswith("http"):
        json_book, metadata = html_to_json(input_book_path)
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(
            "Unsupported book format: {!r}".format(input_book_path))

    write_json_file(json_book, json_path)

    return json_book, metadata

def save_audio(self, input_book_path, password=None, save_page_wise=False):
    """Save the book as .mp3 audio inside a folder named after the book.

    Args:
        input_book_path: file path or URL of the book.
        password: password forwarded to ``create_json_book``.
        save_page_wise: when true, write one ``<book>_page_<n>.mp3`` per
            page; otherwise write a single ``<book>.mp3`` with all pages
            joined by spaces.
    """
    json_book, metadata = self.create_json_book(input_book_path, password)

    # Fall back to the file name when the metadata carries no book name
    # (e.g. a book that was loaded from the library cache).
    book_name = (metadata.get('book_name')
                 or os.path.basename(input_book_path).split(".")[0])
    os.makedirs(book_name, exist_ok=True)

    print('Saving audio files in folder: {}'.format(book_name))

    if save_page_wise:
        for page_num, text in tqdm(json_book.items()):
            page_file = os.path.join(
                book_name, book_name + "_page_" + str(page_num) + ".mp3")
            self.engine.save_to_file(text, page_file)
            # NOTE(review): assumed to run once per page; the flattened
            # source does not show whether this call sat inside the loop.
            self.engine.runAndWait()
    else:
        all_text = " ".join(json_book.values())
        self.engine.save_to_file(
            all_text, os.path.join(book_name, book_name + ".mp3"))
        self.engine.runAndWait()

def read_book(self, input_book_path, password=None):
""" method to read the book
""" method to read the book

input_book_path: filepath, url path or book name
"""
json_book, pages = self.create_json_book(input_book_path, password)

json_book, metadata = self.create_json_book(input_book_path, password)

pages = metadata["pages"]

speak_text(self.engine, f"The book has total {str(pages)} pages!")
speak_text(self.engine, "Please enter the page number: ", display=False)
start_page = int(input("Please enter the page number: ")) - 1
Expand Down
118 changes: 76 additions & 42 deletions audiobook/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from bs4 import BeautifulSoup
import re
import os
import mobi
import json
import PyPDF2
import ebooklib
from ebooklib import epub
import mobi
from bs4 import BeautifulSoup


import html2text
regex = re.compile(r'[\n\r\t]')
Expand All @@ -21,94 +23,126 @@ def write_json_file(json_data, filename):

def text_preprocessing(input_text):
    """Remove line-control characters and collapse repeated spaces.

    Newlines, carriage returns and tabs are deleted (not replaced) via the
    module-level ``regex`` pattern, then every run of spaces is squeezed
    into a single space.

    Args:
        input_text: raw text extracted from a book or article.

    Returns:
        The cleaned text.
    """
    # A single pass suffices; the substitution line was duplicated.
    preprocessed_text = regex.sub("", input_text)
    return re.sub(' +', ' ', preprocessed_text)


def response_to_text(chapter):
    """Convert an HTML response/chapter to preprocessed plain text.

    Required for epub files; may also be required for html files.
    Only the text of ``<p>`` elements is kept; paragraphs are joined with
    a single space before preprocessing.

    Args:
        chapter: HTML markup of one chapter.

    Returns:
        The extracted text after ``text_preprocessing``.
    """
    soup = BeautifulSoup(chapter, 'html.parser')
    extracted_text = ' '.join(para.get_text() for para in soup.find_all('p'))
    return text_preprocessing(extracted_text)

def speak_text(engine, text, display=True):
    """Speak ``text`` through ``engine`` and optionally print it first.

    Args:
        engine: object exposing ``say(text)`` and ``runAndWait()``.
        text: the sentence to speak.
        display: when true, also print ``text`` to stdout.
    """
    if display:
        print(text)
    engine.say(text)
    engine.runAndWait()

def mobi_to_json(input_book_path):
    """Create a json book from a mobi file.

    The extracted HTML is converted to text, preprocessed, and cut into
    pages of 2000 characters keyed by the page number as a string.

    Args:
        input_book_path: path of the .mobi file.

    Returns:
        Tuple ``(json_book, metadata)`` where ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    import shutil  # local import: only needed for temp-dir cleanup

    book_name = os.path.basename(input_book_path).split(".")[0]
    tempdir, filepath = mobi.extract(input_book_path)
    try:
        with open(filepath, "r", encoding='utf-8') as fp:
            content = fp.read()
    finally:
        # mobi.extract unpacks into a temporary directory that is never
        # removed otherwise.
        shutil.rmtree(tempdir, ignore_errors=True)

    book_data = text_preprocessing(html2text.html2text(content))

    json_book = {}
    for i in range(0, len(book_data), 2000):
        json_book[str(i // 2000)] = book_data[i:i + 2000]

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata

def pdf_to_json(input_book_path, password=None):
    """Create a json book from a pdf file.

    Each PDF page becomes one entry keyed by its zero-based page number as
    a string.

    Args:
        input_book_path: path of the .pdf file.
        password: password used to decrypt the file when it is encrypted.

    Returns:
        Tuple ``(json_book, metadata)``. ``metadata`` holds ``"author"``,
        ``"creator"``, ``"producer"``, ``"subject"``, ``"title"``,
        ``"pages"`` and ``"book_name"``; the document-info fields are
        ``None`` when the PDF provides no document information.
    """
    json_book = {}
    book_name = os.path.basename(input_book_path).split(".")[0]
    with open(input_book_path, "rb") as fp:
        pdfReader = PyPDF2.PdfFileReader(fp)
        if pdfReader.isEncrypted:
            pdfReader.decrypt(password)

        # getDocumentInfo() may return None; getattr with a default keeps
        # the metadata keys present instead of raising AttributeError.
        information = pdfReader.getDocumentInfo()
        metadata = {
            field: getattr(information, field, None)
            for field in ("author", "creator", "producer", "subject", "title")
        }

        pages = pdfReader.numPages
        metadata["pages"] = pages
        metadata["book_name"] = book_name

        # Pages are read while the file is still open.
        for page_num in range(pages):
            pageObj = pdfReader.getPage(page_num)
            json_book[str(page_num)] = pageObj.extractText()

    return json_book, metadata

def txt_to_json(input_book_path):
    """Create a json book from a txt file.

    The file content is preprocessed and cut into pages of 2000 characters
    keyed by the page number as a string.

    Args:
        input_book_path: path of the .txt file.

    Returns:
        Tuple ``(json_book, metadata)`` where ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    book_name = os.path.basename(input_book_path).split(".")[0]
    with open(input_book_path, "r") as fp:
        file_txt_data = text_preprocessing(fp.read())

    json_book = {}
    for i in range(0, len(file_txt_data), 2000):
        json_book[str(i // 2000)] = file_txt_data[i:i + 2000]

    # The stale ``return json_book, len(json_book)`` used to sit here and
    # made the metadata return unreachable.
    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


def docs_to_json(input_book_path):
    """Create a json book from a docs file.

    Not implemented yet: the function is a placeholder and returns None.
    """
    return None

def epub_to_json(input_book_path):
    """Create a json book from an epub file.

    The text of every document item is extracted with ``response_to_text``,
    joined with spaces, and cut into pages of 2000 characters keyed by the
    page number as a string.

    Args:
        input_book_path: path of the .epub file.

    Returns:
        Tuple ``(json_book, metadata)`` where ``metadata`` holds
        ``"pages"`` and ``"book_name"``.
    """
    book_name = os.path.basename(input_book_path).split(".")[0]
    book = epub.read_epub(input_book_path)
    text = " ".join(
        response_to_text(chapter.get_body_content())
        for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    )

    json_book = {}
    # Start at 0: starting at 1 dropped the first character of the book and
    # could produce an empty trailing page.
    for i in range(0, len(text), 2000):
        json_book[str(i // 2000)] = text[i:i + 2000]

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata

def html_to_json(url):
    """Create a json book from a web article.

    The page data returned by ``ArticleWebScraper`` is preprocessed and cut
    into pages of 2000 characters keyed by the page number as a string.

    Args:
        url: address of the article.

    Returns:
        Tuple ``(json_book, metadata)`` where ``metadata`` holds
        ``"pages"`` and ``"book_name"`` (the last URL path segment up to
        its first dot).
    """
    book_name = os.path.basename(url).split(".")[0]
    # NOTE(review): ArticleWebScraper is not defined or imported in the
    # visible part of this file - confirm where it comes from.
    article_scraper = ArticleWebScraper(url)
    page_data = text_preprocessing(article_scraper.get_page_data())

    json_book = {}
    for i in range(0, len(page_data), 2000):
        json_book[str(i // 2000)] = page_data[i:i + 2000]

    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata


def response_to_text(chapter):
    """Convert an HTML response/chapter to preprocessed plain text.

    Required for epub files; may also be required for html files.
    Only the text of ``<p>`` elements is kept.
    """
    soup = BeautifulSoup(chapter, 'html.parser')
    extracted_text = ' '.join(para.get_text() for para in soup.find_all('p'))
    return text_preprocessing(extracted_text)

def speak_text(engine, text, display=True):
    """Speak ``text`` through ``engine``; print it first when ``display``.

    Args:
        engine: object exposing ``say(text)`` and ``runAndWait()``.
        text: the sentence to speak.
        display: when true, also print ``text`` to stdout.
    """
    if display:
        print(text)
    engine.say(text)
    engine.runAndWait()


def mobi_to_json(input_book_path):
    """Create a json book from a mobi file.

    The extracted HTML is converted to text, preprocessed, and cut into
    pages of 2000 characters keyed by the page number as a string.

    Args:
        input_book_path: path of the .mobi file.

    Returns:
        Tuple ``(json_book, metadata)`` where ``metadata`` holds
        ``"pages"`` and ``"book_name"``, matching the other ``*_to_json``
        converters in this module.
    """
    import shutil  # local import: only needed for temp-dir cleanup

    book_name = os.path.basename(input_book_path).split(".")[0]
    tempdir, filepath = mobi.extract(input_book_path)
    try:
        with open(filepath, "r", encoding='utf-8') as fp:
            content = fp.read()
    finally:
        # mobi.extract unpacks into a temporary directory that is never
        # removed otherwise.
        shutil.rmtree(tempdir, ignore_errors=True)

    book_data = text_preprocessing(html2text.html2text(content))

    json_book = {}
    for i in range(0, len(book_data), 2000):
        json_book[str(i // 2000)] = book_data[i:i + 2000]

    # Return a metadata dict, not a bare page count: callers index
    # metadata["pages"] and metadata['book_name'].
    metadata = {"pages": len(json_book), "book_name": book_name}
    return json_book, metadata