Skip to content

Commit

Permalink
Merge pull request #12 from LasseR15/feature/login-via-browser
Browse files Browse the repository at this point in the history
Add login via browser + smaller docker image
  • Loading branch information
LasseR15 authored Nov 30, 2024
2 parents 4883c47 + 982b8ed commit 504e2af
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 65 deletions.
20 changes: 7 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
FROM python:3-bookworm
# Fixed alpine version due to: https://github.com/ocrmypdf/OCRmyPDF/issues/1395
FROM python:3-alpine3.19

LABEL maintainer=LasseR15
LABEL email=lasse.roth@lasse-it.de


RUN apt update
RUN apt install -y tesseract-ocr tesseract-ocr-deu ghostscript


COPY /src /app/src
COPY /requirements.txt /app/requirements.txt
LABEL email=lasse.roth@nexy.dev

WORKDIR /app

RUN pip3 install -r requirements.txt
RUN apk add --no-cache tesseract-ocr tesseract-ocr-data-deu ghostscript

COPY /src ./src
COPY /requirements.txt ./requirements.txt

RUN python -m playwright install-deps chromium
RUN python -m playwright install chromium
RUN pip3 install -r requirements.txt

ENV PYTHONPATH=/app/src/
ENV BASE_OUTPUT_PATH=/app/output
Expand Down
18 changes: 6 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,35 +92,29 @@ There are two variants/tags available:
#### Docker Compose
To use the OCR version of the script with Docker Compose, run the following command:
```bash
docker compose run --rm -it bibox-to-pdf \
'{USERNAME}' '{PASSWORD}' {BOOK_ID}
docker compose run --rm -Pit bibox-to-pdf {BOOK_ID}
```
<!-- CURRENTLY NOT AVAILABLE
If you want to run the non-OCR version, run the following command.
You can also simply add `--no-ocr` before the book ID in the above command.
```bash
docker compose -f ./docker-compose.non-ocr.yml --rm -it run bibox-to-cli \
'{USERNAME}' '{PASSWORD}' {BOOK_ID}
docker compose -f ./docker-compose.non-ocr.yml --rm -Pit run bibox-to-cli {BOOK_ID}
```
-->
#### Docker CLI
To use the script with OCR via Docker, run the following command:
```bash
docker run --rm -it \
-v ./books:/app/output/books \
ghcr.io/lasser15/bibox-to-pdf:latest \
'{USERNAME}' '{PASSWORD}' {book_id}
docker run --rm -it -p 4200:4200 -v ./books:/app/output/books \
ghcr.io/lasser15/bibox-to-pdf:latest {book_id}
```
<!-- CURRENTLY NOT AVAILABLE
To use it without OCR, run the following command.
You can also simply add `--no-ocr` before the book ID in the above command.
```bash
docker run --rm -it \
-v ./books:/app/output/books \
ghcr.io/lasser15/bibox-to-pdf:latest-non-ocr \
'{USERNAME}' '{PASSWORD}' {book_id}
docker run --rm -it -p 4200:4200 -v ./books:/app/output/books \
ghcr.io/lasser15/bibox-to-pdf:latest-non-ocr {book_id}
```
-->

Expand Down
3 changes: 3 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
services:
bibox-to-pdf:
image: ghcr.io/lasser15/bibox-to-pdf:latest
restart: no
build:
context: .
volumes:
- ./books:/app/output/books
ports:
- '4200:4200'
7 changes: 4 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
typer~=0.12.5
playwright~=1.47.0
ocrmypdf~=16.5.0
typer~=0.14.0
ocrmypdf~=16.6.2
img2pdf~=0.5.1
requests~=2.32.3
fastapi~=0.115.5
uvicorn~=0.32.1
2 changes: 1 addition & 1 deletion src/bibox_to_pdf/bibox/BiboxImageDownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def get_bibox_images(access_token: str, book_id: int):

if response.status_code != 200:
print(f"Response code from server was not 200. "
f"Either the book id '{book_id}' doesn't exist or the login wasn't successful. "
f"Are you sure the book id '{book_id}' exists and you have access to it? Response code was {response.status_code}.\n"
f"Exiting!")
raise typer.Exit(1)

Expand Down
103 changes: 81 additions & 22 deletions src/bibox_to_pdf/bibox/BiboxLogin.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,94 @@
import typer
from playwright.sync_api import sync_playwright
from bibox_to_pdf.values.BiboxSelectors import BiboxSelectors
import asyncio
import base64
import hashlib
import secrets
import sys

import requests
import uvicorn
from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.responses import HTMLResponse
from rich import print as rprint

from bibox_to_pdf.values.Constants import Constants
from rich import print

login_endpoint_queue = asyncio.Queue()

app = FastAPI()
config = uvicorn.Config(app, host='0.0.0.0', port=4200, log_level="warning")
server = uvicorn.Server(config)

@app.get('/login', response_class=HTMLResponse)
async def login(req: Request, background_tasks: BackgroundTasks):
    """Handle the OAuth redirect and pass the authorization code on.

    Reads the ``code`` query parameter from the request. When it is present,
    a background task is registered that puts the code on
    ``login_endpoint_queue`` and a confirmation page is returned; otherwise
    an error page is returned and nothing is queued.
    """
    auth_code = req.query_params.get('code')
    if auth_code is not None:
        background_tasks.add_task(login_endpoint_queue.put, auth_code)
        return '<h1>You can close this window now</h1>'

    return '<h1>Error: Code is missing from request params</h1>'


async def start_webserver():
    """Serve the module-level uvicorn ``server``.

    Any ``Exception`` raised while serving is reported via ``rprint`` and
    the process is terminated with exit status 1.
    """
    try:
        await server.serve()
    except Exception as serve_error:
        rprint(f'Error starting webserver: {serve_error}')
        # Equivalent to sys.exit(1): both raise SystemExit with code 1.
        raise SystemExit(1)

def create_login_link():
    """Build the browser login URL for an authorization-code flow with PKCE.

    A fresh random code verifier is generated on every call; its SHA-256
    digest, base64url-encoded without padding, is sent as the S256 code
    challenge.

    Returns:
        A dict with the keys ``'redirect_uri'`` (callback URL the login page
        redirects to), ``'code_verifier'`` (secret needed later to redeem the
        authorization code) and ``'login_url'`` (URL the user has to open).
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import urlencode

    redirect_uri = 'http://localhost:4200/login'
    # token_urlsafe(96) yields more than 96 characters; cap the verifier at 96.
    code_verifier = secrets.token_urlsafe(96)[:96]

    digest = hashlib.sha256(code_verifier.encode('ascii')).digest()
    # Strip the '=' padding explicitly instead of slicing off a fixed number
    # of characters.
    code_challenge = base64.urlsafe_b64encode(digest).decode('ascii').rstrip('=')

    # urlencode percent-encodes the values; the redirect URI contains ':' and
    # '/' which must not be placed raw into a query string.
    query = urlencode({
        'client_id': Constants.biboxOauthClientId,
        'response_type': 'code',
        'scope': 'openid',
        'redirect_uri': redirect_uri,
        'code_challenge_method': 'S256',
        'code_challenge': code_challenge,
    })
    login_url = f'{Constants.biboxOauthLoginUrl}?{query}'

    return {
        'redirect_uri': redirect_uri,
        'code_verifier': code_verifier,
        'login_url': login_url,
    }

def get_access_token(code: str, code_verifier: str, redirect_uri: str) -> str:
    """Exchange an authorization code for an access token.

    Posts the code, the PKCE code verifier and the redirect URI as form data
    to ``Constants.biboxOauthTokenUrl``.

    Args:
        code: Authorization code received on the login callback.
        code_verifier: The verifier whose challenge was sent in the login URL.
        redirect_uri: The redirect URI that was used in the login URL.

    Returns:
        The value of ``access_token`` in the JSON response (``None`` if the
        response does not contain that key).

    Raises:
        Exception: If the token endpoint answers with a status other than
            200 or 201.
    """
    token_endpoint = Constants.biboxOauthTokenUrl

    token_result = requests.post(token_endpoint, data={
        'redirect_uri': redirect_uri,
        'code': code,
        'code_verifier': code_verifier,
    })

    # The previous check `status_code != 201 | 200` compared against the
    # bitwise OR of the two numbers (201), so a plain 200 response was
    # treated as an error. Test for membership instead.
    if token_result.status_code not in (200, 201):
        raise Exception(f'Error getting access token: {token_result.text}')

    return token_result.json().get('access_token')

browser = p.chromium.launch()
page = browser.new_page()

page.goto(Constants.biboxLoginUrl)
page.wait_for_selector(BiboxSelectors.loginBtn)
async def login_to_bibox() -> str:
    """Run the browser-based login and return the resulting access token.

    Starts the callback webserver, prints a login link, waits for the
    authorization code to arrive on ``login_endpoint_queue`` and exchanges it
    for an access token. If the exchange raises, the error is printed and a
    new login link is generated. On success the webserver is shut down and
    the token returned.
    """
    # The callback webserver runs as an asyncio task on the current event
    # loop (not in a separate thread).
    webserver_task = asyncio.create_task(start_webserver())

    while True:
        link = create_login_link()
        rprint('To log in to bibox open the following link in your browser: ')
        print(link["login_url"])

        # Blocks until the /login endpoint has queued an authorization code.
        auth_code = await login_endpoint_queue.get()

        try:
            token = get_access_token(auth_code, link['code_verifier'], link['redirect_uri'])
        except Exception as e:
            rprint(f'Error getting access token: {e}')
        else:
            rprint('Successfully logged in to bibox')

            await server.shutdown()
            webserver_task.cancel()

            return token
2 changes: 1 addition & 1 deletion src/bibox_to_pdf/pdf/PdfOcr.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def ocr_pdf(book_id: int, pdf_non_ocr_path: str):
pdf_output_dir = Constants.pdfOutputDir.format(book_id)
os.makedirs(pdf_output_dir, exist_ok=True)

print("Starting PDF ocr in German...")
print("Starting PDF ocr in German (if you need another language please open an issue on GitHub)...")

pdf_output_file = Constants.pdfOutputFile.format(book_id, 'ocr-version')

Expand Down
5 changes: 0 additions & 5 deletions src/bibox_to_pdf/values/BiboxSelectors.py

This file was deleted.

4 changes: 3 additions & 1 deletion src/bibox_to_pdf/values/Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@


class Constants:
biboxLoginUrl = 'https://bibox2.westermann.de'
biboxOauthLoginUrl = 'https://mein.westermann.de/auth/login'
biboxOauthTokenUrl = 'https://backend.bibox2.westermann.de/token'
biboxOauthClientId = 'Nvw0ZA8Z'
biboxBookInfoUrl = 'https://backend.bibox2.westermann.de/v1/api/sync/{}?materialtypes[]=default&materialtypes[]=addon'

baseOutputPath = os.getenv('BASE_OUTPUT_PATH', default='.')
Expand Down
13 changes: 6 additions & 7 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
import asyncio

import typer
from rich import print
from typing_extensions import Annotated

from bibox_to_pdf.bibox.BiboxImageDownloader import get_bibox_images, download_images_from_bibox
from bibox_to_pdf.bibox.BiboxLogin import login_to_bibox
from bibox_to_pdf.pdf.PdfCreator import create_pdf_from_images
from bibox_to_pdf.pdf.PdfOcr import ocr_pdf
from bibox_to_pdf.values.Constants import Constants
from rich import print


def main(
username: Annotated[str, typer.Argument()],
password: Annotated[str, typer.Argument()],
book_id: Annotated[int, typer.Argument()]):

def main(book_id: Annotated[int, typer.Argument()]):
book_dest_path = Constants.bookBaseOutputDir.format(book_id)
print(f"Downloading book with id '{book_id}' to '{book_dest_path}'...")

access_token = login_to_bibox(username, password)
access_token = asyncio.run(login_to_bibox())

bibox_images = get_bibox_images(access_token, book_id)
image_paths = download_images_from_bibox(bibox_images, book_id)
Expand Down

0 comments on commit 504e2af

Please sign in to comment.