From 64438510437da8711f0ea8877ab959b638368454 Mon Sep 17 00:00:00 2001 From: Lasse Roth Date: Sat, 30 Nov 2024 22:07:03 +0100 Subject: [PATCH 1/3] Add login via browser --- requirements.txt | 7 +- .../bibox/BiboxImageDownloader.py | 2 +- src/bibox_to_pdf/bibox/BiboxLogin.py | 103 ++++++++++++++---- src/bibox_to_pdf/pdf/PdfOcr.py | 2 +- src/bibox_to_pdf/values/BiboxSelectors.py | 5 - src/bibox_to_pdf/values/Constants.py | 4 +- src/main.py | 13 +-- 7 files changed, 96 insertions(+), 40 deletions(-) delete mode 100644 src/bibox_to_pdf/values/BiboxSelectors.py diff --git a/requirements.txt b/requirements.txt index 498baf2..538beb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -typer~=0.12.5 -playwright~=1.47.0 -ocrmypdf~=16.5.0 +typer~=0.14.0 +ocrmypdf~=16.6.2 img2pdf~=0.5.1 requests~=2.32.3 +fastapi~=0.115.5 +uvicorn~=0.32.1 diff --git a/src/bibox_to_pdf/bibox/BiboxImageDownloader.py b/src/bibox_to_pdf/bibox/BiboxImageDownloader.py index 79e4ae0..452b3cb 100644 --- a/src/bibox_to_pdf/bibox/BiboxImageDownloader.py +++ b/src/bibox_to_pdf/bibox/BiboxImageDownloader.py @@ -14,7 +14,7 @@ def get_bibox_images(access_token: str, book_id: int): if response.status_code != 200: print(f"Response code from server was not 200. " - f"Either the book id '{book_id}' doesn't exist or the login wasn't successful. " + f"Are you sure the book id '{book_id}' exists and you have access to it? 
Response code was {response.status_code}.\n" f"Exiting!") raise typer.Exit(1) diff --git a/src/bibox_to_pdf/bibox/BiboxLogin.py b/src/bibox_to_pdf/bibox/BiboxLogin.py index 9d5d91e..e9b5a11 100644 --- a/src/bibox_to_pdf/bibox/BiboxLogin.py +++ b/src/bibox_to_pdf/bibox/BiboxLogin.py @@ -1,35 +1,94 @@ -import typer -from playwright.sync_api import sync_playwright -from bibox_to_pdf.values.BiboxSelectors import BiboxSelectors +import asyncio +import base64 +import hashlib +import secrets +import sys + +import requests +import uvicorn +from fastapi import FastAPI, Request, BackgroundTasks +from fastapi.responses import HTMLResponse +from rich import print as rprint + from bibox_to_pdf.values.Constants import Constants -from rich import print +login_endpoint_queue = asyncio.Queue() + +app = FastAPI() +config = uvicorn.Config(app, host='0.0.0.0', port=4200, log_level="warning") +server = uvicorn.Server(config) + +@app.get('/login', response_class=HTMLResponse) +async def login(req: Request, background_tasks: BackgroundTasks): + code = req.query_params.get('code') + if code is None: + return '

Error: Code is missing from request params

' + + background_tasks.add_task(login_endpoint_queue.put, code) + + return '

You can close this window now

' + + +async def start_webserver(): + try: + await server.serve() + except Exception as e: + rprint(f'Error starting webserver: {e}') + sys.exit(1) + +def create_login_link(): + login_url = Constants.biboxOauthLoginUrl + client_id = Constants.biboxOauthClientId + redirect_uri = 'http://localhost:4200/login' + code_verifier = secrets.token_urlsafe(96)[:96] + + code_verifier_hashed = hashlib.sha256(code_verifier.encode('ascii')).digest() + code_verifier_encoded = base64.urlsafe_b64encode(code_verifier_hashed) + code_challenge = code_verifier_encoded.decode('ascii').rstrip('=') + + login_url = login_url + f'?client_id={client_id}&response_type=code&scope=openid&redirect_uri={redirect_uri}&code_challenge_method=S256&code_challenge={code_challenge}' + + return { + 'redirect_uri': redirect_uri, + 'code_verifier': code_verifier, + 'login_url': login_url, + } + +def get_access_token(code: str, code_verifier: str, redirect_uri: str) -> str: + token_endpoint = Constants.biboxOauthTokenUrl + + token_result = requests.post(token_endpoint, data={ + 'redirect_uri': redirect_uri, + 'code': code, + 'code_verifier': code_verifier, + }) + + if token_result.status_code not in (200, 201): + raise Exception(f'Error getting access token: {token_result.text}') -def login_to_bibox(username: str, password: str) -> str: - with sync_playwright() as p: - print(f"Logging in to BiBox with user '{username}'") + return token_result.json().get('access_token') - browser = p.chromium.launch() - page = browser.new_page() - page.goto(Constants.biboxLoginUrl) - page.wait_for_selector(BiboxSelectors.loginBtn) +async def login_to_bibox() -> str: + # Run the webserver that handles the login callback as a task on the current event loop + webserver_task = asyncio.create_task(start_webserver()) - page.type(BiboxSelectors.loginUsernameField, username) - page.type(BiboxSelectors.loginPasswordField, password) + while True: + login_result = create_login_link() + rprint('To log in to bibox open the following link in your browser: 
') + print(login_result["login_url"]) - with page.expect_navigation(): - page.click(BiboxSelectors.loginBtn) + code_result = await login_endpoint_queue.get() try: - page.wait_for_selector(BiboxSelectors.logoutBtn, timeout=10000) - except: - print('Login credentials incorrect or a network error occurred.') - raise typer.Exit(1) + access_token = get_access_token(code_result, login_result['code_verifier'], login_result['redirect_uri']) + except Exception as e: + rprint(f'Error getting access token: {e}') + continue - access_token = page.evaluate('() => window.localStorage.getItem("oauth.accessToken")') + rprint('Successfully logged in to bibox') - page.close() - browser.close() + await server.shutdown() + webserver_task.cancel() return access_token diff --git a/src/bibox_to_pdf/pdf/PdfOcr.py b/src/bibox_to_pdf/pdf/PdfOcr.py index 684daa5..e98b360 100644 --- a/src/bibox_to_pdf/pdf/PdfOcr.py +++ b/src/bibox_to_pdf/pdf/PdfOcr.py @@ -8,7 +8,7 @@ def ocr_pdf(book_id: int, pdf_non_ocr_path: str): pdf_output_dir = Constants.pdfOutputDir.format(book_id) os.makedirs(pdf_output_dir, exist_ok=True) - print("Starting PDF ocr in German...") + print("Starting PDF ocr in German (if you need another language please open an issue on GitHub)...") pdf_output_file = Constants.pdfOutputFile.format(book_id, 'ocr-version') diff --git a/src/bibox_to_pdf/values/BiboxSelectors.py b/src/bibox_to_pdf/values/BiboxSelectors.py deleted file mode 100644 index 2c78e34..0000000 --- a/src/bibox_to_pdf/values/BiboxSelectors.py +++ /dev/null @@ -1,5 +0,0 @@ -class BiboxSelectors: - loginUsernameField = '#account' - loginPasswordField = '#password' - loginBtn = '#form_login > div > div:nth-child(4) > button' - logoutBtn = '#bbx > app-root > app-shelf > div.header > div > div.right-side > div > button' diff --git a/src/bibox_to_pdf/values/Constants.py b/src/bibox_to_pdf/values/Constants.py index 5d63dfb..33adc22 100644 --- a/src/bibox_to_pdf/values/Constants.py +++ b/src/bibox_to_pdf/values/Constants.py @@ 
-2,7 +2,9 @@ class Constants: - biboxLoginUrl = 'https://bibox2.westermann.de' + biboxOauthLoginUrl = 'https://mein.westermann.de/auth/login' + biboxOauthTokenUrl = 'https://backend.bibox2.westermann.de/token' + biboxOauthClientId = 'Nvw0ZA8Z' biboxBookInfoUrl = 'https://backend.bibox2.westermann.de/v1/api/sync/{}?materialtypes[]=default&materialtypes[]=addon' baseOutputPath = os.getenv('BASE_OUTPUT_PATH', default='.') diff --git a/src/main.py b/src/main.py index 6dd5c5e..81ad30d 100644 --- a/src/main.py +++ b/src/main.py @@ -1,22 +1,21 @@ +import asyncio + import typer +from rich import print from typing_extensions import Annotated + from bibox_to_pdf.bibox.BiboxImageDownloader import get_bibox_images, download_images_from_bibox from bibox_to_pdf.bibox.BiboxLogin import login_to_bibox from bibox_to_pdf.pdf.PdfCreator import create_pdf_from_images from bibox_to_pdf.pdf.PdfOcr import ocr_pdf from bibox_to_pdf.values.Constants import Constants -from rich import print - -def main( - username: Annotated[str, typer.Argument()], - password: Annotated[str, typer.Argument()], - book_id: Annotated[int, typer.Argument()]): +def main(book_id: Annotated[int, typer.Argument()]): book_dest_path = Constants.bookBaseOutputDir.format(book_id) print(f"Downloading book with id '{book_id}' to '{book_dest_path}'...") - access_token = login_to_bibox(username, password) + access_token = asyncio.run(login_to_bibox()) bibox_images = get_bibox_images(access_token, book_id) image_paths = download_images_from_bibox(bibox_images, book_id) From e4ac8924262d7c5ffd418d8eaa90a57fc2d3a075 Mon Sep 17 00:00:00 2001 From: Lasse Roth Date: Sat, 30 Nov 2024 22:18:42 +0100 Subject: [PATCH 2/3] Update docker related files to reflect changes --- Dockerfile | 20 +++++++------------- docker-compose.yml | 3 +++ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index e00af53..a9a348c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,17 @@ -FROM python:3-bookworm +# 
Fixed alpine version due to: https://github.com/ocrmypdf/OCRmyPDF/issues/1395 +FROM python:3-alpine3.19 LABEL maintainer=LasseR15 -LABEL email=lasse.roth@lasse-it.de - - -RUN apt update -RUN apt install -y tesseract-ocr tesseract-ocr-deu ghostscript - - -COPY /src /app/src -COPY /requirements.txt /app/requirements.txt +LABEL email=lasse.roth@nexy.dev WORKDIR /app -RUN pip3 install -r requirements.txt +RUN apk add --no-cache tesseract-ocr tesseract-ocr-data-deu ghostscript +COPY /src ./src +COPY /requirements.txt ./requirements.txt -RUN python -m playwright install-deps chromium -RUN python -m playwright install chromium +RUN pip3 install -r requirements.txt ENV PYTHONPATH=/app/src/ ENV BASE_OUTPUT_PATH=/app/output diff --git a/docker-compose.yml b/docker-compose.yml index 4d407cc..cc09da7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,10 @@ services: bibox-to-pdf: image: ghcr.io/lasser15/bibox-to-pdf:latest + restart: 'no' build: context: . volumes: - ./books:/app/output/books + ports: + - '4200:4200' From 982b8eda5c39c4c7539a4aaad9e462f1f32c2b2f Mon Sep 17 00:00:00 2001 From: Lasse Roth Date: Sat, 30 Nov 2024 22:19:01 +0100 Subject: [PATCH 3/3] Update README.md to reflect changes --- README.md | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 178d8b6..9a4c258 100644 --- a/README.md +++ b/README.md @@ -92,35 +92,29 @@ There are two variants/tags available: #### Docker Compose To use the ocr version of the script with Docker Compose run the following command: ```bash -docker compose run --rm -it bibox-to-pdf \ - '{USERNAME}' '{PASSWORD}' {BOOK_ID} +docker compose run --rm -Pit bibox-to-pdf {BOOK_ID} ``` #### Docker CLI To use the script with ocr via Docker run the following command: ```bash -docker run --rm -it -p 4200:4200 -v ./books:/app/output/books 
\ + ghcr.io/lasser15/bibox-to-pdf:latest {book_id} ```