diff --git a/Dockerfile b/Dockerfile index 49cc49e..92163a0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,4 +14,4 @@ RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt COPY ./app /app/app # -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]j \ No newline at end of file +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"] \ No newline at end of file diff --git a/app/main.py b/app/main.py index 6b5025a..61df815 100644 --- a/app/main.py +++ b/app/main.py @@ -2,6 +2,7 @@ from contextlib import asynccontextmanager from io import StringIO import os +import time import aiohttp.client_exceptions from bs4 import BeautifulSoup, ResultSet import uvicorn @@ -33,6 +34,7 @@ from pydantic import BaseModel import pandas as pd from datetime import datetime +import logfire from app.models import ( ContextRow, @@ -45,12 +47,23 @@ TopPaper, ) +import dotenv + +dotenv.load_dotenv() + + +logfire.configure(token=os.getenv("LOGFIRE_TOKEN")) + MAX_PAPERS_TO_SEARCH = 600 gallica_session: aiohttp.ClientSession +# todo +# setup logfire +# fix multi-term search for gallicagram + @asynccontextmanager async def gallica_session_lifespan(app: FastAPI): @@ -73,7 +86,7 @@ def session(): allow_methods=["*"], allow_headers=["*"], ) - +logfire.instrument_fastapi(app) # limit number of requests for routes... top_paper is more intensive @@ -122,7 +135,7 @@ async def full_text(ark: str, page: Optional[int] = None) -> List[ConvertedXMLPa ] if page_data and len(page_data) > 0: return page_data - return None + return [] except aiohttp.client_exceptions.ClientConnectorError: raise HTTPException(status_code=503, detail="Could not connect to Gallica.") @@ -757,7 +770,7 @@ def get_unix_timestamp(row) -> int: month = int(row.get("mois", 1)) dt = datetime(year, month, 1) - return dt.timestamp() * 1000 + return int(dt.timestamp() * 1000) data = series_dataframe.apply( lambda row: (get_unix_timestamp(row), row["ratio"]), axis=1 @@ -773,7 +786,10 @@ def get_unix_timestamp(row) -> int: async def fetch_series_dataframe(url: str, params: Dict): async with aiohttp.ClientSession() as session: + start = time.time() async with session.get(url, params=params) as response: + print(f"Fetched {response.url}") + print(f"Took {time.time() - start} seconds") if response.status != 200: raise HTTPException( status_code=503, detail="Could not connect to Gallicagram! Egads!" diff --git a/fly.toml b/fly.toml index 56bcc80..8ef8ec7 100644 --- a/fly.toml +++ b/fly.toml @@ -1,16 +1,12 @@ -# fly.toml app configuration file generated for gallica-getter on 2024-02-09T20:21:59+01:00 +# fly.toml app configuration file generated for gallica-getter-little-snow-3158 on 2024-03-01T13:00:50+01:00 # # See https://fly.io/docs/reference/configuration/ for information about how to use this file. # -app = 'gallica-getter' +app = 'gallica-getter-little-snow-3158' primary_region = 'cdg' [build] - builder = 'paketobuildpacks/builder:base' - -[env] - PORT = '8080' [http_service] internal_port = 8080 @@ -21,6 +17,6 @@ primary_region = 'cdg' processes = ['app'] [[vm]] + memory = '1gb' cpu_kind = 'shared' cpus = 1 - memory_mb = 1024 diff --git a/gallica-getter-bruno/bruno.json b/gallica-getter-bruno/bruno.json new file mode 100644 index 0000000..546ee4a --- /dev/null +++ b/gallica-getter-bruno/bruno.json @@ -0,0 +1,9 @@ +{ + "version": "1", + "name": "gallica-getter", + "type": "collection", + "ignore": [ + "node_modules", + ".git" + ] +} \ No newline at end of file diff --git a/gallica-getter-bruno/collection.bru b/gallica-getter-bruno/collection.bru new file mode 100644 index 0000000..7b25838 --- /dev/null +++ b/gallica-getter-bruno/collection.bru @@ -0,0 +1,3 @@ +vars:pre-request { + baseUrl: https://gallica-getter-little-snow-3158.fly.dev +} diff --git a/gallica-getter-bruno/direct-gallica.bru b/gallica-getter-bruno/direct-gallica.bru new file mode 100644 index 0000000..54bbd60 --- /dev/null +++ b/gallica-getter-bruno/direct-gallica.bru @@ -0,0 +1,21 @@ +meta { + name: direct-gallica + type: http + seq: 5 +} + +get { + url: https://gallica.bnf.fr/SRU?operation=searchRetrieve&exactSearch=True&version=1.2&startRecord=0&maximumRecords=10&collapsing=false&query=%28text+adj+%22brazza%22%29+and+dc.type+all+%22fascicule%22+or+dc.type+all+%22monographie%22 + body: none + auth: none +} + +params:query { + operation: searchRetrieve + exactSearch: True + version: 1.2 + startRecord: 0 + maximumRecords: 10 + collapsing: false + query: (text adj "brazza") and dc.type all "fascicule" or dc.type all "monographie" +} diff --git a/gallica-getter-bruno/gallicaRecords.bru b/gallica-getter-bruno/gallicaRecords.bru new file mode 100644 index 0000000..c33afe5 --- /dev/null +++ b/gallica-getter-bruno/gallicaRecords.bru @@ -0,0 +1,15 @@ +meta { + name: gallicaRecords + type: http + seq: 4 +} + +get { + url: {{baseUrl}}/api/gallicaRecords?terms=brazza + body: none + auth: none +} + +params:query { + terms: brazza +} diff --git a/gallica-getter-bruno/gallica_image.bru b/gallica-getter-bruno/gallica_image.bru new file mode 100644 index 0000000..4c6a81d --- /dev/null +++ b/gallica-getter-bruno/gallica_image.bru @@ -0,0 +1,26 @@ +meta { + name: gallica_image + type: http + seq: 3 +} + +post { + url: https://rapportgallica.bnf.fr/api/snippet + body: json + auth: none +} + +headers { + Content-Type: application/json +} + +body:json { + { + "ark": "bpt6k58069086", + "pages": [ + "1" + ], + "isPeriodique": true, + "query": "(gallica any \"brazza\")" + } +} diff --git a/gallica-getter-bruno/image.bru b/gallica-getter-bruno/image.bru new file mode 100644 index 0000000..f86df07 --- /dev/null +++ b/gallica-getter-bruno/image.bru @@ -0,0 +1,17 @@ +meta { + name: image + type: http + seq: 2 +} + +get { + url: {{baseUrl}}/api/image?ark=bpt6k93804234&term=brazza&page=15 + body: none + auth: none +} + +params:query { + ark: bpt6k93804234 + term: brazza + page: 15 +} diff --git a/requirements.txt b/requirements.txt index a6c632d..317ff00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,35 +1,56 @@ aiohttp==3.8.4 aiosignal==1.3.1 anyio==3.6.2 +asgiref==3.8.1 async-timeout==4.0.2 attrs==23.1.0 beautifulsoup4==4.12.2 +certifi==2024.8.30 charset-normalizer==3.1.0 ciso8601==2.2.0 click==8.1.3 +Deprecated==1.2.14 exceptiongroup==1.1.1 +executing==2.1.0 fastapi==0.100.0 fastapi-utils==0.2.1 frozenlist==1.3.3 +googleapis-common-protos==1.65.0 greenlet==2.0.2 h11==0.14.0 idna==3.4 +importlib_metadata==8.4.0 iniconfig==2.0.0 +logfire==0.55.0 lxml==4.9.2 +markdown-it-py==3.0.0 +mdurl==0.1.2 multidict==6.0.4 numpy==1.26.0 +opentelemetry-api==1.27.0 +opentelemetry-exporter-otlp-proto-common==1.27.0 +opentelemetry-exporter-otlp-proto-http==1.27.0 +opentelemetry-instrumentation==0.48b0 +opentelemetry-instrumentation-asgi==0.48b0 +opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-proto==1.27.0 +opentelemetry-sdk==1.27.0 +opentelemetry-semantic-conventions==0.48b0 +opentelemetry-util-http==0.48b0 packaging==23.1 pandas==2.1.1 -pip==22.0.2 pluggy==1.0.0 +protobuf==4.25.5 pydantic==1.10.2 +Pygments==2.18.0 pytest==7.2.1 pytest-asyncio==0.21.0 python-dateutil==2.8.2 python-dotenv==1.0.0 pytz==2023.3.post1 redis==4.6.0 -setuptools==59.6.0 +requests==2.32.3 +rich==13.8.1 six==1.16.0 sniffio==1.3.0 soupsieve==2.4.1 @@ -38,6 +59,8 @@ starlette==0.27.0 tomli==2.0.1 typing_extensions==4.5.0 tzdata==2023.3 +urllib3==2.2.3 uvicorn==0.23.0 -wheel==0.40.0 +wrapt==1.16.0 yarl==1.9.2 +zipp==3.20.2