Skip to content

Commit

Permalink
latest
Browse files Browse the repository at this point in the history
  • Loading branch information
gleasonw committed Sep 28, 2024
1 parent 4134fff commit b508e07
Show file tree
Hide file tree
Showing 10 changed files with 140 additions and 14 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
COPY ./app /app/app

#
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]j
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]
22 changes: 19 additions & 3 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from contextlib import asynccontextmanager
from io import StringIO
import os
import time
import aiohttp.client_exceptions
from bs4 import BeautifulSoup, ResultSet
import uvicorn
Expand Down Expand Up @@ -33,6 +34,7 @@
from pydantic import BaseModel
import pandas as pd
from datetime import datetime
import logfire

from app.models import (
ContextRow,
Expand All @@ -45,12 +47,23 @@
TopPaper,
)

import dotenv

dotenv.load_dotenv()


logfire.configure(token=os.getenv("LOGFIRE_TOKEN"))


MAX_PAPERS_TO_SEARCH = 600


gallica_session: aiohttp.ClientSession

# TODO:
# - set up logfire
# - fix multi-term search for Gallicagram


@asynccontextmanager
async def gallica_session_lifespan(app: FastAPI):
Expand All @@ -73,7 +86,7 @@ def session():
allow_methods=["*"],
allow_headers=["*"],
)

logfire.instrument_fastapi(app)
# limit the number of requests per route; top_paper is more intensive than the others


Expand Down Expand Up @@ -122,7 +135,7 @@ async def full_text(ark: str, page: Optional[int] = None) -> List[ConvertedXMLPa
]
if page_data and len(page_data) > 0:
return page_data
return None
return []
except aiohttp.client_exceptions.ClientConnectorError:
raise HTTPException(status_code=503, detail="Could not connect to Gallica.")

Expand Down Expand Up @@ -757,7 +770,7 @@ def get_unix_timestamp(row) -> int:
month = int(row.get("mois", 1))

dt = datetime(year, month, 1)
return dt.timestamp() * 1000
return int(dt.timestamp() * 1000)

data = series_dataframe.apply(
lambda row: (get_unix_timestamp(row), row["ratio"]), axis=1
Expand All @@ -773,7 +786,10 @@ def get_unix_timestamp(row) -> int:

async def fetch_series_dataframe(url: str, params: Dict):
async with aiohttp.ClientSession() as session:
start = time.time()
async with session.get(url, params=params) as response:
print(f"Fetched {response.url}")
print(f"Took {time.time() - start} seconds")
if response.status != 200:
raise HTTPException(
status_code=503, detail="Could not connect to Gallicagram! Egads!"
Expand Down
10 changes: 3 additions & 7 deletions fly.toml
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
# fly.toml app configuration file generated for gallica-getter on 2024-02-09T20:21:59+01:00
# fly.toml app configuration file generated for gallica-getter-little-snow-3158 on 2024-03-01T13:00:50+01:00
#
# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
#

app = 'gallica-getter'
app = 'gallica-getter-little-snow-3158'
primary_region = 'cdg'

[build]
builder = 'paketobuildpacks/builder:base'

[env]
PORT = '8080'

[http_service]
internal_port = 8080
Expand All @@ -21,6 +17,6 @@ primary_region = 'cdg'
processes = ['app']

[[vm]]
memory = '1gb'
cpu_kind = 'shared'
cpus = 1
memory_mb = 1024
9 changes: 9 additions & 0 deletions gallica-getter-bruno/bruno.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"version": "1",
"name": "gallica-getter",
"type": "collection",
"ignore": [
"node_modules",
".git"
]
}
3 changes: 3 additions & 0 deletions gallica-getter-bruno/collection.bru
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
vars:pre-request {
baseUrl: https://gallica-getter-little-snow-3158.fly.dev
}
21 changes: 21 additions & 0 deletions gallica-getter-bruno/direct-gallica.bru
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
meta {
name: direct-gallica
type: http
seq: 5
}

get {
url: https://gallica.bnf.fr/SRU?operation=searchRetrieve&exactSearch=True&version=1.2&startRecord=0&maximumRecords=10&collapsing=false&query=%28text+adj+%22brazza%22%29+and+dc.type+all+%22fascicule%22+or+dc.type+all+%22monographie%22
body: none
auth: none
}

params:query {
operation: searchRetrieve
exactSearch: True
version: 1.2
startRecord: 0
maximumRecords: 10
collapsing: false
query: (text adj "brazza") and dc.type all "fascicule" or dc.type all "monographie"
}
15 changes: 15 additions & 0 deletions gallica-getter-bruno/gallicaRecords.bru
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
meta {
name: gallicaRecords
type: http
seq: 4
}

get {
url: {{baseUrl}}/api/gallicaRecords?terms=brazza
body: none
auth: none
}

params:query {
terms: brazza
}
26 changes: 26 additions & 0 deletions gallica-getter-bruno/gallica_image.bru
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
meta {
name: gallica_image
type: http
seq: 3
}

post {
url: https://rapportgallica.bnf.fr/api/snippet
body: json
auth: none
}

headers {
Content-Type: application/json
}

body:json {
{
"ark": "bpt6k58069086",
"pages": [
"1"
],
"isPeriodique": true,
"query": "(gallica any \"brazza\")"
}
}
17 changes: 17 additions & 0 deletions gallica-getter-bruno/image.bru
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
meta {
name: image
type: http
seq: 2
}

get {
url: {{baseUrl}}/api/image?ark=bpt6k93804234&term=brazza&page=15
body: none
auth: none
}

params:query {
ark: bpt6k93804234
term: brazza
page: 15
}
29 changes: 26 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,35 +1,56 @@
aiohttp==3.8.4
aiosignal==1.3.1
anyio==3.6.2
asgiref==3.8.1
async-timeout==4.0.2
attrs==23.1.0
beautifulsoup4==4.12.2
certifi==2024.8.30
charset-normalizer==3.1.0
ciso8601==2.2.0
click==8.1.3
Deprecated==1.2.14
exceptiongroup==1.1.1
executing==2.1.0
fastapi==0.100.0
fastapi-utils==0.2.1
frozenlist==1.3.3
googleapis-common-protos==1.65.0
greenlet==2.0.2
h11==0.14.0
idna==3.4
importlib_metadata==8.4.0
iniconfig==2.0.0
logfire==0.55.0
lxml==4.9.2
markdown-it-py==3.0.0
mdurl==0.1.2
multidict==6.0.4
numpy==1.26.0
opentelemetry-api==1.27.0
opentelemetry-exporter-otlp-proto-common==1.27.0
opentelemetry-exporter-otlp-proto-http==1.27.0
opentelemetry-instrumentation==0.48b0
opentelemetry-instrumentation-asgi==0.48b0
opentelemetry-instrumentation-fastapi==0.48b0
opentelemetry-proto==1.27.0
opentelemetry-sdk==1.27.0
opentelemetry-semantic-conventions==0.48b0
opentelemetry-util-http==0.48b0
packaging==23.1
pandas==2.1.1
pip==22.0.2
pluggy==1.0.0
protobuf==4.25.5
pydantic==1.10.2
Pygments==2.18.0
pytest==7.2.1
pytest-asyncio==0.21.0
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3.post1
redis==4.6.0
setuptools==59.6.0
requests==2.32.3
rich==13.8.1
six==1.16.0
sniffio==1.3.0
soupsieve==2.4.1
Expand All @@ -38,6 +59,8 @@ starlette==0.27.0
tomli==2.0.1
typing_extensions==4.5.0
tzdata==2023.3
urllib3==2.2.3
uvicorn==0.23.0
wheel==0.40.0
wrapt==1.16.0
yarl==1.9.2
zipp==3.20.2

0 comments on commit b508e07

Please sign in to comment.