Commit
gleasonw committed Sep 16, 2023
2 parents f87d9ef + 88a20e1 · commit c288645
Showing 8 changed files with 78 additions and 79 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# gallica-getter

-Find documents where a word occurs, context for the occurrence, full text for OCR document pages. Compose Gallica services using Python classes that represent each service.
+Find documents where a word occurs, context for the occurrence, full text for OCR document pages. Compose Gallica services with service-specific classes.

Alongside Gallica wrappers, this project contains a JSON API. Deploy this API anywhere: Railway, Google App Engine, AWS, Fly.io, a Raspberry Pi.

@@ -41,5 +41,5 @@ http://localhost:8000/api/gallicaRecords?terms=portland&source=periodical&link_t
An end-to-end test suite calls each Gallica wrapper endpoint and verifies the result.

```
-python -m pytest gallicaGetter/tests/test_gallicaWrapper.py
+python -m pytest gallicaGetter/tests/gallicaE2E.py
```
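
A quick way to exercise the JSON API the README describes, assuming the app is served locally (for example with `uvicorn main:app`, matching the `localhost:8000` host in the README's example URL). The query parameters are taken from that example; the rest is a sketch, not project code:

```
import asyncio

import aiohttp


async def main():
    async with aiohttp.ClientSession() as session:
        # Parameters mirror the README's example gallicaRecords URL.
        params = {"terms": "portland", "source": "periodical"}
        async with session.get(
            "http://localhost:8000/api/gallicaRecords", params=params
        ) as resp:
            print(await resp.json())


asyncio.run(main())
```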
38 changes: 22 additions & 16 deletions gallicaGetter/mostFrequent.py
@@ -1,7 +1,7 @@
+import re
from typing import Dict, List
from io import StringIO
import random
-from collections import Counter
import aiohttp
from bs4 import BeautifulSoup
import os
@@ -23,8 +23,9 @@
async def get_gallica_core(
root_gram: str,
start_date: str,
-    end_date: str,
+    max_n: int,
session: aiohttp.ClientSession,
+    end_date: str | None = None,
sample_size: int = 50,
) -> Dict[str, int]:
"""An experimental tool that returns the most frequent words in the surrounding context of a target word occurrence."""
@@ -44,7 +45,10 @@ async def get_gallica_core(
num_volumes = sum(
query.gallica_results_for_params for query in num_volumes_with_root_gram
)
-    indices_to_sample = random.sample(range(num_volumes), sample_size)
+    corrected_sample_size = min(sample_size, num_volumes)
+    if corrected_sample_size == 0:
+        return {}
+    indices_to_sample = random.sample(range(num_volumes), corrected_sample_size)
volumes_with_root_gram = await VolumeOccurrence.get(
OccurrenceArgs(
terms=[root_gram],
@@ -62,8 +66,21 @@
codes=volume_codes, target_word=root_gram, session=session
)
)
-    notable_words_in_distance = get_associated_words(text_to_analyze, root_gram)
-    return notable_words_in_distance
+    lower_text_string = text_to_analyze.read().lower()
+    lower_text_array = re.findall(r"\b[a-zA-Z'àâéèêëîïôûùüç-]+\b", lower_text_string)
+    root_grams = set(root_gram.split())
+    filtered_array = []
+    for i in range(len(lower_text_array)):
+        word = lower_text_array[i]
+        if word not in stopwords_fr | stopwords_en | root_grams:
+            filtered_array.append(word)
+    counts = {}
+    for i in range(len(filtered_array)):
+        for j in range(1, max_n + 1):
+            if i + j <= len(filtered_array):
+                word = " ".join(filtered_array[i : i + j])
+                counts[word] = counts.get(word, 0) + 1
+    return counts


async def get_text_for_codes(
@@ -78,14 +95,3 @@ async def get_text_for_codes(
soup = BeautifulSoup(page.context, "html.parser")
text += soup.get_text()
return text


-def get_associated_words(text_to_analyze: StringIO, root_gram: str) -> Dict[str, int]:
-    counts = Counter(text_to_analyze.read().split())
-    counts = {k.lower(): v for k, v in counts.items()}
-    for stopword in stopwords_fr | stopwords_en:
-        counts.pop(stopword, None)
-    counts = {k: v for k, v in counts.items() if k.isalnum()}
-    for word in root_gram.split():
-        counts.pop(word, None)
-    return counts
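
The counting logic that replaces `get_associated_words` slides a window of one to `max_n` tokens over the stopword-filtered text, so multi-word grams are counted alongside single words. A self-contained sketch of just that loop, with a toy token list standing in for the fetched Gallica text:

```
from typing import Dict, List


def count_ngrams(tokens: List[str], max_n: int) -> Dict[str, int]:
    # Same window logic as the loop above: every span of 1..max_n
    # consecutive tokens becomes a counted gram.
    counts: Dict[str, int] = {}
    for i in range(len(tokens)):
        for j in range(1, max_n + 1):
            if i + j <= len(tokens):
                gram = " ".join(tokens[i : i + j])
                counts[gram] = counts.get(gram, 0) + 1
    return counts


print(count_ngrams(["presse", "quotidienne", "presse", "quotidienne"], max_n=2))
# {'presse': 2, 'presse quotidienne': 2, 'quotidienne': 2, 'quotidienne presse': 1}
```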
13 changes: 6 additions & 7 deletions gallicaGetter/tests/gallicaE2E.py
@@ -15,13 +15,12 @@
@pytest.mark.asyncio
async def test_pagination():
async with aiohttp.ClientSession() as session:
-        records = Pagination.get("bpt6k607811b", session=session)
-        list_records = [record async for record in records]
-        first = list_records[0]
-        assert first.ark == "bpt6k607811b"
-        assert first.page_count == 4
-        assert first.has_content == True
-        assert first.has_toc == False
+        pagination = await Pagination.get("bpt6k607811b", session=session)
+        assert pagination
+        assert pagination.ark == "bpt6k607811b"
+        assert pagination.page_count == 4
+        assert pagination.has_content == True
+        assert pagination.has_toc == False


@pytest.mark.asyncio
4 changes: 3 additions & 1 deletion gallicaGetter/utils/base_query_builds.py
@@ -24,7 +24,9 @@ def build_base_queries(
for code_bundle in bundle_codes(args.codes):
if type(args.start_index) is int:
cursor = [args.start_index]
-        for c in cursor:  # type: ignore
+        else:
+            cursor = args.start_index
+        for c in cursor:
base_queries.append(
VolumeQuery(
terms=args.terms,
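
The new `else` branch closes a gap: as far as this hunk shows, a non-int `start_index` (an iterable of offsets) previously left `cursor` unassigned, which the deleted `# type: ignore` was papering over. The normalization pattern in isolation, with hypothetical values:

```
from typing import Iterable, List, Union


def normalize_start_index(start_index: Union[int, Iterable[int]]) -> List[int]:
    # A single offset becomes a one-element list; an iterable of offsets
    # passes through, mirroring the branch added above.
    if type(start_index) is int:
        return [start_index]
    return list(start_index)


assert normalize_start_index(0) == [0]
assert normalize_start_index([0, 50, 100]) == [0, 50, 100]
```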
15 changes: 7 additions & 8 deletions gallicaGetter/utils/index_query_builds.py
@@ -21,9 +21,7 @@ async def build_indexed_queries(
new_query = query.make_copy(start_index=offset or 0, num_records=limit)
queries_with_num_results.append(new_query)
else:
-        queries_with_num_results = await get_num_results_for_queries(
-            queries, session
-        )
+        queries_with_num_results = await get_num_results_for_queries(queries, session)
if on_get_total_records:
on_get_total_records(sum(query.gallica_results_for_params for query in queries))
return index_queries_by_num_results(
@@ -38,11 +36,12 @@ async def get_num_results_for_queries(
responses = await fetch_queries_concurrently(queries, session)
queries_with_num_results_state = []
for response in responses:
-        assert response.query is type(VolumeQuery) or type(PaperQuery)
-        response.query.gallica_results_for_params = get_num_records_from_gallica_xml(
-            response.text
-        )
-        queries_with_num_results_state.append(response.query)
+        if response:
+            assert isinstance(response.query, (VolumeQuery, PaperQuery))
+            response.query.gallica_results_for_params = (
+                get_num_records_from_gallica_xml(response.text)
+            )
+            queries_with_num_results_state.append(response.query)
return queries_with_num_results_state


38 changes: 35 additions & 3 deletions main.py
@@ -11,6 +11,7 @@
ContextSnippets,
ExtractRoot,
)
+from gallicaGetter.mostFrequent import get_gallica_core
from gallicaGetter.fetch import APIRequest, fetch_queries_concurrently
from gallicaGetter.imageSnippet import ImageQuery, ImageSnippet
from gallicaGetter.pageText import PageQuery, PageText
@@ -27,6 +28,7 @@
get_records_from_xml,
)
from gallicaGetter.volumeOccurrence import VolumeOccurrence, VolumeRecord
+from pydantic import BaseModel

from models import (
ContextRow,
@@ -98,7 +100,6 @@ async def full_text(ark: str, page: Optional[int] = None):
page_data = []
if page is None:
pagination_data = await Pagination.get(ark=ark, session=gallica_session)
-        print(pagination_data)
if pagination_data:
page_data = [
page
@@ -140,7 +141,7 @@ async def top(
lock: asyncio.Lock = Depends(get_lock),
session: aiohttp.ClientSession = Depends(session),
):
-    # have to lock this route because it's the most intensive on Gallica's servers...
+    # have to lock this route because it's the most intense on Gallica's servers...

async with lock:
try:
@@ -196,7 +197,7 @@ def sort_by_count_and_return_top_limit(items):
items.sort(key=lambda x: x.count, reverse=True)
return items[:limit]

-    item = {
+    return {
"top_papers": sort_by_count_and_return_top_limit(top_papers),
"top_cities": sort_by_count_and_return_top_limit(top_cities),
}
@@ -328,6 +329,37 @@ async def fetch_volume_context(ark: str, term: str):
return rows


@app.get("/api/pagination")
async def pagination(
ark: str,
session: aiohttp.ClientSession = Depends(session),
):
return await Pagination.get(ark=ark, session=session)


@app.get("/api/mostTermsAtTime")
async def most_terms_at_time(
term: str,
year: int,
month: int | None = None,
max_n: int = 1,
session: aiohttp.ClientSession = Depends(session),
sample_size: int = 50,
):
counts = await get_gallica_core(
root_gram=term,
max_n=max_n,
start_date=make_date_from_year_mon_day(year=year, month=month),
session=session,
sample_size=sample_size,
)
return sorted(
[(term, count) for term, count in counts.items()],
key=lambda x: x[1],
reverse=True,
)[0:20]


@app.get("/api/sru")
async def fetch_sru(
terms: List[str] = Query(),
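
The new `mostTermsAtTime` route is a thin wrapper over `get_gallica_core`, whose signature appears in the `mostFrequent.py` diff above. A direct-use sketch; the `start_date` string is an assumption, since `make_date_from_year_mon_day` is not shown in this commit:

```
import asyncio

import aiohttp

from gallicaGetter.mostFrequent import get_gallica_core


async def main():
    async with aiohttp.ClientSession() as session:
        counts = await get_gallica_core(
            root_gram="presse",
            start_date="1900",  # assumed format; main.py derives it from year/month
            max_n=2,
            session=session,
            sample_size=10,
        )
        # Mirror the endpoint: top 20 grams by frequency.
        print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:20])


asyncio.run(main())
```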
7 changes: 3 additions & 4 deletions requirements.txt
@@ -1,8 +1,7 @@
aiohttp==3.8.4
beautifulsoup4==4.12.2
-fastapi==0.100.0
+fastapi==0.103.0
lxml==4.9.2
-pydantic==1.10.2
+pydantic==2.3.0
pytest==7.2.1
-redis==4.6.0
-uvicorn==0.23.0
+uvicorn==0.23.2
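
The pins move together: FastAPI has supported Pydantic v2 since 0.100, so the `pydantic==2.3.0` bump is the substantive change here, the `fastapi` and `uvicorn` bumps keep pace, and the `redis` pin is dropped outright. A minimal Pydantic v2 smoke test; `TermCount` is a hypothetical model, not one from this repo:

```
from pydantic import BaseModel


class TermCount(BaseModel):
    term: str
    count: int


# Pydantic v2 renames v1's .dict() to .model_dump().
print(TermCount(term="presse", count=3).model_dump())
```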
38 changes: 0 additions & 38 deletions tests/test_main.py

This file was deleted.
