Commit
gleasonw committed Sep 16, 2023
2 parents f87d9ef + 88a20e1 · commit c288645
Showing 8 changed files with 78 additions and 79 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# gallica-getter

-Find documents where a word occurs, context for the occurrence, full text for OCR document pages. Compose Gallica services using Python classes that represent each service.
+Find documents where a word occurs, context for the occurrence, full text for OCR document pages. Compose Gallica services with service-specific classes.

Alongside Gallica wrappers, this project contains a JSON API. Deploy this API anywhere: Railway, Google App Engine, AWS, Fly.io, a Raspberry Pi.

@@ -41,5 +41,5 @@ http://localhost:8000/api/gallicaRecords?terms=portland&source=periodical&link_t
An end-to-end test suite calls each Gallica wrapper endpoint and verifies the result.

```
-python -m pytest gallicaGetter/tests/test_gallicaWrapper.py
+python -m pytest gallicaGetter/tests/gallicaE2E.py
```
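
A quick way to exercise the JSON API the README describes, assuming the app is served locally (for example with `uvicorn main:app`, matching the `localhost:8000` host in the README's example URL). The query parameters are taken from that example; the rest is a sketch, not project code:

```
import asyncio

import aiohttp


async def main():
    async with aiohttp.ClientSession() as session:
        # Parameters mirror the README's example gallicaRecords URL.
        params = {"terms": "portland", "source": "periodical"}
        async with session.get(
            "http://localhost:8000/api/gallicaRecords", params=params
        ) as resp:
            print(await resp.json())


asyncio.run(main())
```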
38 changes: 22 additions & 16 deletions gallicaGetter/mostFrequent.py
@@ -1,7 +1,7 @@
+import re
from typing import Dict, List
from io import StringIO
import random
-from collections import Counter
import aiohttp
from bs4 import BeautifulSoup
import os
@@ -23,8 +23,9 @@
async def get_gallica_core(
root_gram: str,
start_date: str,
-    end_date: str,
+    max_n: int,
session: aiohttp.ClientSession,
+    end_date: str | None = None,
sample_size: int = 50,
) -> Dict[str, int]:
"""An experimental tool that returns the most frequent words in the surrounding context of a target word occurrence."""
@@ -44,7 +45,10 @@ async def get_gallica_core(
num_volumes = sum(
query.gallica_results_for_params for query in num_volumes_with_root_gram
)
-    indices_to_sample = random.sample(range(num_volumes), sample_size)
+    corrected_sample_size = min(sample_size, num_volumes)
+    if corrected_sample_size == 0:
+        return {}
+    indices_to_sample = random.sample(range(num_volumes), corrected_sample_size)
volumes_with_root_gram = await VolumeOccurrence.get(
OccurrenceArgs(
terms=[root_gram],
@@ -62,8 +66,21 @@
codes=volume_codes, target_word=root_gram, session=session
)
)
-    notable_words_in_distance = get_associated_words(text_to_analyze, root_gram)
-    return notable_words_in_distance
+    lower_text_string = text_to_analyze.read().lower()
+    lower_text_array = re.findall(r"\b[a-zA-Z'àâéèêëîïôûùüç-]+\b", lower_text_string)
+    root_grams = set(root_gram.split())
+    filtered_array = []
+    for i in range(len(lower_text_array)):
+        word = lower_text_array[i]
+        if word not in stopwords_fr | stopwords_en | root_grams:
+            filtered_array.append(word)
+    counts = {}
+    for i in range(len(filtered_array)):
+        for j in range(1, max_n + 1):
+            if i + j <= len(filtered_array):
+                word = " ".join(filtered_array[i : i + j])
+                counts[word] = counts.get(word, 0) + 1
+    return counts


async def get_text_for_codes(
@@ -78,14 +95,3 @@ async def get_text_for_codes(
soup = BeautifulSoup(page.context, "html.parser")
text += soup.get_text()
return text


-def get_associated_words(text_to_analyze: StringIO, root_gram: str) -> Dict[str, int]:
-    counts = Counter(text_to_analyze.read().split())
-    counts = {k.lower(): v for k, v in counts.items()}
-    for stopword in stopwords_fr | stopwords_en:
-        counts.pop(stopword, None)
-    counts = {k: v for k, v in counts.items() if k.isalnum()}
-    for word in root_gram.split():
-        counts.pop(word, None)
-    return counts
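
The counting logic that replaces `get_associated_words` slides a window of one to `max_n` tokens over the stopword-filtered text, so multi-word grams are counted alongside single words. A self-contained sketch of just that loop, with a toy token list standing in for the fetched Gallica text:

```
from typing import Dict, List


def count_ngrams(tokens: List[str], max_n: int) -> Dict[str, int]:
    # Same window logic as the loop above: every span of 1..max_n
    # consecutive tokens becomes a counted gram.
    counts: Dict[str, int] = {}
    for i in range(len(tokens)):
        for j in range(1, max_n + 1):
            if i + j <= len(tokens):
                gram = " ".join(tokens[i : i + j])
                counts[gram] = counts.get(gram, 0) + 1
    return counts


print(count_ngrams(["presse", "quotidienne", "presse", "quotidienne"], max_n=2))
# {'presse': 2, 'presse quotidienne': 2, 'quotidienne': 2, 'quotidienne presse': 1}
```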
13 changes: 6 additions & 7 deletions gallicaGetter/tests/gallicaE2E.py
@@ -15,13 +15,12 @@
@pytest.mark.asyncio
async def test_pagination():
async with aiohttp.ClientSession() as session:
-        records = Pagination.get("bpt6k607811b", session=session)
-        list_records = [record async for record in records]
-        first = list_records[0]
-        assert first.ark == "bpt6k607811b"
-        assert first.page_count == 4
-        assert first.has_content == True
-        assert first.has_toc == False
+        pagination = await Pagination.get("bpt6k607811b", session=session)
+        assert pagination
+        assert pagination.ark == "bpt6k607811b"
+        assert pagination.page_count == 4
+        assert pagination.has_content == True
+        assert pagination.has_toc == False


@pytest.mark.asyncio
4 changes: 3 additions & 1 deletion gallicaGetter/utils/base_query_builds.py
@@ -24,7 +24,9 @@ def build_base_queries(
for code_bundle in bundle_codes(args.codes):
if type(args.start_index) is int:
cursor = [args.start_index]
-        for c in cursor:  # type: ignore
+        else:
+            cursor = args.start_index
+        for c in cursor:
base_queries.append(
VolumeQuery(
terms=args.terms,
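
The new `else` branch closes a gap: as far as this hunk shows, a non-int `start_index` (an iterable of offsets) previously left `cursor` unassigned, which the deleted `# type: ignore` was papering over. The normalization pattern in isolation, with hypothetical values:

```
from typing import Iterable, List, Union


def normalize_start_index(start_index: Union[int, Iterable[int]]) -> List[int]:
    # A single offset becomes a one-element list; an iterable of offsets
    # passes through, mirroring the branch added above.
    if type(start_index) is int:
        return [start_index]
    return list(start_index)


assert normalize_start_index(0) == [0]
assert normalize_start_index([0, 50, 100]) == [0, 50, 100]
```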
15 changes: 7 additions & 8 deletions gallicaGetter/utils/index_query_builds.py
@@ -21,9 +21,7 @@ async def build_indexed_queries(
new_query = query.make_copy(start_index=offset or 0, num_records=limit)
queries_with_num_results.append(new_query)
else:
-        queries_with_num_results = await get_num_results_for_queries(
-            queries, session
-        )
+        queries_with_num_results = await get_num_results_for_queries(queries, session)
if on_get_total_records:
on_get_total_records(sum(query.gallica_results_for_params for query in queries))
return index_queries_by_num_results(
@@ -38,11 +36,12 @@ async def get_num_results_for_queries(
responses = await fetch_queries_concurrently(queries, session)
queries_with_num_results_state = []
for response in responses:
-        assert response.query is type(VolumeQuery) or type(PaperQuery)
-        response.query.gallica_results_for_params = get_num_records_from_gallica_xml(
-            response.text
-        )
-        queries_with_num_results_state.append(response.query)
+        if response:
+            assert isinstance(response.query, (VolumeQuery, PaperQuery))
+            response.query.gallica_results_for_params = (
+                get_num_records_from_gallica_xml(response.text)
+            )
+            queries_with_num_results_state.append(response.query)
return queries_with_num_results_state


38 changes: 35 additions & 3 deletions main.py
@@ -11,6 +11,7 @@
ContextSnippets,
ExtractRoot,
)
+from gallicaGetter.mostFrequent import get_gallica_core
from gallicaGetter.fetch import APIRequest, fetch_queries_concurrently
from gallicaGetter.imageSnippet import ImageQuery, ImageSnippet
from gallicaGetter.pageText import PageQuery, PageText
@@ -27,6 +28,7 @@
get_records_from_xml,
)
from gallicaGetter.volumeOccurrence import VolumeOccurrence, VolumeRecord
+from pydantic import BaseModel

from models import (
ContextRow,
@@ -98,7 +100,6 @@ async def full_text(ark: str, page: Optional[int] = None):
page_data = []
if page is None:
pagination_data = await Pagination.get(ark=ark, session=gallica_session)
-        print(pagination_data)
if pagination_data:
page_data = [
page
@@ -140,7 +141,7 @@ async def top(
lock: asyncio.Lock = Depends(get_lock),
session: aiohttp.ClientSession = Depends(session),
):
-    # have to lock this route because it's the most intensive on Gallica's servers...
+    # have to lock this route because it's the most intense on Gallica's servers...

async with lock:
try:
@@ -196,7 +197,7 @@ def sort_by_count_and_return_top_limit(items):
items.sort(key=lambda x: x.count, reverse=True)
return items[:limit]

-    item = {
+    return {
"top_papers": sort_by_count_and_return_top_limit(top_papers),
"top_cities": sort_by_count_and_return_top_limit(top_cities),
}
@@ -328,6 +329,37 @@ async def fetch_volume_context(ark: str, term: str):
return rows


@app.get("/api/pagination")
async def pagination(
ark: str,
session: aiohttp.ClientSession = Depends(session),
):
return await Pagination.get(ark=ark, session=session)


@app.get("/api/mostTermsAtTime")
async def most_terms_at_time(
term: str,
year: int,
month: int | None = None,
max_n: int = 1,
session: aiohttp.ClientSession = Depends(session),
sample_size: int = 50,
):
counts = await get_gallica_core(
root_gram=term,
max_n=max_n,
start_date=make_date_from_year_mon_day(year=year, month=month),
session=session,
sample_size=sample_size,
)
return sorted(
[(term, count) for term, count in counts.items()],
key=lambda x: x[1],
reverse=True,
)[0:20]


@app.get("/api/sru")
async def fetch_sru(
terms: List[str] = Query(),
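
The new `mostTermsAtTime` route is a thin wrapper over `get_gallica_core`, whose signature appears in the `mostFrequent.py` diff above. A direct-use sketch; the `start_date` string is an assumption, since `make_date_from_year_mon_day` is not shown in this commit:

```
import asyncio

import aiohttp

from gallicaGetter.mostFrequent import get_gallica_core


async def main():
    async with aiohttp.ClientSession() as session:
        counts = await get_gallica_core(
            root_gram="presse",
            start_date="1900",  # assumed format; main.py derives it from year/month
            max_n=2,
            session=session,
            sample_size=10,
        )
        # Mirror the endpoint: top 20 grams by frequency.
        print(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:20])


asyncio.run(main())
```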
7 changes: 3 additions & 4 deletions requirements.txt
@@ -1,8 +1,7 @@
aiohttp==3.8.4
beautifulsoup4==4.12.2
-fastapi==0.100.0
+fastapi==0.103.0
lxml==4.9.2
-pydantic==1.10.2
+pydantic==2.3.0
pytest==7.2.1
-redis==4.6.0
-uvicorn==0.23.0
+uvicorn==0.23.2
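
The pins move together: FastAPI has supported Pydantic v2 since 0.100, so the `pydantic==2.3.0` bump is the substantive change here, the `fastapi` and `uvicorn` bumps keep pace, and the `redis` pin is dropped outright. A minimal Pydantic v2 smoke test; `TermCount` is a hypothetical model, not one from this repo:

```
from pydantic import BaseModel


class TermCount(BaseModel):
    term: str
    count: int


# Pydantic v2 renames v1's .dict() to .model_dump().
print(TermCount(term="presse", count=3).model_dump())
```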
38 changes: 0 additions & 38 deletions tests/test_main.py

This file was deleted.
