Check types and style in CI

Use mypy, black and ruff to test the type annotations and style with every commit to GitHub.
sul-dlss-labs · Jan 5, 2024 · 1f25388 · 1f25388
1 parent 702b2a4
commit 1f25388
Show file tree

Hide file tree

Showing 6 changed files with 133 additions and 84 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -23,8 +23,14 @@ jobs:
         pip install poetry
         poetry install
 
+    - name: Lint check
+      run: poetry run ruff check 
+
     - name: Check formatting
       run: poetry run black --check .
 
-    - name: Test with pytest
+    - name: Check types
+      run: poetry run mypy .
+
+    - name: Run tests
       run: poetry run pytest -v
diff --git a/marctable/__init__.py b/marctable/__init__.py
@@ -1,4 +1,6 @@
 from collections.abc import Callable
+from io import IOBase
+from typing import BinaryIO, TextIO
 
 import click
 
@@ -38,7 +40,7 @@ def rule_params(f: Callable) -> Callable:
 @cli.command()
 @io_params
 @rule_params
-def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
+def csv(infile: BinaryIO, outfile: TextIO, rules: list, batch: int) -> None:
     """
     Convert MARC to CSV.
     """
@@ -48,7 +50,7 @@ def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> Non
 @cli.command()
 @io_params
 @rule_params
-def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
+def parquet(infile: BinaryIO, outfile: IOBase, rules: list, batch: int) -> None:
     """
     Convert MARC to Parquet.
     """
@@ -58,7 +60,7 @@ def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) ->
 @cli.command()
 @io_params
 @rule_params
-def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
+def jsonl(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None:
     """
     Convert MARC to JSON Lines (JSONL)
     """
@@ -67,9 +69,10 @@ def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> N
 
 @cli.command()
 @click.argument("outfile", type=click.File("w"), default="-")
-def avram(outfile: click.File) -> None:
+def avram(outfile: TextIO) -> None:
     """
-    Generate Avram (YAML) from scraping the Library of Congress MARC bibliographic website.
+    Generate Avram (YAML) from scraping the Library of Congress MARC
+    bibliographic website.
     """
     marctable.marc.crawl(outfile=outfile)
 

diff --git a/marctable/marc.py b/marctable/marc.py
@@ -14,11 +14,11 @@
 import re
 import sys
 from functools import cache
-from typing import IO, Generator
+from typing import IO, Generator, List, Optional, Type
 from urllib.parse import urljoin
 
 import requests
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
 
 class Subfield:
@@ -28,16 +28,21 @@ def __init__(self, code: str, label: str, repeatable: bool = False) -> None:
         self.repeatable = repeatable
 
     @classmethod
-    def from_dict(_, d: dict):
-        return Subfield(d.get("code"), d.get("label"), d.get("repeatable"))
+    def from_dict(cls: Type["Subfield"], d: dict) -> "Subfield":
+        return Subfield(d["code"], d["label"], d["repeatable"])
 
     def to_dict(self) -> dict:
         return {"code": self.code, "label": self.label, "repeatable": self.repeatable}
 
 
 class Field:
     def __init__(
-        self, tag: str, label: str, subfields: dict, repeatable: False, url: str = None
+        self,
+        tag: str,
+        label: str,
+        subfields: list[Subfield],
+        repeatable: bool = False,
+        url: Optional[str] = None,
     ) -> None:
         self.tag = tag
         self.label = label
@@ -47,71 +52,66 @@ def __init__(
 
     def __str__(self) -> str:
         if len(self.subfields) > 0:
-            subfields = ": " + (",".join(self.subfields.keys()))
+            subfields = ": " + (",".join([sf.code for sf in self.subfields]))
         else:
             subfields = ""
         return (
             f"{self.tag} {self.label}: {'R' if self.repeatable else 'NR'} {subfields}"
         )
 
     @classmethod
-    def from_dict(klass, d: dict):
+    def from_dict(cls: Type["Field"], d: dict) -> "Field":
         return Field(
-            tag=d.get("tag"),
-            label=d.get("label"),
-            repeatable=d.get("repeatable"),
+            tag=d["tag"],
+            label=d["label"],
+            repeatable=d["repeatable"],
             url=d.get("url"),
             subfields=[Subfield.from_dict(d) for d in d.get("subfields", {}).values()],
         )
 
     def to_dict(self) -> dict:
-        return {
+        d = {
             "tag": self.tag,
             "label": self.label,
             "repeatable": self.repeatable,
             "url": self.url,
-            "subfields": {sf.code: sf.to_dict() for sf in self.subfields.values()},
         }
 
-    def to_avram(self) -> dict:
-        d = self.to_dict()
-        if len(d["subfields"]) == 0:
-            del d["subfields"]
+        if self.subfields is not None:
+            d["subfields"] = {sf.code: sf.to_dict() for sf in self.subfields}
+
         return d
 
     def get_subfield(self, code: str) -> Subfield:
         for sf in self.subfields:
             if sf.code == code:
                 return sf
-        return None
+        raise SchemaSubfieldError(f"{code} is not a valid subfield in field {self.tag}")
 
 
 class MARC:
     def __init__(self) -> None:
-        self.fields = []
+        self.fields: List[Field] = []
 
     @cache
     def get_field(self, tag: str) -> Field:
         for field in self.fields:
             if field.tag == tag:
                 return field
-        return None
+        raise SchemaFieldError(f"{tag} is not a defined field tag in Avram schema")
 
     @cache
     def get_subfield(self, tag: str, code: str) -> Subfield:
         field = self.get_field(tag)
-        if field:
-            return field.get_subfield(code)
-        else:
-            return None
+        return field.get_subfield(code)
 
     @property
-    def avram_file(self):
+    def avram_file(self) -> pathlib.Path:
         return pathlib.Path(__file__).parent / "marc.json"
 
     @classmethod
     @cache
-    def from_avram(cls, avram_file: IO = None) -> dict:
+    def from_avram(cls: Type["MARC"], avram_file: Optional[IO] = None) -> "MARC":
         marc = MARC()
 
         if avram_file is None:
@@ -122,7 +122,7 @@ def from_avram(cls, avram_file: IO = None) -> dict:
 
         return marc
 
-    def write_avram(self, avram_file: IO = None) -> None:
+    def to_avram(self, avram_file: Optional[IO] = None) -> None:
         if avram_file is None:
             avram_file = self.avram_file.open("w")
 
@@ -131,11 +131,19 @@ def write_avram(self, avram_file: IO = None) -> None:
             "url": "https://www.loc.gov/marc/bibliographic/",
             "family": "marc",
             "language": "en",
-            "fields": {f.tag: f.to_avram() for f in self.fields},
+            "fields": {f.tag: f.to_dict() for f in self.fields},
         }
         json.dump(d, avram_file, indent=2)
 
 
+class SchemaFieldError(Exception):
+    pass
+
+
+class SchemaSubfieldError(Exception):
+    pass
+
+
 def fields() -> Generator[Field, None, None]:
     toc_url = "https://www.loc.gov/marc/bibliographic/"
     toc_doc = _soup(toc_url)
@@ -150,30 +158,34 @@ def fields() -> Generator[Field, None, None]:
                         yield field
 
 
-def make_field(url: str) -> Field:
+def make_field(url: str) -> Optional[Field]:
     soup = _soup(url)
-    h1 = soup.select_one("h1", first=True).text.strip()
-    if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1):
+    h1: Optional[Tag] = soup.select_one("h1")
+    if h1 is None:
+        raise Exception("Expecting h1 element in {url}")
+
+    h1_text: str = h1.text.strip()
+    if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1_text):
         tag, label, repeatable = m1.groups()
 
         # most pages put the subfield info in a list
-        subfields = {}
+        subfields = []
         for el in soup.select("table.subfields li"):
             if m2 := re.match(r"^\$(.) - (.+) \((.+)\)$", el.text):
-                subfields[m2.group(1)] = Subfield(
-                    m2.group(1), m2.group(2), m2.group(3) == "R"
-                )
+                subfields.append(Subfield(m2.group(1), m2.group(2), m2.group(3) == "R"))
 
         # some pages use a different layout, of course
         if len(subfields) == 0:
             for el in soup.select('td[colspan="1"]'):
                 for text in el.text.split("$"):
                     text = text.strip()
                     if m2 := re.match(r"^(.) - (.+) \((.+)\)$", text):
-                        subfields[m2.group(1)] = Subfield(
-                            code=m2.group(1),
-                            label=m2.group(2),
-                            repeatable=m2.group(3) == "R",
+                        subfields.append(
+                            Subfield(
+                                code=m2.group(1),
+                                label=m2.group(2),
+                                repeatable=m2.group(3) == "R",
+                            )
                         )
 
         return Field(
@@ -184,6 +196,8 @@ def make_field(url: str) -> Field:
             subfields=subfields,
         )
 
+    return None
+
 
 # scrape the loc website for the marc fields
 def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None:
@@ -194,7 +208,7 @@ def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None:
             print(f)
         if n != 0 and len(marc.fields) >= n:
             break
-    marc.write_avram(outfile)
+    marc.to_avram(outfile)
 
 
 def _soup(url: str) -> BeautifulSoup: