feat(harvard_merger): add harvard_id field to OpinionCluster #4622

Draft: wants to merge 3 commits into base: main

Changes from 2 commits
1 change: 1 addition & 0 deletions cl/search/filters.py
@@ -166,6 +166,7 @@ class Meta:
"citation_count": INTEGER_LOOKUPS,
"precedential_status": ["exact"],
"date_blocked": DATE_LOOKUPS,
"harvard_id": ["exact"],
"blocked": ["exact"],
}

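For reviewers who want to poke at this: with the "harvard_id": ["exact"] lookup registered, the clusters REST endpoint should accept an exact-match query parameter. A minimal sketch, assuming the v4 clusters endpoint and a hypothetical CAP id; none of this is in the diff itself:

import requests

# Hypothetical query: find the cluster(s) tagged with CAP id 12345.
resp = requests.get(
    "https://www.courtlistener.com/api/rest/v4/clusters/",  # assumed endpoint version
    params={"harvard_id": "12345"},
    headers={"Authorization": "Token <your-api-token>"},
)
resp.raise_for_status()
print(resp.json()["count"])  # number of matching clusters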
117 changes: 68 additions & 49 deletions cl/search/management/commands/import_harvard_pdfs.py
@@ -51,6 +51,13 @@ def add_arguments(self, parser):
help="Directory for reading crosswalk files",
required=True,
)
parser.add_argument(
"--job",
type=str,
choices=["import_pdf", "assign_cap_id"],
default="import_pdf",
help="",
)
Collaborator Author:
I see other things running through the crosswalk file to do work (#4614). I have no settled opinion on what the “correct” approach is; the --job switch here is driven by my interest in not copy/pasting (and then owning) the moderate complexity surrounding crosswalk processing. @jtmst, I’m happy to take any direction you can give.
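To make the two modes concrete, a usage sketch (directory paths are hypothetical, not from this PR) of the same crosswalk loop serving both jobs:

from django.core.management import call_command

# Default job: walk the crosswalk files and import CAP PDFs.
call_command("import_harvard_pdfs", crosswalk_dir="/data/crosswalks")

# Same crosswalk walk, but only stamp harvard_id onto the matched clusters.
call_command(
    "import_harvard_pdfs",
    crosswalk_dir="/data/crosswalks",
    job="assign_cap_id",
)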


    def handle(self, *args: Any, **options: Any) -> None:
        """Handle the command execution.
@@ -65,6 +72,7 @@ def handle(self, *args: Any, **options: Any) -> None:

self.dry_run = options["dry_run"]
self.crosswalk_dir = options["crosswalk_dir"]
self.job = options["job"]

if not os.path.exists(self.crosswalk_dir):
logger.warning(
@@ -156,37 +164,23 @@ def process_crosswalk_file(self, crosswalk_file: str) -> None:
            crosswalk_data = json.load(f)

        for entry in tqdm(crosswalk_data, desc="Processing entries"):
-            logger.debug(f"Processing entry: {entry}")
            try:
                cap_case_id = entry["cap_case_id"]
                cl_cluster_id = entry["cl_cluster_id"]
-                json_path = entry["cap_path"]
-
-                # Construct the PDF path based on the JSON path
-                pdf_path = json_path.replace("cases", "case-pdfs").replace(
-                    ".json", ".pdf"
-                )
+                cap_path = entry["cap_path"]

-                if pdf_path in self.processed_pdfs:
-                    logger.info(f"Skipping already processed PDF: {pdf_path}")
+                if not all([cap_case_id, cl_cluster_id, cap_path]):
+                    logger.error(
+                        f"Missing key in entry: {json.dumps(entry, indent=2)}"
+                    )
                    continue

-                logger.info(f"Processing PDF: {pdf_path}")
-
-                if not self.dry_run:
-                    pdf_content = self.fetch_pdf_from_cap(pdf_path)
-                    if pdf_content:
-                        cluster = OpinionCluster.objects.get(id=cl_cluster_id)
-                        self.store_pdf_in_cl(cluster, pdf_content)
-                else:
-                    logger.info(f"Dry run: Would fetch PDF from {pdf_path}")
+                logger.debug(
+                    f"Processing entry: cap_case_id={cap_case_id}, cl_cluster_id={cl_cluster_id}, cap_path={cap_path}"
+                )

-                self.processed_pdfs.add(pdf_path)
+                self.process_entry(cap_case_id, cl_cluster_id, cap_path)

-            except KeyError as e:
-                logger.error(
-                    f"Missing key in entry: {e}. Entry: {json.dumps(entry, indent=2)}"
-                )
            except Exception as e:
                logger.error(
                    f"Error processing CAP ID {entry.get('cap_case_id', 'Unknown')}: {str(e)}",
@@ -206,36 +200,61 @@ def parse_cap_path(self, cap_path: str) -> Tuple[str, str, str]:
        case_name = parts[-1].replace(".json", "")
        return reporter_slug, volume_folder, case_name

-    def process_entry(self, entry: Dict[str, Any]) -> None:
+    def process_entry(
+        self, cap_case_id: int, cl_cluster_id: int, cap_path: str
+    ) -> None:
        """Process a single crosswalk entry.

-        :param entry: Dictionary containing crosswalk entry data.
+        :param cap_case_id: CAP case id
+        :param cl_cluster_id: CL cluster id
+        :param cap_path: Path of CAP JSON data (in the CAP_R2 S3 bucket)
        :return: None
        """
-        cap_case_id = entry["cap_case_id"]
-        cl_cluster_id = entry["cl_cluster_id"]
-        cap_path = entry["cap_path"]
-        logger.info(
-            f"Processing entry: cap_case_id={cap_case_id}, cl_cluster_id={cl_cluster_id}, cap_path={cap_path}"
-        )
-        try:
-            cluster = OpinionCluster.objects.get(id=cl_cluster_id)
-            logger.info(f"Found cluster: {cluster}")
-            pdf_content = self.fetch_pdf_from_cap(cap_path)
-            logger.info(
-                f"Fetched PDF content, length: {len(pdf_content) if pdf_content else 0}"
-            )
-            if pdf_content:
-                logger.info(
-                    "PDF content is not empty, calling store_pdf_in_cl"
-                )
-                self.store_pdf_in_cl(cluster, pdf_content)
-            else:
-                logger.info("PDF content is empty, skipping storage")
-        except OpinionCluster.DoesNotExist:
-            logger.info(f"Cluster not found for id: {cl_cluster_id}")
-        except Exception as e:
-            logger.error(f"Error processing entry: {str(e)}", exc_info=True)

+        match self.job:
+            case "import_pdf":
+                try:
+                    pdf_path = cap_path.replace("cases", "case-pdfs").replace(
+                        ".json", ".pdf"
+                    )
+
+                    if pdf_path in self.processed_pdfs:
+                        logger.info(
+                            f"Skipping already processed PDF: {pdf_path}"
+                        )
+                        return
+
+                    logger.info(f"Processing PDF: {pdf_path}")
+
+                    if not self.dry_run:
+                        pdf_content = self.fetch_pdf_from_cap(pdf_path)
+                        if pdf_content:
+                            cluster = OpinionCluster.objects.get(
+                                id=cl_cluster_id
+                            )
+                            self.store_pdf_in_cl(cluster, pdf_content)
+                    else:
+                        logger.info(
+                            f"Dry run: Would fetch PDF from {pdf_path}"
+                        )
+
+                    self.processed_pdfs.add(pdf_path)
+
+                except OpinionCluster.DoesNotExist:
+                    logger.info(f"Cluster not found for id: {cl_cluster_id}")
Collaborator Author:
Tempted to rely on the caller’s exception handler to do this logging ¯\_(ツ)_/¯ (a possible caller-side shape is sketched after this file’s diff).


case "assign_cap_id":
try:
cluster = OpinionCluster.objects.get(id=cl_cluster_id)
cluster.harvard_id = cap_case_id
if not self.dry_run:
cluster.save()

except OpinionCluster.DoesNotExist:
logger.info(f"Cluster not found for id: {cl_cluster_id}")

case _:
raise Exception(f"Unknown job {self.job}")

    def fetch_pdf_from_cap(self, pdf_path: str) -> Optional[bytes]:
        """Fetch PDF content from CAP.
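For the record, the caller-owned handling floated in the comment above might look like this rough sketch (not part of the diff): process_entry stops catching OpinionCluster.DoesNotExist itself, and a single loop logs every per-entry failure in one place. The helper name and shape are hypothetical.

import logging

from cl.search.models import OpinionCluster

logger = logging.getLogger(__name__)


def process_crosswalk_entries(command, entries):
    """Hypothetical caller that owns all per-entry error logging."""
    for entry in entries:
        try:
            # process_entry would let OpinionCluster.DoesNotExist propagate.
            command.process_entry(
                entry["cap_case_id"],
                entry["cl_cluster_id"],
                entry["cap_path"],
            )
        except OpinionCluster.DoesNotExist:
            logger.info(f"Cluster not found for id: {entry['cl_cluster_id']}")
        except Exception:
            logger.exception(
                f"Error processing CAP ID {entry.get('cap_case_id', 'Unknown')}"
            )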
69 changes: 69 additions & 0 deletions cl/search/migrations/0037_add_harvard_id_to_opinioncluster.py
@@ -0,0 +1,69 @@
# Generated by Django 5.1.2 on 2024-10-25 18:54

import pgtrigger.compiler
import pgtrigger.migrations
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ("search", "0036_add_searchquery"),
    ]

    operations = [
        pgtrigger.migrations.RemoveTrigger(
            model_name="opinioncluster",
            name="update_update",
        ),
        pgtrigger.migrations.RemoveTrigger(
            model_name="opinioncluster",
            name="delete_delete",
        ),
        migrations.AddField(
            model_name="opinioncluster",
            name="harvard_id",
            field=models.CharField(
                db_index=True,
                default=0,
                help_text="The ID of the item in the Caselaw Access Project (Harvard)",
            ),
        ),
        migrations.AddField(
            model_name="opinionclusterevent",
            name="harvard_id",
            field=models.CharField(
                default=0,
                help_text="The ID of the item in the Caselaw Access Project (Harvard)",
            ),
        ),
        pgtrigger.migrations.AddTrigger(
            model_name="opinioncluster",
            trigger=pgtrigger.compiler.Trigger(
                name="update_update",
                sql=pgtrigger.compiler.UpsertTriggerSql(
condition='WHEN (OLD."arguments" IS DISTINCT FROM (NEW."arguments") OR OLD."attorneys" IS DISTINCT FROM (NEW."attorneys") OR OLD."blocked" IS DISTINCT FROM (NEW."blocked") OR OLD."case_name" IS DISTINCT FROM (NEW."case_name") OR OLD."case_name_full" IS DISTINCT FROM (NEW."case_name_full") OR OLD."case_name_short" IS DISTINCT FROM (NEW."case_name_short") OR OLD."citation_count" IS DISTINCT FROM (NEW."citation_count") OR OLD."correction" IS DISTINCT FROM (NEW."correction") OR OLD."cross_reference" IS DISTINCT FROM (NEW."cross_reference") OR OLD."date_blocked" IS DISTINCT FROM (NEW."date_blocked") OR OLD."date_filed" IS DISTINCT FROM (NEW."date_filed") OR OLD."date_filed_is_approximate" IS DISTINCT FROM (NEW."date_filed_is_approximate") OR OLD."disposition" IS DISTINCT FROM (NEW."disposition") OR OLD."docket_id" IS DISTINCT FROM (NEW."docket_id") OR OLD."filepath_json_harvard" IS DISTINCT FROM (NEW."filepath_json_harvard") OR OLD."filepath_pdf_harvard" IS DISTINCT FROM (NEW."filepath_pdf_harvard") OR OLD."harvard_id" IS DISTINCT FROM (NEW."harvard_id") OR OLD."headmatter" IS DISTINCT FROM (NEW."headmatter") OR OLD."headnotes" IS DISTINCT FROM (NEW."headnotes") OR OLD."history" IS DISTINCT FROM (NEW."history") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."judges" IS DISTINCT FROM (NEW."judges") OR OLD."nature_of_suit" IS DISTINCT FROM (NEW."nature_of_suit") OR OLD."other_dates" IS DISTINCT FROM (NEW."other_dates") OR OLD."posture" IS DISTINCT FROM (NEW."posture") OR OLD."precedential_status" IS DISTINCT FROM (NEW."precedential_status") OR OLD."procedural_history" IS DISTINCT FROM (NEW."procedural_history") OR OLD."scdb_decision_direction" IS DISTINCT FROM (NEW."scdb_decision_direction") OR OLD."scdb_id" IS DISTINCT FROM (NEW."scdb_id") OR OLD."scdb_votes_majority" IS DISTINCT FROM (NEW."scdb_votes_majority") OR OLD."scdb_votes_minority" IS DISTINCT FROM (NEW."scdb_votes_minority") OR OLD."slug" IS DISTINCT FROM (NEW."slug") OR OLD."source" IS DISTINCT FROM (NEW."source") OR OLD."summary" IS DISTINCT FROM (NEW."summary") OR OLD."syllabus" IS DISTINCT FROM (NEW."syllabus"))',
func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
hash="bc20a56b13c375017e704a6e50efd44e5c060018",
operation="UPDATE",
pgid="pgtrigger_update_update_c83f1",
table="search_opinioncluster",
when="AFTER",
),
),
),
pgtrigger.migrations.AddTrigger(
model_name="opinioncluster",
trigger=pgtrigger.compiler.Trigger(
name="delete_delete",
sql=pgtrigger.compiler.UpsertTriggerSql(
func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
hash="93725d0e8785d341973cd6af46aa9b3e9aca1ec2",
operation="DELETE",
pgid="pgtrigger_delete_delete_a8516",
table="search_opinioncluster",
when="AFTER",
),
),
),
]
@@ -0,0 +1,8 @@
BEGIN;
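-- Adding the column with a constant DEFAULT and then dropping the DEFAULT is
-- cheap on Postgres 11+: the add is a metadata-only change (no table rewrite),
-- and the drop leaves no default behind in the schema.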
ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
ALTER TABLE "search_opinionclusterevent" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
ALTER TABLE "search_opinionclusterevent" ALTER COLUMN "harvard_id" DROP DEFAULT;
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
COMMIT;
@@ -0,0 +1,6 @@
BEGIN;
ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
COMMIT;
5 changes: 5 additions & 0 deletions cl/search/models.py
@@ -2731,6 +2731,11 @@ class OpinionCluster(AbstractDateTimeModel):
        storage=IncrementingAWSMediaStorage(),
        blank=True,
    )
+    harvard_id = models.CharField(
+        help_text="The ID of the item in the Caselaw Access Project (Harvard)",
+        default=0,
+        db_index=True,
+    )
    arguments = models.TextField(
        help_text="The attorney(s) and legal arguments presented as HTML text. "
        "This is primarily seen in older opinions and can contain "
65 changes: 65 additions & 0 deletions cl/search/tests/test_import_harvard_pdfs.py
@@ -119,3 +119,68 @@ def test_import_harvard_pdfs(
        self.assertEqual(
            self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf"
        )
+
+    @patch("cl.search.management.commands.import_harvard_pdfs.tqdm")
+    @patch(
+        "cl.search.management.commands.import_harvard_pdfs.OpinionCluster.objects.get"
+    )
+    @patch(
+        "cl.search.management.commands.import_harvard_pdfs.HarvardPDFStorage"
+    )
+    @patch("cl.search.management.commands.import_harvard_pdfs.boto3.client")
+    @patch("cl.search.management.commands.import_harvard_pdfs.os.listdir")
+    @patch("cl.search.management.commands.import_harvard_pdfs.os.path.exists")
+    def test_assign_harvard_id(
+        self,
+        mock_exists,
+        mock_listdir,
+        mock_boto3_client,
+        mock_harvard_storage,
+        mock_opinion_cluster_get,
+        mock_tqdm,
+    ):
+        # Setup mocks
+        mock_listdir.return_value = ["test_crosswalk.json"]
+        mock_exists.side_effect = lambda path: path in [
+            "/mocked_path/crosswalk_dir"
+        ]
+
+        mock_s3 = MagicMock()
+        mock_boto3_client.return_value = mock_s3
+        mock_storage = MagicMock()
+        mock_harvard_storage.return_value = mock_storage
+        mock_opinion_cluster_get.return_value = self.cluster
+        mock_tqdm.side_effect = (
+            lambda x, *args, **kwargs: x
+        )  # Make tqdm a pass-through function
+
+        crosswalk_data = [
+            {
+                "cap_case_id": 1,
+                "cl_cluster_id": self.cluster.id,
+                "cap_path": "/test/path.json",
+            }
+        ]
+
+        # Mock file operations
+        m = mock_open(read_data=json.dumps(crosswalk_data))
+
+        # Mock crosswalk_dir
+        crosswalk_dir = "/mocked_path/crosswalk_dir"
+
+        # Verify crosswalk_dir exists
+        self.assertTrue(
+            os.path.exists(crosswalk_dir),
+            f"Crosswalk directory does not exist: {crosswalk_dir}",
+        )
Collaborator Author:
Everything above is copy-pasted from test_import_harvard_pdfs. I don’t love it, but is it “good enough”? Nudge me if it needs to be better. (One possible cleanup is sketched after this test.)


with patch("builtins.open", m):
call_command(
"import_harvard_pdfs",
crosswalk_dir=crosswalk_dir,
job="assign_cap_id",
)

# Verify that the cluster's harvard_id field was updated
self.cluster.refresh_from_db()
self.assertEqual(self.cluster.harvard_id, "1")
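If the nudge lands, one possible cleanup (not in this PR) is a mixin that hoists the shared patch plumbing into setUp. This sketch assumes Python 3.11+ for unittest's enterContext; on older versions, patcher.start() plus addCleanup does the same job.

from unittest.mock import patch

CMD = "cl.search.management.commands.import_harvard_pdfs"


class ImportHarvardPdfsMocksMixin:
    """Hypothetical mixin: shared mock setup for import_harvard_pdfs tests."""

    def setUp(self):
        super().setUp()
        # enterContext starts each patch and registers automatic cleanup.
        self.mock_exists = self.enterContext(patch(f"{CMD}.os.path.exists"))
        self.mock_listdir = self.enterContext(patch(f"{CMD}.os.listdir"))
        self.mock_boto3_client = self.enterContext(patch(f"{CMD}.boto3.client"))
        self.mock_storage = self.enterContext(patch(f"{CMD}.HarvardPDFStorage"))
        self.mock_cluster_get = self.enterContext(
            patch(f"{CMD}.OpinionCluster.objects.get")
        )
        self.mock_tqdm = self.enterContext(patch(f"{CMD}.tqdm"))
        self.mock_tqdm.side_effect = lambda x, *args, **kwargs: x  # pass-through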