-
-
Notifications
You must be signed in to change notification settings - Fork 151
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(harvard_merger): add `harvard_id` field to OpinionCluster
#4622
base: main
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,6 +51,13 @@ def add_arguments(self, parser): | |
help="Directory for reading crosswalk files", | ||
required=True, | ||
) | ||
parser.add_argument( | ||
"--job", | ||
type=str, | ||
choices=["import_pdf", "assign_cap_id"], | ||
default="import_pdf", | ||
help="", | ||
) | ||
|
||
def handle(self, *args: Any, **options: Any) -> None: | ||
"""Handle the command execution. | ||
|
@@ -65,6 +72,7 @@ def handle(self, *args: Any, **options: Any) -> None: | |
|
||
self.dry_run = options["dry_run"] | ||
self.crosswalk_dir = options["crosswalk_dir"] | ||
self.job = options["job"] | ||
|
||
if not os.path.exists(self.crosswalk_dir): | ||
logger.warning( | ||
|
@@ -156,37 +164,23 @@ def process_crosswalk_file(self, crosswalk_file: str) -> None: | |
crosswalk_data = json.load(f) | ||
|
||
for entry in tqdm(crosswalk_data, desc="Processing entries"): | ||
logger.debug(f"Processing entry: {entry}") | ||
try: | ||
cap_case_id = entry["cap_case_id"] | ||
cl_cluster_id = entry["cl_cluster_id"] | ||
json_path = entry["cap_path"] | ||
|
||
# Construct the PDF path based on the JSON path | ||
pdf_path = json_path.replace("cases", "case-pdfs").replace( | ||
".json", ".pdf" | ||
) | ||
cap_path = entry["cap_path"] | ||
|
||
if pdf_path in self.processed_pdfs: | ||
logger.info(f"Skipping already processed PDF: {pdf_path}") | ||
if not all([cap_case_id, cl_cluster_id, cap_path]): | ||
logger.error( | ||
f"Missing key in entry: {json.dumps(entry, indent=2)}" | ||
) | ||
continue | ||
|
||
logger.info(f"Processing PDF: {pdf_path}") | ||
|
||
if not self.dry_run: | ||
pdf_content = self.fetch_pdf_from_cap(pdf_path) | ||
if pdf_content: | ||
cluster = OpinionCluster.objects.get(id=cl_cluster_id) | ||
self.store_pdf_in_cl(cluster, pdf_content) | ||
else: | ||
logger.info(f"Dry run: Would fetch PDF from {pdf_path}") | ||
logger.debug( | ||
f"Processing entry: cap_case_id={cap_case_id}, cl_cluster_id={cl_cluster_id}, cap_path={cap_path}" | ||
) | ||
|
||
self.processed_pdfs.add(pdf_path) | ||
self.process_entry(cap_case_id, cl_cluster_id, cap_path) | ||
|
||
except KeyError as e: | ||
logger.error( | ||
f"Missing key in entry: {e}. Entry: {json.dumps(entry, indent=2)}" | ||
) | ||
except Exception as e: | ||
logger.error( | ||
f"Error processing CAP ID {entry.get('cap_case_id', 'Unknown')}: {str(e)}", | ||
|
@@ -206,36 +200,61 @@ def parse_cap_path(self, cap_path: str) -> Tuple[str, str, str]: | |
case_name = parts[-1].replace(".json", "") | ||
return reporter_slug, volume_folder, case_name | ||
|
||
def process_entry(self, entry: Dict[str, Any]) -> None: | ||
def process_entry( | ||
self, cap_case_id: int, cl_cluster_id: int, cap_path: str | ||
) -> None: | ||
"""Process a single crosswalk entry. | ||
|
||
:param entry: Dictionary containing crosswalk entry data. | ||
:param cap_case_id: CAP case id | ||
:param cl_cluster_id: CL cluster id | ||
:param cap_path: Path of CAP JSON data (in the CAP_R2 S3 bucket) | ||
:return: None | ||
""" | ||
cap_case_id = entry["cap_case_id"] | ||
cl_cluster_id = entry["cl_cluster_id"] | ||
cap_path = entry["cap_path"] | ||
logger.info( | ||
f"Processing entry: cap_case_id={cap_case_id}, cl_cluster_id={cl_cluster_id}, cap_path={cap_path}" | ||
) | ||
try: | ||
cluster = OpinionCluster.objects.get(id=cl_cluster_id) | ||
logger.info(f"Found cluster: {cluster}") | ||
pdf_content = self.fetch_pdf_from_cap(cap_path) | ||
logger.info( | ||
f"Fetched PDF content, length: {len(pdf_content) if pdf_content else 0}" | ||
) | ||
if pdf_content: | ||
logger.info( | ||
"PDF content is not empty, calling store_pdf_in_cl" | ||
) | ||
self.store_pdf_in_cl(cluster, pdf_content) | ||
else: | ||
logger.info("PDF content is empty, skipping storage") | ||
except OpinionCluster.DoesNotExist: | ||
logger.info(f"Cluster not found for id: {cl_cluster_id}") | ||
except Exception as e: | ||
logger.error(f"Error processing entry: {str(e)}", exc_info=True) | ||
|
||
match self.job: | ||
case "import_pdf": | ||
try: | ||
pdf_path = cap_path.replace("cases", "case-pdfs").replace( | ||
".json", ".pdf" | ||
) | ||
|
||
if pdf_path in self.processed_pdfs: | ||
logger.info( | ||
f"Skipping already processed PDF: {pdf_path}" | ||
) | ||
return | ||
|
||
logger.info(f"Processing PDF: {pdf_path}") | ||
|
||
if not self.dry_run: | ||
pdf_content = self.fetch_pdf_from_cap(pdf_path) | ||
if pdf_content: | ||
cluster = OpinionCluster.objects.get( | ||
id=cl_cluster_id | ||
) | ||
self.store_pdf_in_cl(cluster, pdf_content) | ||
else: | ||
logger.info( | ||
f"Dry run: Would fetch PDF from {pdf_path}" | ||
) | ||
|
||
self.processed_pdfs.add(pdf_path) | ||
|
||
except OpinionCluster.DoesNotExist: | ||
logger.info(f"Cluster not found for id: {cl_cluster_id}") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tempted to rely on the caller’s exception handler to do this logging ¯\_(ツ)_/¯ |
||
|
||
case "assign_cap_id": | ||
try: | ||
cluster = OpinionCluster.objects.get(id=cl_cluster_id) | ||
cluster.harvard_id = cap_case_id | ||
if not self.dry_run: | ||
cluster.save() | ||
|
||
except OpinionCluster.DoesNotExist: | ||
logger.info(f"Cluster not found for id: {cl_cluster_id}") | ||
|
||
case _: | ||
raise Exception(f"Unknown job {self.job}") | ||
|
||
def fetch_pdf_from_cap(self, pdf_path: str) -> Optional[bytes]: | ||
"""Fetch PDF content from CAP. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Generated by Django 5.1.2 on 2024-10-25 18:54 | ||
|
||
import pgtrigger.compiler | ||
import pgtrigger.migrations | ||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration): | ||
|
||
dependencies = [ | ||
("search", "0036_add_searchquery"), | ||
] | ||
|
||
operations = [ | ||
pgtrigger.migrations.RemoveTrigger( | ||
model_name="opinioncluster", | ||
name="update_update", | ||
), | ||
pgtrigger.migrations.RemoveTrigger( | ||
model_name="opinioncluster", | ||
name="delete_delete", | ||
), | ||
migrations.AddField( | ||
model_name="opinioncluster", | ||
name="harvard_id", | ||
field=models.CharField( | ||
db_index=True, | ||
default=0, | ||
help_text="The ID of the item in the Caselaw Access Project (Harvard)", | ||
), | ||
), | ||
migrations.AddField( | ||
model_name="opinionclusterevent", | ||
name="harvard_id", | ||
field=models.CharField( | ||
default=0, | ||
help_text="The ID of the item in the Caselaw Access Project (Harvard)", | ||
), | ||
), | ||
pgtrigger.migrations.AddTrigger( | ||
model_name="opinioncluster", | ||
trigger=pgtrigger.compiler.Trigger( | ||
name="update_update", | ||
sql=pgtrigger.compiler.UpsertTriggerSql( | ||
condition='WHEN (OLD."arguments" IS DISTINCT FROM (NEW."arguments") OR OLD."attorneys" IS DISTINCT FROM (NEW."attorneys") OR OLD."blocked" IS DISTINCT FROM (NEW."blocked") OR OLD."case_name" IS DISTINCT FROM (NEW."case_name") OR OLD."case_name_full" IS DISTINCT FROM (NEW."case_name_full") OR OLD."case_name_short" IS DISTINCT FROM (NEW."case_name_short") OR OLD."citation_count" IS DISTINCT FROM (NEW."citation_count") OR OLD."correction" IS DISTINCT FROM (NEW."correction") OR OLD."cross_reference" IS DISTINCT FROM (NEW."cross_reference") OR OLD."date_blocked" IS DISTINCT FROM (NEW."date_blocked") OR OLD."date_filed" IS DISTINCT FROM (NEW."date_filed") OR OLD."date_filed_is_approximate" IS DISTINCT FROM (NEW."date_filed_is_approximate") OR OLD."disposition" IS DISTINCT FROM (NEW."disposition") OR OLD."docket_id" IS DISTINCT FROM (NEW."docket_id") OR OLD."filepath_json_harvard" IS DISTINCT FROM (NEW."filepath_json_harvard") OR OLD."filepath_pdf_harvard" IS DISTINCT FROM (NEW."filepath_pdf_harvard") OR OLD."harvard_id" IS DISTINCT FROM (NEW."harvard_id") OR OLD."headmatter" IS DISTINCT FROM (NEW."headmatter") OR OLD."headnotes" IS DISTINCT FROM (NEW."headnotes") OR OLD."history" IS DISTINCT FROM (NEW."history") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."judges" IS DISTINCT FROM (NEW."judges") OR OLD."nature_of_suit" IS DISTINCT FROM (NEW."nature_of_suit") OR OLD."other_dates" IS DISTINCT FROM (NEW."other_dates") OR OLD."posture" IS DISTINCT FROM (NEW."posture") OR OLD."precedential_status" IS DISTINCT FROM (NEW."precedential_status") OR OLD."procedural_history" IS DISTINCT FROM (NEW."procedural_history") OR OLD."scdb_decision_direction" IS DISTINCT FROM (NEW."scdb_decision_direction") OR OLD."scdb_id" IS DISTINCT FROM (NEW."scdb_id") OR OLD."scdb_votes_majority" IS DISTINCT FROM (NEW."scdb_votes_majority") OR OLD."scdb_votes_minority" IS DISTINCT FROM (NEW."scdb_votes_minority") OR OLD."slug" IS DISTINCT FROM (NEW."slug") OR OLD."source" IS DISTINCT FROM 
(NEW."source") OR OLD."summary" IS DISTINCT FROM (NEW."summary") OR OLD."syllabus" IS DISTINCT FROM (NEW."syllabus"))', | ||
func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;', | ||
hash="bc20a56b13c375017e704a6e50efd44e5c060018", | ||
operation="UPDATE", | ||
pgid="pgtrigger_update_update_c83f1", | ||
table="search_opinioncluster", | ||
when="AFTER", | ||
), | ||
), | ||
), | ||
pgtrigger.migrations.AddTrigger( | ||
model_name="opinioncluster", | ||
trigger=pgtrigger.compiler.Trigger( | ||
name="delete_delete", | ||
sql=pgtrigger.compiler.UpsertTriggerSql( | ||
func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;', | ||
hash="93725d0e8785d341973cd6af46aa9b3e9aca1ec2", | ||
operation="DELETE", | ||
pgid="pgtrigger_delete_delete_a8516", | ||
table="search_opinioncluster", | ||
when="AFTER", | ||
), | ||
), | ||
), | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
BEGIN; | ||
ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL; | ||
ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT; | ||
cweider marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ALTER TABLE "search_opinionclusterevent" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL; | ||
ALTER TABLE "search_opinionclusterevent" ALTER COLUMN "harvard_id" DROP DEFAULT; | ||
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id"); | ||
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops); | ||
COMMIT; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
BEGIN; | ||
ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL; | ||
ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT; | ||
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id"); | ||
CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops); | ||
COMMIT; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -119,3 +119,68 @@ def test_import_harvard_pdfs( | |
self.assertEqual( | ||
self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf" | ||
) | ||
|
||
@patch("cl.search.management.commands.import_harvard_pdfs.tqdm") | ||
@patch( | ||
"cl.search.management.commands.import_harvard_pdfs.OpinionCluster.objects.get" | ||
) | ||
@patch( | ||
"cl.search.management.commands.import_harvard_pdfs.HarvardPDFStorage" | ||
) | ||
@patch("cl.search.management.commands.import_harvard_pdfs.boto3.client") | ||
@patch("cl.search.management.commands.import_harvard_pdfs.os.listdir") | ||
@patch("cl.search.management.commands.import_harvard_pdfs.os.path.exists") | ||
def test_assign_harvard_id( | ||
self, | ||
mock_exists, | ||
mock_listdir, | ||
mock_boto3_client, | ||
mock_harvard_storage, | ||
mock_opinion_cluster_get, | ||
mock_tqdm, | ||
): | ||
# Setup mocks | ||
mock_listdir.return_value = ["test_crosswalk.json"] | ||
mock_exists.side_effect = lambda path: path in [ | ||
"/mocked_path/crosswalk_dir" | ||
] | ||
|
||
mock_s3 = MagicMock() | ||
mock_boto3_client.return_value = mock_s3 | ||
mock_storage = MagicMock() | ||
mock_harvard_storage.return_value = mock_storage | ||
mock_opinion_cluster_get.return_value = self.cluster | ||
mock_tqdm.side_effect = ( | ||
lambda x, *args, **kwargs: x | ||
) # Make tqdm a pass-through function | ||
|
||
crosswalk_data = [ | ||
{ | ||
"cap_case_id": 1, | ||
"cl_cluster_id": self.cluster.id, | ||
"cap_path": "/test/path.json", | ||
} | ||
] | ||
|
||
# Mock file operations | ||
m = mock_open(read_data=json.dumps(crosswalk_data)) | ||
|
||
# Mock crosswalk_dir | ||
crosswalk_dir = "/mocked_path/crosswalk_dir" | ||
|
||
# Verify crosswalk_dir exists | ||
self.assertTrue( | ||
os.path.exists(crosswalk_dir), | ||
f"Crosswalk directory does not exist: {crosswalk_dir}", | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Everything above is copy-paste from `test_import_harvard_pdfs`. |
||
|
||
with patch("builtins.open", m): | ||
call_command( | ||
"import_harvard_pdfs", | ||
crosswalk_dir=crosswalk_dir, | ||
job="assign_cap_id", | ||
) | ||
|
||
# Verify that the cluster's harvard_id field was updated | ||
self.cluster.refresh_from_db() | ||
self.assertEqual(self.cluster.harvard_id, "1") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see other things running through the crosswalk file to do work (#4614). I’ve no sound opinion on what the “correct” approach is. `jobs` here is driven by my interest in not copy/pasting (and then owning) the moderate complexity surrounding crosswalk processing. @jtmst, I’m happy to take any direction that you can give.