freelawproject · cweider · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024 · cweider
diff --git a/cl/search/filters.py b/cl/search/filters.py
@@ -166,6 +166,7 @@ class Meta:
             "citation_count": INTEGER_LOOKUPS,
             "precedential_status": ["exact"],
             "date_blocked": DATE_LOOKUPS,
+            "harvard_id": ["exact"],
             "blocked": ["exact"],
         }
 

diff --git a/cl/search/management/commands/import_harvard_pdfs.py b/cl/search/management/commands/import_harvard_pdfs.py
@@ -51,6 +51,13 @@ def add_arguments(self, parser):
             help="Directory for reading crosswalk files",
             required=True,
         )
+        parser.add_argument(
+            "--job",
+            type=str,
+            choices=["import_pdf", "assign_cap_id"],
+            default="import_pdf",
+            help="Job to run: 'import_pdf' fetches CAP PDFs and stores them on clusters; 'assign_cap_id' sets each cluster's harvard_id from the crosswalk",
+        )
 
     def handle(self, *args: Any, **options: Any) -> None:
         """Handle the command execution.
@@ -65,6 +72,7 @@ def handle(self, *args: Any, **options: Any) -> None:
 
         self.dry_run = options["dry_run"]
         self.crosswalk_dir = options["crosswalk_dir"]
+        self.job = options["job"]
 
         if not os.path.exists(self.crosswalk_dir):
             logger.warning(
@@ -156,37 +164,23 @@ def process_crosswalk_file(self, crosswalk_file: str) -> None:
             crosswalk_data = json.load(f)
 
         for entry in tqdm(crosswalk_data, desc="Processing entries"):
-            logger.debug(f"Processing entry: {entry}")
             try:
                 cap_case_id = entry["cap_case_id"]
                 cl_cluster_id = entry["cl_cluster_id"]
-                json_path = entry["cap_path"]
-
-                # Construct the PDF path based on the JSON path
-                pdf_path = json_path.replace("cases", "case-pdfs").replace(
-                    ".json", ".pdf"
-                )
+                cap_path = entry["cap_path"]
 
-                if pdf_path in self.processed_pdfs:
-                    logger.info(f"Skipping already processed PDF: {pdf_path}")
+                if not all([cap_case_id, cl_cluster_id, cap_path]):
+                    logger.error(
+                        f"Missing key in entry: {json.dumps(entry, indent=2)}"
+                    )
                     continue
 
-                logger.info(f"Processing PDF: {pdf_path}")
-
-                if not self.dry_run:
-                    pdf_content = self.fetch_pdf_from_cap(pdf_path)
-                    if pdf_content:
-                        cluster = OpinionCluster.objects.get(id=cl_cluster_id)
-                        self.store_pdf_in_cl(cluster, pdf_content)
-                else:
-                    logger.info(f"Dry run: Would fetch PDF from {pdf_path}")
+                logger.debug(
+                    f"Processing entry: cap_case_id={cap_case_id}, cl_cluster_id={cl_cluster_id}, cap_path={cap_path}"
+                )
 
-                self.processed_pdfs.add(pdf_path)
+                self.process_entry(cap_case_id, cl_cluster_id, cap_path)
 
-            except KeyError as e:
-                logger.error(
-                    f"Missing key in entry: {e}. Entry: {json.dumps(entry, indent=2)}"
-                )
             except Exception as e:
                 logger.error(
                     f"Error processing CAP ID {entry.get('cap_case_id', 'Unknown')}: {str(e)}",
@@ -206,36 +200,61 @@ def parse_cap_path(self, cap_path: str) -> Tuple[str, str, str]:
         case_name = parts[-1].replace(".json", "")
         return reporter_slug, volume_folder, case_name
 
-    def process_entry(self, entry: Dict[str, Any]) -> None:
+    def process_entry(
+        self, cap_case_id: int, cl_cluster_id: int, cap_path: str
+    ) -> None:
         """Process a single crosswalk entry.
 
-        :param entry: Dictionary containing crosswalk entry data.
+        :param cap_case_id: CAP case id; saved to harvard_id by "assign_cap_id"
+        :param cl_cluster_id: ID of the CL OpinionCluster to update
+        :param cap_path: Path of CAP JSON data (in the CAP_R2 S3 bucket)
         :return: None
         """
-        cap_case_id = entry["cap_case_id"]
-        cl_cluster_id = entry["cl_cluster_id"]
-        cap_path = entry["cap_path"]
-        logger.info(
-            f"Processing entry: cap_case_id={cap_case_id}, cl_cluster_id={cl_cluster_id}, cap_path={cap_path}"
-        )
-        try:
-            cluster = OpinionCluster.objects.get(id=cl_cluster_id)
-            logger.info(f"Found cluster: {cluster}")
-            pdf_content = self.fetch_pdf_from_cap(cap_path)
-            logger.info(
-                f"Fetched PDF content, length: {len(pdf_content) if pdf_content else 0}"
-            )
-            if pdf_content:
-                logger.info(
-                    "PDF content is not empty, calling store_pdf_in_cl"
-                )
-                self.store_pdf_in_cl(cluster, pdf_content)
-            else:
-                logger.info("PDF content is empty, skipping storage")
-        except OpinionCluster.DoesNotExist:
-            logger.info(f"Cluster not found for id: {cl_cluster_id}")
-        except Exception as e:
-            logger.error(f"Error processing entry: {str(e)}", exc_info=True)
+
+        match self.job:
+            case "import_pdf":
+                try:
+                    pdf_path = cap_path.replace("cases", "case-pdfs").replace(
+                        ".json", ".pdf"
+                    )
+
+                    if pdf_path in self.processed_pdfs:
+                        logger.info(
+                            f"Skipping already processed PDF: {pdf_path}"
+                        )
+                        return
+
+                    logger.info(f"Processing PDF: {pdf_path}")
+
+                    if not self.dry_run:
+                        pdf_content = self.fetch_pdf_from_cap(pdf_path)
+                        if pdf_content:
+                            cluster = OpinionCluster.objects.get(
+                                id=cl_cluster_id
+                            )
+                            self.store_pdf_in_cl(cluster, pdf_content)
+                    else:
+                        logger.info(
+                            f"Dry run: Would fetch PDF from {pdf_path}"
+                        )
+
+                    self.processed_pdfs.add(pdf_path)
+
+                except OpinionCluster.DoesNotExist:
+                    logger.info(f"Cluster not found for id: {cl_cluster_id}")
+
+            case "assign_cap_id":
+                try:
+                    cluster = OpinionCluster.objects.get(id=cl_cluster_id)
+                    cluster.harvard_id = str(cap_case_id)
+                    if not self.dry_run:
+                        cluster.save()
+
+                except OpinionCluster.DoesNotExist:
+                    logger.info(f"Cluster not found for id: {cl_cluster_id}")
+
+            case _:
+                raise ValueError(f"Unknown job {self.job}")
 
     def fetch_pdf_from_cap(self, pdf_path: str) -> Optional[bytes]:
         """Fetch PDF content from CAP.

diff --git a/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.py b/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.py
@@ -0,0 +1,69 @@
+# Generated by Django 5.1.2 on 2024-10-25 18:54
+
+import pgtrigger.compiler
+import pgtrigger.migrations
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("search", "0036_add_searchquery"),
+    ]
+
+    operations = [
+        pgtrigger.migrations.RemoveTrigger(
+            model_name="opinioncluster",
+            name="update_update",
+        ),
+        pgtrigger.migrations.RemoveTrigger(
+            model_name="opinioncluster",
+            name="delete_delete",
+        ),
+        migrations.AddField(
+            model_name="opinioncluster",
+            name="harvard_id",
+            field=models.CharField(
+                db_index=True,
+                default=0,
+                help_text="The ID of the item in the Caselaw Access Project (Harvard)",
+            ),
+        ),
+        migrations.AddField(
+            model_name="opinionclusterevent",
+            name="harvard_id",
+            field=models.CharField(
+                default=0,
+                help_text="The ID of the item in the Caselaw Access Project (Harvard)",
+            ),
+        ),
+        pgtrigger.migrations.AddTrigger(
+            model_name="opinioncluster",
+            trigger=pgtrigger.compiler.Trigger(
+                name="update_update",
+                sql=pgtrigger.compiler.UpsertTriggerSql(
+                    condition='WHEN (OLD."arguments" IS DISTINCT FROM (NEW."arguments") OR OLD."attorneys" IS DISTINCT FROM (NEW."attorneys") OR OLD."blocked" IS DISTINCT FROM (NEW."blocked") OR OLD."case_name" IS DISTINCT FROM (NEW."case_name") OR OLD."case_name_full" IS DISTINCT FROM (NEW."case_name_full") OR OLD."case_name_short" IS DISTINCT FROM (NEW."case_name_short") OR OLD."citation_count" IS DISTINCT FROM (NEW."citation_count") OR OLD."correction" IS DISTINCT FROM (NEW."correction") OR OLD."cross_reference" IS DISTINCT FROM (NEW."cross_reference") OR OLD."date_blocked" IS DISTINCT FROM (NEW."date_blocked") OR OLD."date_filed" IS DISTINCT FROM (NEW."date_filed") OR OLD."date_filed_is_approximate" IS DISTINCT FROM (NEW."date_filed_is_approximate") OR OLD."disposition" IS DISTINCT FROM (NEW."disposition") OR OLD."docket_id" IS DISTINCT FROM (NEW."docket_id") OR OLD."filepath_json_harvard" IS DISTINCT FROM (NEW."filepath_json_harvard") OR OLD."filepath_pdf_harvard" IS DISTINCT FROM (NEW."filepath_pdf_harvard") OR OLD."harvard_id" IS DISTINCT FROM (NEW."harvard_id") OR OLD."headmatter" IS DISTINCT FROM (NEW."headmatter") OR OLD."headnotes" IS DISTINCT FROM (NEW."headnotes") OR OLD."history" IS DISTINCT FROM (NEW."history") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."judges" IS DISTINCT FROM (NEW."judges") OR OLD."nature_of_suit" IS DISTINCT FROM (NEW."nature_of_suit") OR OLD."other_dates" IS DISTINCT FROM (NEW."other_dates") OR OLD."posture" IS DISTINCT FROM (NEW."posture") OR OLD."precedential_status" IS DISTINCT FROM (NEW."precedential_status") OR OLD."procedural_history" IS DISTINCT FROM (NEW."procedural_history") OR OLD."scdb_decision_direction" IS DISTINCT FROM (NEW."scdb_decision_direction") OR OLD."scdb_id" IS DISTINCT FROM (NEW."scdb_id") OR OLD."scdb_votes_majority" IS DISTINCT FROM (NEW."scdb_votes_majority") OR OLD."scdb_votes_minority" IS DISTINCT FROM (NEW."scdb_votes_minority") OR OLD."slug" IS DISTINCT FROM (NEW."slug") OR OLD."source" IS DISTINCT FROM (NEW."source") OR OLD."summary" IS DISTINCT FROM (NEW."summary") OR OLD."syllabus" IS DISTINCT FROM (NEW."syllabus"))',
+                    func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
+                    hash="bc20a56b13c375017e704a6e50efd44e5c060018",
+                    operation="UPDATE",
+                    pgid="pgtrigger_update_update_c83f1",
+                    table="search_opinioncluster",
+                    when="AFTER",
+                ),
+            ),
+        ),
+        pgtrigger.migrations.AddTrigger(
+            model_name="opinioncluster",
+            trigger=pgtrigger.compiler.Trigger(
+                name="delete_delete",
+                sql=pgtrigger.compiler.UpsertTriggerSql(
+                    func='INSERT INTO "search_opinionclusterevent" ("arguments", "attorneys", "blocked", "case_name", "case_name_full", "case_name_short", "citation_count", "correction", "cross_reference", "date_blocked", "date_created", "date_filed", "date_filed_is_approximate", "date_modified", "disposition", "docket_id", "filepath_json_harvard", "filepath_pdf_harvard", "harvard_id", "headmatter", "headnotes", "history", "id", "judges", "nature_of_suit", "other_dates", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "posture", "precedential_status", "procedural_history", "scdb_decision_direction", "scdb_id", "scdb_votes_majority", "scdb_votes_minority", "slug", "source", "summary", "syllabus") VALUES (OLD."arguments", OLD."attorneys", OLD."blocked", OLD."case_name", OLD."case_name_full", OLD."case_name_short", OLD."citation_count", OLD."correction", OLD."cross_reference", OLD."date_blocked", OLD."date_created", OLD."date_filed", OLD."date_filed_is_approximate", OLD."date_modified", OLD."disposition", OLD."docket_id", OLD."filepath_json_harvard", OLD."filepath_pdf_harvard", OLD."harvard_id", OLD."headmatter", OLD."headnotes", OLD."history", OLD."id", OLD."judges", OLD."nature_of_suit", OLD."other_dates", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."posture", OLD."precedential_status", OLD."procedural_history", OLD."scdb_decision_direction", OLD."scdb_id", OLD."scdb_votes_majority", OLD."scdb_votes_minority", OLD."slug", OLD."source", OLD."summary", OLD."syllabus"); RETURN NULL;',
+                    hash="93725d0e8785d341973cd6af46aa9b3e9aca1ec2",
+                    operation="DELETE",
+                    pgid="pgtrigger_delete_delete_a8516",
+                    table="search_opinioncluster",
+                    when="AFTER",
+                ),
+            ),
+        ),
+    ]
diff --git a/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.sql b/cl/search/migrations/0037_add_harvard_id_to_opinioncluster.sql
@@ -0,0 +1,8 @@
+BEGIN;
+ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
+ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
+ALTER TABLE "search_opinionclusterevent" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
+ALTER TABLE "search_opinionclusterevent" ALTER COLUMN "harvard_id" DROP DEFAULT;
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
+COMMIT;
diff --git a/cl/search/migrations/0037_add_harvard_id_to_opinioncluster_customers.sql b/cl/search/migrations/0037_add_harvard_id_to_opinioncluster_customers.sql
@@ -0,0 +1,6 @@
+BEGIN;
+ALTER TABLE "search_opinioncluster" ADD COLUMN "harvard_id" varchar DEFAULT '0' NOT NULL;
+ALTER TABLE "search_opinioncluster" ALTER COLUMN "harvard_id" DROP DEFAULT;
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52" ON "search_opinioncluster" ("harvard_id");
+CREATE INDEX "search_opinioncluster_harvard_id_b7c3eb52_like" ON "search_opinioncluster" ("harvard_id" varchar_pattern_ops);
+COMMIT;
diff --git a/cl/search/models.py b/cl/search/models.py
@@ -2731,6 +2731,11 @@ class OpinionCluster(AbstractDateTimeModel):
         storage=IncrementingAWSMediaStorage(),
         blank=True,
     )
+    harvard_id = models.CharField(
+        help_text="The ID of the item in the Caselaw Access Project (Harvard)",
+        default=0,
+        db_index=True,
+    )
     arguments = models.TextField(
         help_text="The attorney(s) and legal arguments presented as HTML text. "
         "This is primarily seen in older opinions and can contain "

diff --git a/cl/search/tests/test_import_harvard_pdfs.py b/cl/search/tests/test_import_harvard_pdfs.py
@@ -119,3 +119,68 @@ def test_import_harvard_pdfs(
         self.assertEqual(
             self.cluster.filepath_pdf_harvard, "mocked_saved_path.pdf"
         )
+
+    @patch("cl.search.management.commands.import_harvard_pdfs.tqdm")
+    @patch(
+        "cl.search.management.commands.import_harvard_pdfs.OpinionCluster.objects.get"
+    )
+    @patch(
+        "cl.search.management.commands.import_harvard_pdfs.HarvardPDFStorage"
+    )
+    @patch("cl.search.management.commands.import_harvard_pdfs.boto3.client")
+    @patch("cl.search.management.commands.import_harvard_pdfs.os.listdir")
+    @patch("cl.search.management.commands.import_harvard_pdfs.os.path.exists")
+    def test_assign_harvard_id(
+        self,
+        mock_exists,
+        mock_listdir,
+        mock_boto3_client,
+        mock_harvard_storage,
+        mock_opinion_cluster_get,
+        mock_tqdm,
+    ):
+        # Set up mocks: the crosswalk directory "exists" and holds one file
+        mock_listdir.return_value = ["test_crosswalk.json"]
+        mock_exists.side_effect = lambda path: path in [
+            "/mocked_path/crosswalk_dir"
+        ]
+
+        mock_s3 = MagicMock()
+        mock_boto3_client.return_value = mock_s3
+        mock_storage = MagicMock()
+        mock_harvard_storage.return_value = mock_storage
+        mock_opinion_cluster_get.return_value = self.cluster
+        mock_tqdm.side_effect = (
+            lambda x, *args, **kwargs: x
+        )  # Make tqdm a pass-through function
+
+        crosswalk_data = [
+            {
+                "cap_case_id": 1,
+                "cl_cluster_id": self.cluster.id,
+                "cap_path": "/test/path.json",
+            }
+        ]
+
+        # Every patched open() call yields the crosswalk JSON built above
+        m = mock_open(read_data=json.dumps(crosswalk_data))
+
+        # Must match the path that mock_exists reports as present
+        crosswalk_dir = "/mocked_path/crosswalk_dir"
+
+        # Sanity-check the mock: os.path.exists is patched, not the real one
+        self.assertTrue(
+            os.path.exists(crosswalk_dir),
+            f"Crosswalk directory does not exist: {crosswalk_dir}",
+        )
+
+        with patch("builtins.open", m):
+            call_command(
+                "import_harvard_pdfs",
+                crosswalk_dir=crosswalk_dir,
+                job="assign_cap_id",
+            )
+
+        # The integer cap_case_id (1) is read back as the string "1"
+        self.cluster.refresh_from_db()
+        self.assertEqual(self.cluster.harvard_id, "1")