Merge pull request #19 from datamade/feature/16-update-related
Update related tables when cases are rescraped
antidipyramid authored Dec 8, 2023
2 parents 0e2c1d4 + 0b9da74 commit edfdef7
Showing 10 changed files with 205 additions and 113 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -79,6 +79,7 @@ instance/

# Scrapy stuff:
.scrapy
error

# Sphinx documentation
docs/_build/
20 changes: 0 additions & 20 deletions Makefile
@@ -60,26 +60,6 @@ cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)'
sqlite-utils convert $@ event date 'r.parsedate(value)'

.PHONY : update_civil_db
update_civil_db : rescraped_civil_cases.csv
tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail

.PHONY : update_chancery_db
update_chancery_db : rescraped_chancery_cases.csv
tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail

rescraped_civil_cases.csv : to_rescrape.civil.csv
scrapy crawl civil -a update=True -a case_numbers_file=$< -O $@

rescraped_chancery_cases.csv : to_rescrape.chancery.csv
scrapy crawl chancery -a update=True -a case_numbers_file=$< -O $@

to_rescrape.civil.csv : cases.db
sqlite3 cases.db < scripts/to_scrape.civil.sql > $@

to_rescrape.chancery.csv : cases.db
sqlite3 cases.db < scripts/to_scrape.chancery.sql > $@

%.csv: court_case_raw.%.csv
cat $< | \
sed '1s/court_case_raw\._key/case_number/g' | \
85 changes: 85 additions & 0 deletions Makefile.update
@@ -0,0 +1,85 @@
# Makefile for re-scraping cases and updating the case database

.INTERMEDIATE: rescraped_cases.db

.PHONY : update_db
update_db: rescraped_cases.db cases.db
sqlite3 $< < scripts/update_rescraped_cases.sql

rescraped_cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
csvs-to-sqlite $^ $@
cat scripts/foreign_key.sql | sqlite3 $@
sqlite-utils add-column $@ court_case subdivision text
sqlite3 $@ < scripts/subdivision.sql
sqlite-utils transform $@ court_case \
--drop _key \
--pk case_number \
--column-order case_number \
--column-order filing_date \
--column-order division \
--column-order subdivision \
--column-order case_type \
--column-order calendar \
--column-order ad_damnum
sqlite-utils add-foreign-keys $@ \
attorney case_number court_case case_number \
defendant case_number court_case case_number \
plaintiff case_number court_case case_number \
event case_number court_case case_number
sqlite-utils transform $@ defendant \
--rename _key order \
--column-order case_number \
--column-order order \
--column-order defendant
sqlite-utils transform $@ attorney \
--rename _key order \
--column-order case_number \
--column-order order \
--column-order attorney
sqlite-utils transform $@ event \
--rename _key order \
--column-order case_number \
--column-order order \
--column-order date \
--column-order description \
--column-order comments
sqlite-utils transform $@ plaintiff \
--rename _key order \
--column-order case_number \
--column-order order \
--column-order plaintiff
sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)'
sqlite-utils convert $@ event date 'r.parsedate(value)'

%.csv: court_case_raw.%.csv
cat $< | \
sed '1s/court_case_raw\._key/case_number/g' | \
sed -r '1s/[a-z0-9_]+\.//g' > $@

court_case.csv : court_case_raw.csv
cat $< | sed -r '1s/[a-z0-9_]+\.//g' > $@

court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : rescraped_cases.json
perl json-to-multicsv.pl --file $< \
--path /:table:court_case_raw \
--path /*/events/:table:event \
--path /*/plaintiffs/:table:plaintiff \
--path /*/defendants/:table:defendant \
--path /*/attorneys/:table:attorney

rescraped_cases.json: rescraped_chancery_cases.jl rescraped_civil_cases.jl
cat $^ | jq --slurp '.' > $@

rescraped_civil_cases.jl : to_rescrape.civil.csv
scrapy crawl civil -a case_numbers_file=$< -O $@

rescraped_chancery_cases.jl : to_rescrape.chancery.csv
scrapy crawl chancery -a case_numbers_file=$< -O $@

TO_SCRAPE_QUERY=$(shell tail -n +6 scripts/to_scrape.sql)

to_rescrape.civil.csv : cases.db
sqlite-utils query --csv --no-headers $< "$(TO_SCRAPE_QUERY)" -p court civil > $@

to_rescrape.chancery.csv : cases.db
sqlite-utils query --csv --no-headers $< "$(TO_SCRAPE_QUERY)" -p court chancery > $@
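
The two sed expressions in the %.csv pattern rule above only touch the CSV header row: the first renames json-to-multicsv's court_case_raw._key column to case_number, and the second strips the table-name prefixes from the remaining column names. A minimal Python equivalent of that header rewrite, as an illustrative sketch only (it reads one of the court_case_raw.*.csv files on stdin and writes the cleaned CSV to stdout):

import re
import sys

# Rewrite only the header line, mirroring the two sed expressions:
# rename the key column, then drop any "tablename." prefixes.
header = sys.stdin.readline()
header = header.replace("court_case_raw._key", "case_number")
header = re.sub(r"[a-z0-9_]+\.", "", header)
sys.stdout.write(header)

# Data rows pass through unchanged.
for line in sys.stdin:
    sys.stdout.write(line)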
17 changes: 12 additions & 5 deletions courtscraper/spiders/base.py
@@ -48,22 +48,29 @@ def get_case_numbers(self):
def case_numbers_from_file(self, filename):
with open(filename) as f:
for case_number in f:
yield case_number
yield case_number.strip()

def parse(self, response):
now = datetime.now(tz=timezone.utc).isoformat()

case_info = self.get_case_info(response)
case_info.update(
{
"events": self.get_activities(response),
"court": self.name,
"updated_at": None if self.update else now,
"scraped_at": now,
}
)
case_info["hash"] = dict_hash(case_info)

# When scraping for a case for the first time, we
# need to set up the updated_at and scraped_at fields
if not self.update:
now = datetime.now(tz=timezone.utc).isoformat()
case_info.update(
{
"updated_at": now,
"scraped_at": now,
}
)

self._success(response)

return case_info
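
The substance of this change is that the record hash is now computed before any timestamp fields are added, so an unchanged case hashes to the same value on every re-scrape, and the spider only stamps updated_at and scraped_at on a first scrape (when self.update is false); on update runs those columns are left for scripts/update_rescraped_cases.sql to set. The dict_hash helper itself is not part of this diff; a deterministic fingerprint of this general shape could look roughly like the sketch below (illustrative only, not the project's actual implementation):

import hashlib
import json

def dict_hash(record: dict) -> str:
    # Serialize with sorted keys so equal dicts always hash to the same
    # value, then digest the canonical form. The project's real helper
    # may differ in details.
    canonical = json.dumps(record, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()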
3 changes: 0 additions & 3 deletions scripts/foreign_key.sql
@@ -1,7 +1,4 @@
UPDATE plaintiff SET case_number = court_case.case_number



FROM court_case
WHERE plaintiff.case_number = court_case._key;

23 changes: 0 additions & 23 deletions scripts/to_scrape.chancery.sql

This file was deleted.

23 changes: 0 additions & 23 deletions scripts/to_scrape.civil.sql

This file was deleted.

32 changes: 32 additions & 0 deletions scripts/to_scrape.sql
@@ -0,0 +1,32 @@
-- Generates a priority queue of cases to re-scrape
-- Inspired by Cho and Molina, Estimating Frequency of Change
-- https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=
-- 60c8e42055dfb80072a547c73fbc18dfbacc20aa

WITH
overall_rate AS (
SELECT
sum(
1 / (julianday(scraped_at) - julianday(updated_at))
) / count(*) FILTER (
WHERE julianday(scraped_at) > julianday(updated_at)
) AS rate,
3 AS prior_weight
FROM court_case
)

SELECT court_case.case_number
FROM court_case
INNER JOIN overall_rate ON 1 = 1
WHERE court_case.court = :court -- noqa
ORDER BY
(
(overall_rate.prior_weight + 1)
/ (
overall_rate.prior_weight / overall_rate.rate
+ julianday(court_case.scraped_at)
- julianday(court_case.updated_at)
)
)
* (julianday('now') - julianday(court_case.scraped_at)) DESC
LIMIT 3000;
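
The ORDER BY clause spells out a Cho-and-Molina-style estimator: the overall_rate CTE averages 1 / (days between a case's last recorded change, updated_at, and its most recent scrape, scraped_at) over the cases where those timestamps differ, and each case's own interval is shrunk toward that average with a prior weight of 3 before being multiplied by the days elapsed since its last scrape. A rough Python transcription of the per-case score, for illustration only (overall_rate stands in for the CTE value, scraped_at and updated_at for the court_case columns):

from datetime import datetime, timedelta

PRIOR_WEIGHT = 3  # matches prior_weight in the overall_rate CTE

def priority(scraped_at: datetime, updated_at: datetime,
             overall_rate: float, now: datetime) -> float:
    # Days between the case's last recorded change and its last scrape.
    days_observed = (scraped_at - updated_at) / timedelta(days=1)
    # Per-case change rate, shrunk toward the overall rate by the prior.
    estimated_rate = (PRIOR_WEIGHT + 1) / (PRIOR_WEIGHT / overall_rate + days_observed)
    # Expected changes accumulated since the last scrape; the query orders
    # by this score descending and keeps the top 3000 cases.
    return estimated_rate * ((now - scraped_at) / timedelta(days=1))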
39 changes: 0 additions & 39 deletions scripts/update.sql

This file was deleted.

75 changes: 75 additions & 0 deletions scripts/update_rescraped_cases.sql
@@ -0,0 +1,75 @@
ATTACH DATABASE 'cases.db' AS cases; -- noqa

-- Temporary table to hold updated cases
-- i.e. cases whose hashes have changed
CREATE TEMPORARY TABLE updated_case(num text);
INSERT INTO
updated_case
SELECT
a.case_number
FROM
court_case as a
LEFT JOIN cases.court_case as b ON a.case_number = b.case_number
WHERE
a.hash != b.hash;

UPDATE cases.court_case
SET
calendar = r.calendar,
filing_date = r.filing_date,
division = r.division,
case_type = r.case_type,
ad_damnum = r.ad_damnum,
court = r.court,
hash = r.hash,
scraped_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
FROM court_case as r
WHERE
court_case.case_number IN (SELECT * FROM updated_case);

-- Update related attorneys
DELETE FROM cases.attorney
WHERE
case_number IN (SELECT * FROM updated_case);

INSERT INTO cases.attorney
SELECT * FROM attorney
WHERE
case_number IN (SELECT * FROM updated_case);

-- Update related defendants
DELETE FROM cases.defendant
WHERE
case_number IN (SELECT * FROM updated_case);

INSERT INTO cases.defendant
SELECT * FROM defendant
WHERE
case_number IN (SELECT * FROM updated_case);

-- Update related events
DELETE FROM cases.event
WHERE
case_number IN (SELECT * FROM updated_case);

INSERT INTO cases.event
SELECT * FROM event
WHERE
case_number IN (SELECT * FROM updated_case);

-- Update related plaintiffs
DELETE FROM cases.plaintiff
WHERE
case_number IN (SELECT * FROM updated_case);

INSERT INTO cases.plaintiff
SELECT * FROM plaintiff
WHERE
case_number IN (SELECT * FROM updated_case);

-- For cases that haven't changed, just update their scraped_at field
UPDATE cases.court_case
SET scraped_at = CURRENT_TIMESTAMP
WHERE
case_number NOT IN (SELECT * FROM updated_case);
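
For context, Makefile.update drives this script with sqlite3 rescraped_cases.db < scripts/update_rescraped_cases.sql (the update_db target): it connects to the freshly re-scraped database and lets the ATTACH statement above write the changes into cases.db. An equivalent invocation from Python, as an illustrative sketch only:

import sqlite3

# Run the update script against the re-scraped database; the script itself
# ATTACHes cases.db and copies the changed rows across.
with open("scripts/update_rescraped_cases.sql") as f:
    script = f.read()

conn = sqlite3.connect("rescraped_cases.db")
try:
    conn.executescript(script)
    conn.commit()
finally:
    conn.close()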
