-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #19 from datamade/feature/16-update-related
Update related tables when cases are rescraped
- Loading branch information
Showing
10 changed files
with
205 additions
and
113 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,6 +79,7 @@ instance/ | |
|
||
# Scrapy stuff: | ||
.scrapy | ||
error | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# Makefile for re-scraping cases and updating the case database | ||
|
||
.INTERMEDIATE: rescraped_cases.db | ||
|
||
.PHONY : update_db | ||
update_db: rescraped_cases.db cases.db | ||
sqlite3 $< < scripts/update_rescraped_cases.sql | ||
|
||
rescraped_cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv | ||
csvs-to-sqlite $^ $@ | ||
cat scripts/foreign_key.sql | sqlite3 $@ | ||
sqlite-utils add-column $@ court_case subdivision text | ||
sqlite3 $@ < scripts/subdivision.sql | ||
sqlite-utils transform $@ court_case \ | ||
--drop _key \ | ||
--pk case_number \ | ||
--column-order case_number \ | ||
--column-order filing_date \ | ||
--column-order division \ | ||
--column-order subdivision \ | ||
--column-order case_type \ | ||
--column-order calendar \ | ||
--column-order ad_damnum | ||
sqlite-utils add-foreign-keys $@ \ | ||
attorney case_number court_case case_number \ | ||
defendant case_number court_case case_number \ | ||
plaintiff case_number court_case case_number \ | ||
event case_number court_case case_number | ||
sqlite-utils transform $@ defendant \ | ||
--rename _key order \ | ||
--column-order case_number \ | ||
--column-order order \ | ||
--column-order defendant | ||
sqlite-utils transform $@ attorney \ | ||
--rename _key order \ | ||
--column-order case_number \ | ||
--column-order order \ | ||
--column-order attorney | ||
sqlite-utils transform $@ event \ | ||
--rename _key order \ | ||
--column-order case_number \ | ||
--column-order order \ | ||
--column-order date \ | ||
--column-order description \ | ||
--column-order comments | ||
sqlite-utils transform $@ plaintiff \ | ||
--rename _key order \ | ||
--column-order case_number \ | ||
--column-order order \ | ||
--column-order plaintiff | ||
sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)' | ||
sqlite-utils convert $@ event date 'r.parsedate(value)' | ||
|
||
%.csv: court_case_raw.%.csv | ||
cat $< | \ | ||
sed '1s/court_case_raw\._key/case_number/g' | \ | ||
sed -r '1s/[a-z0-9_]+\.//g' > $@ | ||
|
||
court_case.csv : court_case_raw.csv | ||
cat $< | sed -r '1s/[a-z0-9_]+\.//g' > $@ | ||
|
||
court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : rescraped_cases.json | ||
perl json-to-multicsv.pl --file $< \ | ||
--path /:table:court_case_raw \ | ||
--path /*/events/:table:event \ | ||
--path /*/plaintiffs/:table:plaintiff \ | ||
--path /*/defendants/:table:defendant \ | ||
--path /*/attorneys/:table:attorney | ||
|
||
rescraped_cases.json: rescraped_chancery_cases.jl rescraped_civil_cases.jl | ||
cat $^ | jq --slurp '.' > $@ | ||
|
||
rescraped_civil_cases.jl : to_rescrape.civil.csv | ||
scrapy crawl civil -a case_numbers_file=$< -O $@ | ||
|
||
rescraped_chancery_cases.jl : to_rescrape.chancery.csv | ||
scrapy crawl chancery -a case_numbers_file=$< -O $@ | ||
|
||
TO_SCRAPE_QUERY=$(shell tail -n +6 scripts/to_scrape.sql) | ||
|
||
to_rescrape.civil.csv : cases.db | ||
sqlite-utils query --csv --no-headers $< "$(TO_SCRAPE_QUERY)" -p court civil > $@ | ||
|
||
to_rescrape.chancery.csv : cases.db | ||
sqlite-utils query --csv --no-headers $< "$(TO_SCRAPE_QUERY)" -p court chancery > $@ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
-- Generates a priority queue of cases to re-scrape | ||
-- Inspired by Cho and Molina, Estimating Frequency of Change | ||
-- https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi= | ||
-- 60c8e42055dfb80072a547c73fbc18dfbacc20aa | ||
|
||
WITH | ||
overall_rate AS ( | ||
SELECT | ||
sum( | ||
1 / (julianday(scraped_at) - julianday(updated_at)) | ||
) / count(*) FILTER ( | ||
WHERE julianday(scraped_at) > julianday(updated_at) | ||
) AS rate, | ||
3 AS prior_weight | ||
FROM court_case | ||
) | ||
|
||
SELECT court_case.case_number | ||
FROM court_case | ||
INNER JOIN overall_rate ON 1 = 1 | ||
WHERE court_case.court = :court -- noqa | ||
ORDER BY | ||
( | ||
(overall_rate.prior_weight + 1) | ||
/ ( | ||
overall_rate.prior_weight / overall_rate.rate | ||
+ julianday(court_case.scraped_at) | ||
- julianday(court_case.updated_at) | ||
) | ||
) | ||
* (julianday('now') - julianday(court_case.scraped_at)) DESC | ||
LIMIT 3000; |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
ATTACH DATABASE 'cases.db' AS cases; -- noqa | ||
|
||
-- Temporary table to hold updated cases | ||
-- i.e. cases whose hashes have changed | ||
CREATE TEMPORARY TABLE updated_case(num text); | ||
INSERT INTO | ||
updated_case | ||
SELECT | ||
a.case_number | ||
FROM | ||
court_case as a | ||
LEFT JOIN cases.court_case as b ON a.case_number = b.case_number | ||
WHERE | ||
a.hash != b.hash; | ||
|
||
UPDATE cases.court_case | ||
SET | ||
calendar = r.calendar, | ||
filing_date = r.filing_date, | ||
division = r.division, | ||
case_type = r.case_type, | ||
ad_damnum = r.ad_damnum, | ||
court = r.court, | ||
hash = r.hash, | ||
scraped_at = CURRENT_TIMESTAMP, | ||
updated_at = CURRENT_TIMESTAMP | ||
FROM court_case as r | ||
WHERE | ||
court_case.case_number IN (SELECT * FROM updated_case); | ||
|
||
-- Update related attorneys | ||
DELETE FROM cases.attorney | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
INSERT INTO cases.attorney | ||
SELECT * FROM attorney | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
-- Update related defendants | ||
DELETE FROM cases.defendant | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
INSERT INTO cases.defendant | ||
SELECT * FROM defendant | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
-- Update related events | ||
DELETE FROM cases.event | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
INSERT INTO cases.event | ||
SELECT * FROM event | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
-- Update related plaintiffs | ||
DELETE FROM cases.plaintiff | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
INSERT INTO cases.plaintiff | ||
SELECT * FROM plaintiff | ||
WHERE | ||
case_number IN (SELECT * FROM updated_case); | ||
|
||
-- For cases that haven't changed, just update their scraped_at field | ||
UPDATE cases.court_case | ||
SET scraped_at = CURRENT_TIMESTAMP | ||
WHERE | ||
case_number NOT IN (SELECT * FROM updated_case); |