Lobbyist employer scraper #31

Merged · 4 commits · Oct 7, 2024
16 changes: 14 additions & 2 deletions lobbyists.mk
@@ -60,12 +60,24 @@ data/intermediate/lobbyists.csv : data/raw/lobbyists.csv
 data/intermediate/clients.csv : data/raw/clients.csv
 	csvsql --query "SELECT ClientID, ClientVersionID, MAX(ClientName) AS ClientName FROM STDIN GROUP BY ClientID" < $< > $@
 
-data/intermediate/filings.csv : data/raw/lobbyists.csv
-	csvsql --query "SELECT DISTINCT MemberID, MemberVersionID FROM STDIN" < $< | \
+# Concatenate individual lobbyist and lobbyist employer filings
+data/intermediate/filings.csv : data/intermediate/employer_filings.csv data/intermediate/individual_filings.csv
+	tail -n +2 data/intermediate/individual_filings.csv > data/intermediate/individual_filings_rows.csv; \
+	cat data/intermediate/employer_filings.csv data/intermediate/individual_filings_rows.csv > $@
+
+data/intermediate/individual_filings.csv : data/raw/lobbyists.csv
+	csvsql --query "SELECT DISTINCT MemberID AS id, MemberVersionID AS version FROM STDIN" < $< | \
 	python -m scrapers.lobbyist.scrape_filings > $@
 
+data/intermediate/employer_filings.csv : data/raw/employers.csv
+	csvsql --query "SELECT DISTINCT LobbyMemberID AS id, LobbyMemberversionid AS version FROM STDIN" < $< | \
+	python -m scrapers.lobbyist.scrape_filings --employer > $@
+
 data/raw/lobbyists.csv : data/intermediate/clients.csv
 	python -m scrapers.lobbyist.scrape_lobbyists < $< > $@
 
+data/raw/employers.csv :
+	python -m scrapers.lobbyist.scrape_employers > $@
+
 data/raw/clients.csv :
 	python -m scrapers.lobbyist.scrape_clients > $@
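
In the concatenation step, tail -n +2 strips the header row from the individual filings CSV so the combined file keeps a single header (the one from employer_filings.csv). Both inputs are produced by scrape_filings.py with the same field list, so the columns line up.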
1 change: 1 addition & 0 deletions requirements.txt
@@ -7,3 +7,4 @@ awscli
 csvkit
 pyOpenSSL>=23.3.0
 pandas
+click
91 changes: 91 additions & 0 deletions scrapers/lobbyist/scrape_employers.py
@@ -0,0 +1,91 @@
import csv
import json
import logging
import scrapelib
import sys

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)


class LobbyistEmployerScraper(scrapelib.Scraper):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _employers(self):
        payload = {
            "FilerType": "CR",
            "ExpenditureType": None,
            "ElectionYear": "",
            "AmountType": "Over",
            "pageNumber": 1,
            "pageSize": 1000,
            "sortDir": "ASC",
            "sortedBy": "",
        }

        page_number = 1
        page_size = 1000
        result_count = 1000

        # Page through results until a page comes back with fewer
        # results than the page size, i.e., the last page
        while result_count == page_size:
            logger.debug(f"Fetching page {page_number}")

            _payload = {**payload, "pageNumber": page_number}
            logger.debug(_payload)

            response = self.post(
                "https://login.cfis.sos.state.nm.us/api///"
                "Search/LobbyistExpenditureSearchInformation",
                data=json.dumps(_payload),
                headers={"Content-Type": "application/json"},
                verify=False,
            )

            if response.ok:
                results = response.json()

                yield from results

                result_count = len(results)

                logger.debug(f"Last page {page_number} had {result_count} results")

                page_number += 1
            else:
                logger.error(
                    f"Failed to fetch results:\n{response.content.decode('utf-8')}"
                )
                sys.exit()

    def scrape(self):
        for result in self._employers():
            yield {
                "Name": result["Name"],
                "LobbyMemberID": result["LobbyMemberID"],
                "LobbyMemberversionid": result["LobbyMemberversionid"],
            }


def main():
    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=[
            "Name",
            "LobbyMemberID",
            "LobbyMemberversionid",
        ],
        extrasaction="ignore",
    )

    writer.writeheader()

    scraper = LobbyistEmployerScraper(
        requests_per_minute=60, retry_attempts=3, verify=False
    )
    for result in scraper.scrape():
        writer.writerow(result)


if __name__ == "__main__":
    main()
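
As wired up in lobbyists.mk, this module runs standalone and writes its CSV to stdout: python -m scrapers.lobbyist.scrape_employers > data/raw/employers.csv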
190 changes: 125 additions & 65 deletions scrapers/lobbyist/scrape_filings.py
@@ -1,79 +1,139 @@
+from abc import ABC, abstractmethod
+
 import csv
 import json
 import logging
 import scrapelib
 import sys
+import click
 
 from tqdm import tqdm
 
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
-reader = csv.DictReader(sys.stdin)
-writer = csv.DictWriter(
-    sys.stdout,
-    fieldnames=[
-        "ClientID",
-        "ReportName",
-        "ReportFileName",
-        "ReportTypeCode",
-        "ReportType",
-        "PeriodStart",
-        "PeriodEnd",
-        "DueDate",
-        "SubmittedDate",
-        "TotalRows",
-        "ReportID",
-        "ReportVersionID",
-        "Status",
-        "IncidentalLobbying",
-        "ReportStatus",
-        "IsAfterDueDate",
-        "CreatedBy",
-        "CreatedDate",
-        "CreatedIPAddress",
-        "LobbyistClientID",
-        "LobbyistClientVersionID",
-        "LastModifiedBy",
-        "LastModifiedDate",
-        "LastModifiedIPAddress",
-        "Message",
-        "IsSuccess",
-        "Action",
-        "IsDirty",
-        "PersonID",
-        "PersonVersionID",
-        "MemberID",
-    ],
-)
-
-writer.writeheader()
-
-s = scrapelib.Scraper(requests_per_minute=0, retry_attempts=3, verify=False)
-
-payload = {
-    "PageNo": 1,
-    "PageSize": 1000,
-    "SortDir": "ASC",
-    "SortedBy": "",
-    "ElectionYear": "",
-    "LobbyistVersionID": "",
-}
-
-for row in tqdm(reader):
-    _payload = payload.copy()
-    _payload["LobbyistID"] = row["MemberID"]
-    _payload["LobbyistVersionID"] = row["MemberVersionID"]
-
-    response = s.post(
-        "https://login.cfis.sos.state.nm.us/api//ExploreClients/Fillings",
-        data=json.dumps(_payload),
-        headers={"Content-Type": "application/json"},
-    )
-
-    if response.ok:
-        for record in response.json():
-            # Add the lobbyist ID and version to the filing record
-            record["MemberID"] = row["MemberID"]
-            writer.writerow(record)
+
+class LobbyistScraper(ABC, scrapelib.Scraper):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    @abstractmethod
+    def _get_payload(self, id, version):
+        pass
+
+    def scrape(self, url, id, version):
+        payload = self._get_payload(id, version)
+
+        response = self.post(
+            url,
+            data=json.dumps(payload),
+            headers={"Content-Type": "application/json"},
+            verify=False,
+        )
+
+        if response.ok:
+            for record in response.json():
+                # Add the lobbyist ID and version to the filing record
+                record["MemberID"] = id
+                return record
+
+
+class LobbyistEmployerScraper(LobbyistScraper):
+    url = "https://login.cfis.sos.state.nm.us/api//ExploreClients/Disclosures"
+
+    def _get_payload(self, id, version):
+        return {
+            "PageNumber": 1,
+            "PageSize": 1000,
+            "SortDir": "ASC",
+            "SortedBy": "",
+            "Year": "",
+            "ClientID": id,
+            "ClientVersionID": version,
+        }
+
+    def scrape(self, id, version):
Member:
Why is this call signature different from the abstract base class?

Contributor Author:
I couldn't figure out a clean way for the LobbyistScraper base class to reference the url attribute in a child class.

Turns out you can't stack @classmethod and @property to define a static class attribute as of 3.11.

If I'm overlooking something, please let me know.

Member (hancush, Sep 26, 2024):
How about stacking @property and @abstractmethod? https://docs.python.org/3/library/abc.html#abc.abstractproperty

Contributor Author:
Good idea!
+        return super().scrape(self.__class__.url, id, version)
+
+
+class IndividualLobbyistScraper(LobbyistScraper):
+    url = "https://login.cfis.sos.state.nm.us/api//ExploreClients/Fillings"
+
+    def _get_payload(self, id, version):
+        return {
+            "PageNo": 1,
+            "PageSize": 1000,
+            "SortDir": "ASC",
+            "SortedBy": "",
+            "ElectionYear": "",
+            "LobbyistID": id,
+            "LobbyistVersionID": version,
+        }
+
+    def scrape(self, id, version):
+        return super().scrape(self.__class__.url, id, version)
+
+
+@click.command()
+@click.option("--employer", "is_employer_scrape", is_flag=True)
Contributor Author:
Used a flag to hook into the existing filing scraper.
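
For context, lobbyists.mk invokes the same module both ways: python -m scrapers.lobbyist.scrape_filings for individual lobbyists and python -m scrapers.lobbyist.scrape_filings --employer for employers, each reading id/version pairs from stdin.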

+def main(is_employer_scrape):
+
+    scraper_opts = {"requests_per_minute": 60, "retry_attempts": 3, "verify": False}
+
+    if is_employer_scrape:
+        scraper = LobbyistEmployerScraper(**scraper_opts)
+    else:
+        scraper = IndividualLobbyistScraper(**scraper_opts)
+
+    reader = csv.DictReader(sys.stdin)
+
+    writer = csv.DictWriter(
+        sys.stdout,
+        fieldnames=[
+            "ClientID",
+            "ReportName",
+            "ReportFileName",
+            "ReportTypeCode",
+            "ReportType",
+            "PeriodStart",
+            "PeriodEnd",
+            "DueDate",
+            "SubmittedDate",
+            "TotalRows",
+            "ReportID",
+            "ReportVersionID",
+            "Status",
+            "IncidentalLobbying",
+            "ReportStatus",
+            "IsAfterDueDate",
+            "CreatedBy",
+            "CreatedDate",
+            "CreatedIPAddress",
+            "LobbyistClientID",
+            "LobbyistClientVersionID",
+            "LastModifiedBy",
+            "LastModifiedDate",
+            "LastModifiedIPAddress",
+            "Message",
+            "IsSuccess",
+            "Action",
+            "IsDirty",
+            "PersonID",
+            "PersonVersionID",
+            "MemberID",
+        ],
+    )
+
+    writer.writeheader()
+
+    for row in tqdm(reader):
+        logger.debug(row)
+        id, version = row["id"], str(int(float(row["version"])))
+        result = scraper.scrape(id, version)
+        if result:
+            writer.writerow(result)
+
+
+if __name__ == "__main__":
+    main()