Make scraper resilient to base url changes
michael-k authored and markpeek committed Nov 20, 2021
1 parent 7f3b2be commit 1087726
Showing 1 changed file with 4 additions and 7 deletions.
scrape/scrape.py: 11 changes (4 additions & 7 deletions)
@@ -156,24 +156,21 @@ async def collect_service_info() -> Iterable[Tuple[str, httpx.Response]]:
         for link in parsed_html.body.find_all("a"):
             href = link.attrs["href"]
             if href.startswith("./list_") and href.endswith(".html"):
-                service_links.append(href)
+                service_links.append(r.url.join(href))
 
         # This doesn't work at the moment,
         # see https://github.com/encode/httpx/issues/1171
         #
-        # return await asyncio.gather(
-        #     *[
-        #         client.get(urllib.parse.urljoin(BASE_URL, link))
-        #         for link in service_links
-        #     ]
+        # service_page_responses = await asyncio.gather(
+        #     *[client.get(link) for link in service_links]
         # )
         #
         # workaround
         service_page_responses = []
         for start in range(0, len(service_links), max_connections):
             service_page_responses += await asyncio.gather(
                 *[
-                    client.get(urllib.parse.urljoin(BASE_URL, link))
+                    client.get(link)
                     for link in service_links[start : start + max_connections]
                 ]
             )
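The core of the change is that relative hrefs are now resolved against the URL of the response they were scraped from (r.url.join(href)) instead of being joined onto a hard-coded BASE_URL, so the scraper keeps working if the documentation index moves. A minimal sketch of that resolution, using a hypothetical page URL and link filename rather than the scraper's real BASE_URL:

import httpx

# Hypothetical page URL standing in for r.url (the response the scraper just
# fetched); the real value is whatever BASE_URL currently resolves to.
page_url = httpx.URL(
    "https://docs.aws.amazon.com/service-authorization/latest/reference/"
    "reference_policies_actions-resources-contextkeys.html"
)

# A relative "./list_*.html" href resolves against the page it appears on, so
# the joined URL stays correct even if the documentation's base URL changes.
# (The filename below is made up for illustration.)
print(page_url.join("./list_awsaccountmanagement.html"))
# -> https://docs.aws.amazon.com/service-authorization/latest/reference/list_awsaccountmanagement.html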

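The retained workaround issues the requests in batches, so each asyncio.gather call awaits at most max_connections requests, sidestepping the httpx issue referenced in the comment. A self-contained sketch of that batching pattern, with placeholder links and a made-up max_connections value:

import asyncio
import httpx

# Placeholder data; in scrape.py the links come from the parsed index page and
# max_connections is defined elsewhere in the module.
service_links = [
    httpx.URL(f"https://example.com/list_{name}.html")
    for name in ("a", "b", "c", "d", "e")
]
max_connections = 2

async def fetch_in_batches() -> list:
    async with httpx.AsyncClient() as client:
        responses = []
        for start in range(0, len(service_links), max_connections):
            # Await each slice before starting the next one, so no more than
            # max_connections requests are in flight at a time.
            responses += await asyncio.gather(
                *[
                    client.get(link)
                    for link in service_links[start : start + max_connections]
                ]
            )
        return responses

# responses = asyncio.run(fetch_in_batches())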