Make scraper resilient to base url changes
michael-k authored and markpeek committed Nov 20, 2021
1 parent 7f3b2be commit 1087726
Showing 1 changed file with 4 additions and 7 deletions.
scrape/scrape.py: 11 changes (4 additions & 7 deletions)
@@ -156,24 +156,21 @@ async def collect_service_info() -> Iterable[Tuple[str, httpx.Response]]:
         for link in parsed_html.body.find_all("a"):
             href = link.attrs["href"]
             if href.startswith("./list_") and href.endswith(".html"):
-                service_links.append(href)
+                service_links.append(r.url.join(href))
 
         # This doesn't work at the moment,
         # see https://github.com/encode/httpx/issues/1171
         #
-        # return await asyncio.gather(
-        #     *[
-        #         client.get(urllib.parse.urljoin(BASE_URL, link))
-        #         for link in service_links
-        #     ]
+        # service_page_responses = await asyncio.gather(
+        #     *[client.get(link) for link in service_links]
         # )
         #
         # workaround
         service_page_responses = []
         for start in range(0, len(service_links), max_connections):
             service_page_responses += await asyncio.gather(
                 *[
-                    client.get(urllib.parse.urljoin(BASE_URL, link))
+                    client.get(link)
                     for link in service_links[start : start + max_connections]
                 ]
             )
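The core of the change is that relative hrefs are now resolved against the URL of the response they were scraped from (r.url.join(href)) instead of being joined onto a hard-coded BASE_URL, so the scraper keeps working if the documentation index moves. A minimal sketch of that resolution, using a hypothetical page URL and link filename rather than the scraper's real BASE_URL:

import httpx

# Hypothetical page URL standing in for r.url (the response the scraper just
# fetched); the real value is whatever BASE_URL currently resolves to.
page_url = httpx.URL(
    "https://docs.aws.amazon.com/service-authorization/latest/reference/"
    "reference_policies_actions-resources-contextkeys.html"
)

# A relative "./list_*.html" href resolves against the page it appears on, so
# the joined URL stays correct even if the documentation's base URL changes.
# (The filename below is made up for illustration.)
print(page_url.join("./list_awsaccountmanagement.html"))
# -> https://docs.aws.amazon.com/service-authorization/latest/reference/list_awsaccountmanagement.html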

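The retained workaround issues the requests in batches, so each asyncio.gather call awaits at most max_connections requests, sidestepping the httpx issue referenced in the comment. A self-contained sketch of that batching pattern, with placeholder links and a made-up max_connections value:

import asyncio
import httpx

# Placeholder data; in scrape.py the links come from the parsed index page and
# max_connections is defined elsewhere in the module.
service_links = [
    httpx.URL(f"https://example.com/list_{name}.html")
    for name in ("a", "b", "c", "d", "e")
]
max_connections = 2

async def fetch_in_batches() -> list:
    async with httpx.AsyncClient() as client:
        responses = []
        for start in range(0, len(service_links), max_connections):
            # Await each slice before starting the next one, so no more than
            # max_connections requests are in flight at a time.
            responses += await asyncio.gather(
                *[
                    client.get(link)
                    for link in service_links[start : start + max_connections]
                ]
            )
        return responses

# responses = asyncio.run(fetch_in_batches())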