diff --git a/CHANGES.md b/CHANGES.md
index 45925df90..eef4f0400 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -16,14 +16,19 @@ Releases are also tagged in git, if that's helpful.
 
 ## Current
 
-**2.6.40 - 2024-11-20**
+**2.6.42 - 2024-11-21**
 
 - Fixes:
-  - Fix `mass` and `massctapp` scrapers, scrape new endpoint
-  - Exclude "Commomwealth" string from short case names
+  - Fix `mass` and `massctapp` cleanup content method
 
 ## Past
 
+**2.6.40 - 2024-11-20**
+
+- Fixes:
+  - Fix `mass` and `massctapp` scrapers, scrape new endpoint
+  - Exclude "Commonwealth" string from short case names
+
 **2.6.39 - 2024-11-18**
 
 - Fixes:
diff --git a/juriscraper/opinions/united_states/state/colo.py b/juriscraper/opinions/united_states/state/colo.py
index ef58c6480..865daaceb 100644
--- a/juriscraper/opinions/united_states/state/colo.py
+++ b/juriscraper/opinions/united_states/state/colo.py
@@ -12,8 +12,8 @@
     - 2024-07-04: Update to new site, grossir
 """
 
-from datetime import date, datetime
-from typing import Tuple
+from datetime import date, datetime, timedelta
+from typing import Optional, Tuple
 from urllib.parse import urlencode
 
 from juriscraper.AbstractSite import logger
@@ -31,7 +31,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
         self.params = {
-            "product_id": "WW",
+            "product_id": "COLORADO",
             "jurisdiction": "US",
             "content_type": "2",
             "court": self.api_court_code,
@@ -40,14 +40,13 @@ def __init__(self, *args, **kwargs):
             "per_page": "30",  # Server breaks down when per_page=500, returns 503
             "page": "1",
             "sort": "date",
+            "type": "document",
             "include_local_exclusive": "true",
             "cbm": "6.0|361.0|5.0|9.0|4.0|2.0=0.01|400.0|1.0|0.001|1.5|0.2",
             "locale": "en",
             "hide_ct6": "true",
-            "t": str(datetime.now().timestamp())[:10],
-            "type": "document",
         }
-        self.url = f"{self.base_url}?{urlencode(self.params)}"
+        self.update_url()
 
         # Request won't work without some of these X- headers
         self.request["headers"].update(
@@ -123,6 +122,23 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
         :return None
         """
         logger.info("Backscraping for range %s %s", *dates)
+        self.update_url(dates)
+        self.html = self._download()
+        self._process_html()
+
+    def update_url(self, dates: Optional[Tuple[date]] = None) -> None:
+        """
+        Set URL with date filters and current timestamp.
+        Request with no date filter was returning very old documents
+        instead of the most recent ones
+
+        :param dates: start and end date tuple. If not present,
+        scrape last week
+        """
+        if not dates:
+            today = datetime.now()
+            dates = (today - timedelta(7), today + timedelta(1))
+
         start = dates[0].strftime("%Y-%m-%d")
         end = dates[1].strftime("%Y-%m-%d")
         timestamp = str(datetime.now().timestamp())[:10]
@@ -130,12 +146,7 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
         params.update(
             {
                 "date": f"{start}..{end}",
-                # These are duplicated by the frontend too
-                "locale": ["en", "en"],
-                "hide_ct6": ["true", "true"],
-                "t": [timestamp, timestamp],
+                "t": timestamp,
             }
         )
         self.url = f"{self.base_url}?{urlencode(params)}"
-        self.html = self._download()
-        self._process_html()
diff --git a/juriscraper/opinions/united_states/state/mass.py b/juriscraper/opinions/united_states/state/mass.py
index 02cf4b905..75ee81a94 100644
--- a/juriscraper/opinions/united_states/state/mass.py
+++ b/juriscraper/opinions/united_states/state/mass.py
@@ -83,4 +83,4 @@ def cleanup_content(content):
         new_tree = etree.Element("html")
         body = etree.SubElement(new_tree, "body")
         body.append(content)
-        return html.tostring(new_tree, pretty_print=True, encoding="unicode")
+        return html.tostring(new_tree).decode("utf-8")
diff --git a/setup.py b/setup.py
index ef1f155a3..1e9797047 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 from setuptools import find_packages, setup
 from setuptools.command.install import install
 
-VERSION = "2.6.40"
+VERSION = "2.6.42"
 AUTHOR = "Free Law Project"
 EMAIL = "info@free.law"
 HERE = os.path.abspath(os.path.dirname(__file__))
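Reviewer sketch, not part of the patch: a minimal standalone illustration of the date-window behavior the new `update_url` introduces in colo.py. The `base_url` value and the trimmed-down `params` dict here are placeholder assumptions, not the scraper's real endpoint; only the default-window and `t` timestamp logic mirror the diff above.

from datetime import datetime, timedelta
from urllib.parse import urlencode

base_url = "https://api.example.com/search"  # placeholder, not the real endpoint
params = {"product_id": "COLORADO", "sort": "date", "per_page": "30"}  # trimmed

# No dates given: default to the trailing week, padded one day forward
# so documents published today are not cut off by the server's date filter.
today = datetime.now()
dates = (today - timedelta(7), today + timedelta(1))

params.update(
    {
        "date": f"{dates[0].strftime('%Y-%m-%d')}..{dates[1].strftime('%Y-%m-%d')}",
        # First 10 characters of the epoch float, i.e. whole seconds
        "t": str(datetime.now().timestamp())[:10],
    }
)
print(f"{base_url}?{urlencode(params)}")
# e.g. ...&date=2024-11-14..2024-11-22&t=1732200000

Passing an explicit `(start, end)` tuple, as `_download_backwards` now does, skips the default window, so the same URL builder serves both the regular scrape and the backscraper.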
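A second reviewer sketch, for the one-line mass.py change. The sample fragment below is invented; the point is that `pretty_print=True` inserts newlines and indentation into lxml's serialized output, which is presumably what the cleanup-content fix avoids, while the new bytes-then-decode form serializes the tree untouched.

from lxml import etree, html

# Rebuild the same wrapper structure cleanup_content() creates
content = html.fromstring("<div><p>Commonwealth v. Doe</p></div>")  # invented sample
new_tree = etree.Element("html")
body = etree.SubElement(new_tree, "body")
body.append(content)

# Old call: returns a str directly, but pretty-printing reshapes the markup
pretty = html.tostring(new_tree, pretty_print=True, encoding="unicode")

# New call: returns bytes with no added whitespace; decode to get a str back
plain = html.tostring(new_tree).decode("utf-8")

print(repr(pretty))  # same elements, with newlines/indentation inserted
print(repr(plain))   # same elements, serialized exactly as built

The `encoding="unicode"` flag in the old call only existed to get a `str` back; calling `.decode("utf-8")` on the default bytes output achieves that without opting in to the whitespace rewriting.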