diff --git a/data_collection/gazette/spiders/base/nucleogov.py b/data_collection/gazette/spiders/base/nucleogov.py
new file mode 100644
index 000000000..6593cafd5
--- /dev/null
+++ b/data_collection/gazette/spiders/base/nucleogov.py
@@ -0,0 +1,59 @@
+import json
+from datetime import datetime
+
+import scrapy
+from dateutil.rrule import DAILY, rrule
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class NucleoGovGazetteSpider(BaseGazetteSpider):
+    """Base spider for gazette sites backed by a day-by-day JSON API.
+
+    Subclasses must define ``url_base``, a URL template with a single ``{}``
+    placeholder that receives the queried day formatted as ``YYYY-MM-DD``.
+    """
+
+    def start_requests(self):
+        """Yield one API request per day between ``start_date`` and ``end_date``."""
+        days = rrule(freq=DAILY, dtstart=self.start_date, until=self.end_date)
+        for day in days:
+            yield scrapy.Request(self.url_base.format(day.strftime("%Y-%m-%d")))
+
+    def parse(self, response):
+        """Yield a ``Gazette`` for each entry in the response's ``data`` list.
+
+        File URLs come from ``media_legacy`` when it is set, otherwise from the
+        ``url`` of every entry in ``midias``. Entries with no file URL are skipped.
+        """
+        data = json.loads(response.text)
+
+        # A missing or null "data"/"midias" key is treated as an empty list
+        # instead of raising TypeError when iterated.
+        for gazette in data.get("data") or []:
+            legacy_url = gazette.get("media_legacy")
+            if legacy_url:
+                gazette_urls = [legacy_url]
+            else:
+                midias = gazette.get("midias") or []
+                # Drop entries without a URL so file_urls never contains None.
+                gazette_urls = [midia["url"] for midia in midias if midia.get("url")]
+
+            if not gazette_urls:
+                self.logger.warning(
+                    "Skipping gazette without file URLs: %s", response.url
+                )
+                continue
+
+            gazette_date = datetime.strptime(gazette.get("data"), "%Y-%m-%d")
+
+            yield Gazette(
+                date=gazette_date.date(),
+                file_urls=gazette_urls,
+                edition_number=gazette.get("numero"),
+                power="executive",
+                # NOTE(review): every edition is reported as a regular one;
+                # confirm whether the API exposes an extra-edition marker.
+                is_extra_edition=False,
+            )
diff --git a/data_collection/gazette/spiders/go/go_anapolis.py b/data_collection/gazette/spiders/go/go_anapolis.py
new file mode 100644
index 000000000..cd61de1d2
--- /dev/null
+++ b/data_collection/gazette/spiders/go/go_anapolis.py
@@ -0,0 +1,17 @@
+import datetime as dt
+
+from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider
+
+
+class GoAnapolisSpider(NucleoGovGazetteSpider):
+    """Gazette spider for dom.anapolis.go.gov.br."""
+    name = "go_anapolis"
+    TERRITORY_ID = "5201108"
+    allowed_domains = [
+        "dom.anapolis.go.gov.br",
+    ]
+    url_base = (
+        "https://dom.anapolis.go.gov.br/api/diarios?data={}&calendar=true&situacao=2"
+    )
+
+    start_date = dt.date(2010, 5, 31)
diff --git a/data_collection/gazette/spiders/go/go_valparaiso_de_goias.py b/data_collection/gazette/spiders/go/go_valparaiso_de_goias.py
new file mode 100644
index 000000000..d0af3d317
--- /dev/null
+++ b/data_collection/gazette/spiders/go/go_valparaiso_de_goias.py
@@ -0,0 +1,15 @@
+import datetime as dt
+
+from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider
+
+
+class GoValparaisoDeGoiasSpider(NucleoGovGazetteSpider):
+    """Gazette spider for diariooficial.valparaisodegoias.go.gov.br."""
+    name = "go_valparaiso_de_goias"
+    TERRITORY_ID = "5221858"
+    allowed_domains = [
+        "diariooficial.valparaisodegoias.go.gov.br",
+    ]
+    url_base = "https://diariooficial.valparaisodegoias.go.gov.br/api/diarios?data={}&calendar=true&situacao=2"
+
+    start_date = dt.date(2021, 2, 17)
diff --git a/data_collection/gazette/spiders/to/to_cariri_do_tocantins.py b/data_collection/gazette/spiders/to/to_cariri_do_tocantins.py
new file mode 100644
index 000000000..e9c483af3
--- /dev/null
+++ b/data_collection/gazette/spiders/to/to_cariri_do_tocantins.py
@@ -0,0 +1,17 @@
+import datetime as dt
+
+from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider
+
+
+class ToCaririDoTocantinsSpider(NucleoGovGazetteSpider):
+    """Gazette spider for dom.cariri.to.gov.br."""
+    name = "to_cariri_do_tocantins"
+    TERRITORY_ID = "1703867"
+    allowed_domains = [
+        "dom.cariri.to.gov.br",
+    ]
+    url_base = (
+        "https://dom.cariri.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
+    )
+
+    start_date = dt.date(2023, 1, 30)
diff --git a/data_collection/gazette/spiders/to/to_jau_do_tocantins.py b/data_collection/gazette/spiders/to/to_jau_do_tocantins.py
new file mode 100644
index 000000000..1b41f7a17
--- /dev/null
+++ b/data_collection/gazette/spiders/to/to_jau_do_tocantins.py
@@ -0,0 +1,15 @@
+import datetime as dt
+
+from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider
+
+
+class ToJauDoTocantinsSpider(NucleoGovGazetteSpider):
+    """Gazette spider for diariooficial.jau.to.gov.br."""
+    name = "to_jau_do_tocantins"
+    TERRITORY_ID = "1711506"
+    allowed_domains = [
+        "diariooficial.jau.to.gov.br",
+    ]
+    url_base = "https://diariooficial.jau.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
+
+    start_date = dt.date(2023, 5, 15)
diff --git a/data_collection/gazette/spiders/to/to_parana.py b/data_collection/gazette/spiders/to/to_parana.py
new file mode 100644
index 000000000..cefb126d1
--- /dev/null
+++ b/data_collection/gazette/spiders/to/to_parana.py
@@ -0,0 +1,15 @@
+import datetime as dt
+
+from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider
+
+
+class ToParanaSpider(NucleoGovGazetteSpider):
+    """Gazette spider for diariooficial.parana.to.gov.br."""
+    name = "to_parana"
+    TERRITORY_ID = "1716208"
+    allowed_domains = [
+        "diariooficial.parana.to.gov.br",
+    ]
+    url_base = "https://diariooficial.parana.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
+
+    start_date = dt.date(2023, 5, 8)