Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Novo spider base]: NucleoGov - Anápolis, GO #1147

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions data_collection/gazette/spiders/base/nucleogov.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import json
from datetime import datetime

import scrapy
from dateutil.rrule import DAILY, rrule

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class NucleoGovGazetteSpider(BaseGazetteSpider):
def start_requests(self):
    """Yield one request per day in the spider's configured date range.

    Each day from ``self.start_date`` until ``self.end_date`` is formatted
    as ``YYYY-MM-DD`` and substituted into the ``{}`` placeholder of
    ``self.url_base``.
    """
    date_range = rrule(freq=DAILY, dtstart=self.start_date, until=self.end_date)
    for current_day in date_range:
        formatted_day = current_day.strftime("%Y-%m-%d")
        request_url = self.url_base.format(formatted_day)
        yield scrapy.Request(request_url)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Como a string de acesso à API "/api/diarios?data={}&calendar=true&situacao=2" é igual em todas as classes filhas, nós temos adotado o padrão de deixar essa construção da URL na classe mãe, escondendo isso da classe filha e deixando mais simples o raspador do município.

Suggested change
yield scrapy.Request(self.url_base.format(day.strftime("%Y-%m-%d")))
url = f"{self.url_base}/api/diarios?data={day.strftime('%Y-%m-%d')}&calendar=true&situacao=2"
yield scrapy.Request(url)


def parse(self, response):
    """Turn the JSON payload of one daily API response into Gazette items.

    The payload is read as a JSON object whose ``"data"`` key holds a list
    of gazette entries. For each entry, the file URL comes from
    ``"media_legacy"`` when it is set; otherwise from the ``"url"`` of each
    element of ``"midias"``.

    Args:
        response: object exposing the raw JSON body as ``response.text``.

    Yields:
        Gazette: one item per entry, with ``power="executive"`` and
        ``is_extra_edition=False``.
    """
    payload = json.loads(response.text)

    # A missing or null "data" key means there is nothing to yield; the
    # fallback avoids iterating over None.
    for entry in payload.get("data") or []:
        legacy_url = entry.get("media_legacy")
        if legacy_url:
            file_urls = [legacy_url]
        else:
            # "midias" may be absent or null; entries without a URL are
            # dropped so file_urls never contains None.
            file_urls = [
                media.get("url")
                for media in entry.get("midias") or []
                if media.get("url")
            ]

        gazette_date = datetime.strptime(entry.get("data"), "%Y-%m-%d")

        yield Gazette(
            date=gazette_date.date(),
            file_urls=file_urls,
            edition_number=entry.get("numero"),
            power="executive",
            is_extra_edition=False,
        )
16 changes: 16 additions & 0 deletions data_collection/gazette/spiders/go/go_anapolis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import datetime as dt

from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider


class GoAnapolisSpider(NucleoGovGazetteSpider):
    """NucleoGov-based gazette spider for the site at dom.anapolis.go.gov.br."""

    TERRITORY_ID = "5201108"
    name = "go_anapolis"
    allowed_domains = ["dom.anapolis.go.gov.br"]
    # The "{}" placeholder is filled with a YYYY-MM-DD date by the base spider.
    url_base = "https://dom.anapolis.go.gov.br/api/diarios?data={}&calendar=true&situacao=2"
Comment on lines +12 to +14
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reflete aqui a mudança na classe base

Suggested change
url_base = (
"https://dom.anapolis.go.gov.br/api/diarios?data={}&calendar=true&situacao=2"
)
url_base = "https://dom.anapolis.go.gov.br"


# Lower bound of the crawl: the base spider's start_requests passes this as
# the rrule ``dtstart`` when generating one request per day.
start_date = dt.date(2010, 5, 31)
14 changes: 14 additions & 0 deletions data_collection/gazette/spiders/go/go_valparaiso_de_goias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import datetime as dt

from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider


class GoValparaisoDeGoiasSpider(NucleoGovGazetteSpider):
    """NucleoGov-based gazette spider for diariooficial.valparaisodegoias.go.gov.br."""

    TERRITORY_ID = "5221858"
    name = "go_valparaiso_de_goias"
    allowed_domains = ["diariooficial.valparaisodegoias.go.gov.br"]
    # The "{}" placeholder is filled with a YYYY-MM-DD date by the base spider.
    url_base = (
        "https://diariooficial.valparaisodegoias.go.gov.br/api/diarios?data={}&calendar=true&situacao=2"
    )

Comment on lines +12 to +13
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reflete aqui a mudança na classe base

Suggested change
url_base = "https://diariooficial.valparaisodegoias.go.gov.br/api/diarios?data={}&calendar=true&situacao=2"
url_base = "https://diariooficial.valparaisodegoias.go.gov.br"

# Lower bound of the crawl: the base spider's start_requests passes this as
# the rrule ``dtstart`` when generating one request per day.
start_date = dt.date(2021, 2, 17)
16 changes: 16 additions & 0 deletions data_collection/gazette/spiders/to/to_cariri_do_tocantins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import datetime as dt

from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider


class ToCaririDoTocantinsSpider(NucleoGovGazetteSpider):
    """NucleoGov-based gazette spider for the site at dom.cariri.to.gov.br."""

    TERRITORY_ID = "1703867"
    name = "to_cariri_do_tocantins"
    allowed_domains = ["dom.cariri.to.gov.br"]
    # The "{}" placeholder is filled with a YYYY-MM-DD date by the base spider.
    url_base = "https://dom.cariri.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
Comment on lines +12 to +14
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reflete aqui a mudança na classe base

Suggested change
url_base = (
"https://dom.cariri.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
)
url_base = "https://dom.cariri.to.gov.br"


# Lower bound of the crawl: the base spider's start_requests passes this as
# the rrule ``dtstart`` when generating one request per day.
start_date = dt.date(2023, 1, 30)
14 changes: 14 additions & 0 deletions data_collection/gazette/spiders/to/to_jau_do_tocantins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import datetime as dt

from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider


class ToJauDoTocantinsSpider(NucleoGovGazetteSpider):
    """NucleoGov-based gazette spider for the site at diariooficial.jau.to.gov.br."""

    TERRITORY_ID = "1711506"
    name = "to_jau_do_tocantins"
    allowed_domains = ["diariooficial.jau.to.gov.br"]
    # The "{}" placeholder is filled with a YYYY-MM-DD date by the base spider.
    url_base = (
        "https://diariooficial.jau.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
    )

Comment on lines +12 to +13
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reflete aqui a mudança na classe base

Suggested change
url_base = "https://diariooficial.jau.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
url_base = "https://diariooficial.jau.to.gov.br"

# Lower bound of the crawl: the base spider's start_requests passes this as
# the rrule ``dtstart`` when generating one request per day.
start_date = dt.date(2023, 5, 15)
14 changes: 14 additions & 0 deletions data_collection/gazette/spiders/to/to_parana.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import datetime as dt

from gazette.spiders.base.nucleogov import NucleoGovGazetteSpider


class ToParanaSpider(NucleoGovGazetteSpider):
    """NucleoGov-based gazette spider for the site at diariooficial.parana.to.gov.br."""

    TERRITORY_ID = "1716208"
    name = "to_parana"
    allowed_domains = ["diariooficial.parana.to.gov.br"]
    # The "{}" placeholder is filled with a YYYY-MM-DD date by the base spider.
    url_base = (
        "https://diariooficial.parana.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
    )

Comment on lines +12 to +13
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reflete aqui a mudança na classe base

Suggested change
url_base = "https://diariooficial.parana.to.gov.br/api/diarios?data={}&calendar=true&situacao=2"
url_base = "https://diariooficial.parana.to.gov.br"

# Lower bound of the crawl: the base spider's start_requests passes this as
# the rrule ``dtstart`` when generating one request per day.
start_date = dt.date(2023, 5, 8)
Loading