Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Revisão retroativa] Imprensa Oficial #1125

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_amelia_rodrigues.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaAmeliaRodriguesSpider(ImprensaOficialSpider):
# Settings for the Amélia Rodrigues (BA) gazette spider, as shown in this diff hunk.
name = "ba_amelia_rodrigues"
allowed_domains = ["pmameliarodriguesba.imprensaoficial.org"]
start_date = date(2015, 1, 1)
# Both the `url_base` template and the `city_domain` root appear in the hunk;
# the base spider's updated start_requests builds URLs from `city_domain`.
url_base = "http://pmameliarodriguesba.imprensaoficial.org/{}"
city_domain = "http://pmameliarodriguesba.imprensaoficial.org"
# IBGE municipality code for Amélia Rodrigues. The previous value, 2930501,
# is Serrinha's code and made this spider's gazettes collide with ba_serrinha.
TERRITORY_ID = "2901106"
14 changes: 0 additions & 14 deletions data_collection/gazette/spiders/ba/ba_conceicao_do_almeida.py

This file was deleted.

2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_gentio_do_ouro.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaGentioDoOuroSpider(ImprensaOficialSpider):
name = "ba_gentio_do_ouro"
allowed_domains = ["pmgentiodoouroba.imprensaoficial.org"]
start_date = date(2017, 2, 1)
url_base = "http://pmgentiodoouroba.imprensaoficial.org/{}"
city_domain = "http://pmgentiodoouroba.imprensaoficial.org"
TERRITORY_ID = "2911303"
11 changes: 5 additions & 6 deletions data_collection/gazette/spiders/ba/ba_gongogi.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from datetime import date

from gazette.spiders.base.imprensa_oficial import ImprensaOficialSpider
from gazette.spiders.base.sai import SaiGazetteSpider


class BaGongogiSpider(ImprensaOficialSpider):
class BaGongogi(SaiGazetteSpider):
name = "ba_gongogi"
allowed_domains = ["pmgongogiba.imprensaoficial.org"]
start_date = date(2020, 2, 1)
end_date = date(2020, 12, 30)
url_base = "http://pmgongogiba.imprensaoficial.org/{}"
allowed_domains = ["gongogi.ba.gov.br"]
start_date = date(2005, 8, 15)
base_url = "https://www.gongogi.ba.gov.br"
TERRITORY_ID = "2911501"
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class BaGovernadorMangabeiraSpider(ImprensaOficialSpider):
name = "ba_governador_mangabeira"
allowed_domains = ["pmGOVERNADORMANGABEIRABA.imprensaoficial.org"]
allowed_domains = ["pmgovernadormangabeiraba.imprensaoficial.org"]
start_date = date(2018, 1, 1)
url_base = "http://pmGOVERNADORMANGABEIRABA.imprensaoficial.org/{}"
city_domain = "http://pmgovernadormangabeiraba.imprensaoficial.org"
TERRITORY_ID = "2911600"
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@


class BaItaquaraSpider(ImprensaOficialSpider):
name = "ba_itaquara"
allowed_domains = ["pmitaquaraba.imprensaoficial.org", "itaquara.ba.gov.br"]
name = "ba_itaquara_2019"
allowed_domains = ["pmitaquaraba.imprensaoficial.org"]
start_date = date(2019, 1, 1)
url_base = "http://itaquara.ba.gov.br/{}"
end_date = date(2022, 1, 4)
city_domain = "http://pmitaquaraba.imprensaoficial.org"
TERRITORY_ID = "2916708"
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_jaguarari.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ class BaJaguarariSpider(ImprensaOficialSpider):
allowed_domains = ["pmjaguarariba.imprensaoficial.org"]
start_date = date(2019, 10, 1)
end_date = date(2020, 12, 31)
url_base = "http://pmjaguarariba.imprensaoficial.org/{}"
city_domain = "http://pmjaguarariba.imprensaoficial.org"
TERRITORY_ID = "2917706"
4 changes: 2 additions & 2 deletions data_collection/gazette/spiders/ba/ba_muniz_ferreira.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ class BaMunizFerreiraSpider(ImprensaOficialSpider):
name = "ba_muniz_ferreira"
allowed_domains = ["pmmunizferreiraba.imprensaoficial.org"]
start_date = date(2014, 12, 1)
end_date = date(2021, 1, 19)
url_base = "http://pmmunizferreiraba.imprensaoficial.org/{}"
end_date = date(2022, 9, 27)
city_domain = "http://pmmunizferreiraba.imprensaoficial.org"
TERRITORY_ID = "2922201"
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_paratinga.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaParatingaSpider(ImprensaOficialSpider):
name = "ba_paratinga"
allowed_domains = ["pmparatingaba.imprensaoficial.org"]
start_date = date(2018, 4, 1)
url_base = "http://pmparatingaba.imprensaoficial.org/{}"
city_domain = "http://pmparatingaba.imprensaoficial.org"
TERRITORY_ID = "2923704"
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_pe_de_serra.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaPeDeSerraSpider(ImprensaOficialSpider):
name = "ba_pe_de_serra"
allowed_domains = ["pmpedeserraba.imprensaoficial.org"]
start_date = date(2017, 1, 1)
url_base = "http://pmpedeserraba.imprensaoficial.org/{}"
city_domain = "http://pmpedeserraba.imprensaoficial.org"
TERRITORY_ID = "2924058"
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_sao_felipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ class BaSaoFelipeSpider(ImprensaOficialSpider):
allowed_domains = ["pmsaofelipeba.imprensaoficial.org"]
start_date = date(2020, 1, 1)
end_date = date(2021, 4, 22)
url_base = "http://pmsaofelipeba.imprensaoficial.org/{}"
city_domain = "http://pmsaofelipeba.imprensaoficial.org"
TERRITORY_ID = "2929107"
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaSaoFranciscoDoCondeSpider(ImprensaOficialSpider):
name = "ba_sao_francisco_do_conde"
allowed_domains = ["pmsaofranciscodocondeba.imprensaoficial.org"]
start_date = date(2019, 3, 1)
url_base = "http://pmsaofranciscodocondeba.imprensaoficial.org/{}"
city_domain = "http://pmsaofranciscodocondeba.imprensaoficial.org"
TERRITORY_ID = "2929206"
7 changes: 2 additions & 5 deletions data_collection/gazette/spiders/ba/ba_sao_miguel_das_matas.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,7 @@

class BaSaoMiguelDasMatasSpider(ImprensaOficialSpider):
name = "ba_sao_miguel_das_matas"
allowed_domains = [
"pmsaomigueldasmatasba.imprensaoficial.org",
"saomigueldasmatas.ba.gov.br",
]
allowed_domains = ["pmsaomigueldasmatasba.imprensaoficial.org"]
start_date = date(2019, 2, 1)
url_base = "http://saomigueldasmatas.ba.gov.br/{}"
city_domain = "http://pmsaomigueldasmatasba.imprensaoficial.org"
TERRITORY_ID = "2929404"
4 changes: 2 additions & 2 deletions data_collection/gazette/spiders/ba/ba_sapeacu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class BaSapeacuSpider(ImprensaOficialSpider):
name = "ba_sapeacu"
allowed_domains = ["pmsapeacuba.imprensaoficial.org", "sapeacu.ba.gov.br"]
allowed_domains = ["pmsapeacuba.imprensaoficial.org"]
start_date = date(2017, 1, 1)
url_base = "http://sapeacu.ba.gov.br/{}"
city_domain = "http://pmsapeacuba.imprensaoficial.org"
TERRITORY_ID = "2929602"
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@


class BaSaudeSpider(ImprensaOficialSpider):
name = "ba_saude"
name = "ba_saude_2018"
allowed_domains = ["pmsaudeba.imprensaoficial.org"]
start_date = date(2018, 2, 1)
end_date = date(2019, 4, 12)
url_base = "http://pmsaudeba.imprensaoficial.org/{}"
city_domain = "http://pmsaudeba.imprensaoficial.org"
TERRITORY_ID = "2929800"
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_serrinha.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaSerrinhaSpider(ImprensaOficialSpider):
name = "ba_serrinha"
allowed_domains = ["pmserrinhaba.imprensaoficial.org"]
start_date = date(2020, 1, 1)
url_base = "http://pmserrinhaba.imprensaoficial.org/{}"
city_domain = "http://pmserrinhaba.imprensaoficial.org"
TERRITORY_ID = "2930501"
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_vera_cruz.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaVeraCruzSpider(ImprensaOficialSpider):
name = "ba_vera_cruz"
allowed_domains = ["pmveracruzba.imprensaoficial.org"]
start_date = date(2017, 4, 1)
url_base = "http://pmveracruzba.imprensaoficial.org/{}"
city_domain = "http://pmveracruzba.imprensaoficial.org"
TERRITORY_ID = "2933208"
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaWenceslauGuimaraesSpider(ImprensaOficialSpider):
name = "ba_wenceslau_guimaraes"
allowed_domains = ["pmwenceslauguimaraesba.imprensaoficial.org"]
start_date = date(2017, 1, 1)
url_base = "http://pmwenceslauguimaraesba.imprensaoficial.org/{}"
city_domain = "http://pmwenceslauguimaraesba.imprensaoficial.org"
TERRITORY_ID = "2933505"
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_xique_xique.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ class BaXiqueXiqueSpider(ImprensaOficialSpider):
name = "ba_xique_xique"
allowed_domains = ["pmxiquexiqueba.imprensaoficial.org"]
start_date = date(2017, 1, 1)
url_base = "http://pmxiquexiqueba.imprensaoficial.org/{}"
city_domain = "http://pmxiquexiqueba.imprensaoficial.org"
TERRITORY_ID = "2933604"
14 changes: 10 additions & 4 deletions data_collection/gazette/spiders/base/imprensa_oficial.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@ def start_requests(self):
freq=MONTHLY, dtstart=initial_date, until=self.end_date
):
year_month = monthly_date.strftime("%Y/%m/") # like 2015/01
yield scrapy.Request(
self.url_base.format(year_month), callback=self.extract_gazette_links
)
url = f"{self.city_domain}/{year_month}"

yield scrapy.Request(url, callback=self.extract_gazette_links)

def extract_gazette_links(self, response):
for gazette_link in response.css("h2 a::attr(href)").getall():
links = response.css("h2 a::attr(href)").getall()

for gazette_link in links:
raw_gazette_date = re.search(r"\d{4}/\d{2}/\d{2}", gazette_link).group()
gazette_date = datetime.strptime(raw_gazette_date, "%Y/%m/%d").date()

if gazette_date < self.start_date:
return
yield scrapy.Request(gazette_link)
Expand All @@ -32,17 +35,20 @@ def extract_gazette_links(self, response):
another_page = response.xpath(
".//a[contains(text(), 'Publicações mais antigas')]/@href"
).get()

if another_page:
yield scrapy.Request(another_page, callback=self.extract_gazette_links)

def parse(self, response):
file_url = response.css(
"div.entry-content a[href*='baixar.php?arquivo=']::attr(href)"
).get()

if not file_url: # older dates
file_url = response.css(
"div.entry-content a[title='Baixar Diário']::attr(href)"
).get()

gazette_date = response.css("span.posted-on a time::attr(datetime)").get()
gazette_date = datetime.strptime(gazette_date, "%Y-%m-%dT%H:%M:%S%z").date()

Expand Down
Loading