main.py
import csv

import scrapy
from scrapy.crawler import CrawlerProcess

# Indeed search for "data engineer" jobs; the l= parameter is the URL-encoded
# Thai name of Bangkok (กรุงเทพมหานคร).
SEARCH_URL = "https://th.indeed.com/jobs?q=%22data+engineer%22&l=%E0%B8%81%E0%B8%A3%E0%B8%B8%E0%B8%87%E0%B9%80%E0%B8%97%E0%B8%9E%E0%B8%A1%E0%B8%AB%E0%B8%B2%E0%B8%99%E0%B8%84%E0%B8%A3"


class MySpider(scrapy.Spider):
    name = "indeed_spider"
    start_urls = [SEARCH_URL]
    start = 0  # offset of the current result page; Indeed paginates in steps of 10

    def parse(self, response):
        jobs = []
        job_links = response.css("#mosaic-provider-jobcards a")
        for each in job_links:
            # Only anchor tags carrying an id attribute are actual job cards;
            # skip everything else in the results container.
            if "id" not in each.attrib:
                continue
            url = each.attrib["href"]
            # Pull the job key (jk) out of the first query parameter of the
            # card's link and rebuild a direct URL to the posting.
            jk = url.split("&")[0].split("=")[1]
            job_url = f"{SEARCH_URL}&vjk={jk}"
            title = each.css(".jobTitle span::attr(title)").get()
            company_name = each.css(".companyName::text").get()
            company_location = each.css(".companyLocation::text").get()
            print(title, company_name, company_location, job_url)
            jobs.append([title, company_name, company_location, job_url])
        # Append this page's results to the CSV file.
        with open("jobs.csv", "a", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(jobs)
        # Advance the offset and follow the next result page until 50 results
        # have been paged through.
        self.start += 10
        if self.start < 50:
            next_page = f"{SEARCH_URL}&start={self.start}"
            yield response.follow(next_page, self.parse)


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()
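
# Usage note (a minimal sketch, assuming Scrapy is installed, e.g. via
# `pip install scrapy`):
#
#   $ python main.py
#
# Each crawled results page appends rows of
# [title, company_name, company_location, job_url] to jobs.csv in the working
# directory. The file is opened in append mode and no header row is written,
# so downstream readers should supply their own column names.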