forked from serpapi/code-challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
web scrapper.py
49 lines (43 loc) · 1.9 KB
/
web scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from bs4 import BeautifulSoup
import re
import json
import os
for file in os.listdir('test_files'):
with open(f'test_files/{file}', encoding='utf-8') as url:
soup = BeautifulSoup(url, 'html.parser')
results = {
'artworks': []
}
for tag in soup.find('g-scrolling-carousel'):
titles = tag.findAll('a')
for element in titles:
name = element['aria-label']
extensions = []
link_extension = element['href']
link = 'google.com' + link_extension
thumbnail_source = element.find('img')
try:
if thumbnail_source is not None:
image_id = thumbnail_source['id']
thumbnail = thumbnail_source['src']
for script in soup.findAll('script'):
images = re.findall(rf"(?<=var s=')(.*?)(?=';var ii=\['{image_id}'])", str(script))
if images:
decoded_image = images[0].encode('ascii', 'ignore')
decoded_image = decoded_image.decode('unicode-escape')
thumbnail = decoded_image
else:
thumbnail = None
except KeyError:
thumbnail = None
if element.findAll('div') and element.findAll('div')[-1].text != name:
extensions.append(element.findAll('div')[-1].text)
results['artworks'].append({
'name': name,
'extensions': extensions,
'link': link,
'thumbnail': thumbnail
})
json_file = json.dumps(results, indent=4)
with open(f'result_files/{file}'.replace('.html', '.json'), 'w') as output:
output.write(json_file)