scrape_additional_annotations.py

from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
from multiprocessing import Pool
import os
import re
import time
import lxml  # needed for BeautifulSoup's "lxml" parser
"""
Scraping the Wikiarts web page for the downloaded ArtEmis images
"""

def _fix(string):
    # strip newlines, colons, and surrounding whitespace from scraped text
    try:
        return string.replace("\n", "").replace(":", "").strip()
    except Exception as e:
        print(e)
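
# Illustrative behaviour of _fix on a scraped property string (hypothetical input):
#   _fix(" Oil on canvas :\n")  ->  "Oil on canvas"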

def _get_csv_entries(csv_file):
    # read the list of painting image urls (one url per line, no header)
    pd_csv = pd.read_csv(csv_file, sep=",", header=None, names=["url"])
    print(pd_csv["url"])
    return pd_csv["url"]
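
# The input file is expected to contain one image url per line, following the
# <artist>/<painting>.jpg path layout assumed by crawl(). Hypothetical entry:
#   https://uploads6.wikiart.org/images/claude-monet/water-lilies-1919.jpg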

def _get_html_request(property_url):
    try:
        # short delay between calls to prevent request errors
        time.sleep(0.01)
        html_request = requests.get(property_url)
    except Exception as e:
        print(f"REQUEST ERROR: {property_url}\n\terror_desc: {e}")
        # -1 flags an erroneous request call
        html_request = -1
    return html_request
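
# Hedged sketch (not part of the original script): a retry wrapper with
# exponential backoff around _get_html_request, one way to mitigate the
# "connection reset by peer" errors noted in the __main__ block below.
# The function name and parameters are illustrative assumptions.
def _get_html_request_with_retries(property_url, retries=3, backoff=1.0):
    for attempt in range(retries):
        html_request = _get_html_request(property_url)
        # _get_html_request returns -1 when the request raised an exception
        if html_request != -1:
            return html_request
        # wait progressively longer before the next attempt
        time.sleep(backoff * (2 ** attempt))
    return -1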

def crawl(url):
    data = {}
    url_wikiart = "https://www.wikiart.org/en"
    # derive the WikiArt property page url from the image url
    picture_name = "_".join(url.split("/")[-2:]).replace(".jpg", "")
    property_url = f"{url_wikiart}/{picture_name.replace('_', '/')}"
    # GET request to fetch the raw HTML content
    html_request = _get_html_request(property_url)
    if html_request != -1:
        request_status_code = html_request.status_code
        # clean up url suffixes such as "(1)" after a 404, then retry
        if request_status_code != 200:
            clean_property_url = re.sub(r"\(\d+\)?", "", property_url)
            html_request = _get_html_request(clean_property_url)
            if html_request != -1:
                request_status_code = html_request.status_code
            else:
                print(f"REQUEST REJECTED: {picture_name}")
                return (None, f"error:{picture_name}")
        if request_status_code == 200:
            data["name"] = picture_name
            data["pic_url"] = url
            print(data["name"])
            html_content = html_request.text
            data["property_url"] = property_url
            # parse the html content with lxml (> pip install lxml)
            soup = BeautifulSoup(html_content, "lxml")
            # get properties from the first <article> element
            properties_tree = soup.find_all("article")
            if len(properties_tree) >= 1:
                properties = properties_tree[0]
                title = properties.find("h3")
                if title is not None:
                    data["title"] = _fix(title.text)
                painter = properties.find("h5")
                if painter is not None:
                    data["painter"] = _fix(painter.text)
                listing = properties.find("ul")
                if listing is not None:
                    items = listing.find_all("li")
                    for item in items:
                        property_key = item.find("s")
                        property_value = item.find("span")
                        if property_key is not None and property_value is not None:
                            # check for multiple sub-values for the current property
                            value_list = _fix(property_value.text).split(",")
                            fixed_property_key = _fix(property_key.text).lower()
                            # date, genre, and location are always single-valued;
                            # every other property may have multiple sub-values for
                            # at least one painting, so store those as lists
                            if fixed_property_key in ["date", "genre", "location"]:
                                data[fixed_property_key] = _fix(property_value.text)
                            else:
                                keywords_list = []
                                for value in value_list:
                                    keywords_list.append(_fix(value))
                                data[fixed_property_key] = keywords_list
            # get keywords (tags)
            keywords_tree = soup.find("div", "tags-cheaps")
            keywords_list = []
            if keywords_tree is not None:
                html_keywords_list = keywords_tree.find_all(
                    "a", "tags-cheaps__item__ref"
                )
                for keyword in html_keywords_list:
                    keywords_list.append(_fix(keyword.text))
            data["keywords"] = keywords_list
            # return the data dict & None for the error
            return (data, None)
        else:
            print(f"404 NOT FOUND: {picture_name}")
            return (None, f"404:{picture_name}")
    else:
        print(f"REQUEST REJECTED: {picture_name}")
        return (None, f"error:{picture_name}")
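
# Worked example of the url -> property_url mapping performed in crawl()
# (the concrete painting is hypothetical, for illustration only):
#   url          = "https://uploads6.wikiart.org/images/claude-monet/water-lilies-1919.jpg"
#   picture_name = "claude-monet_water-lilies-1919"
#   property_url = "https://www.wikiart.org/en/claude-monet/water-lilies-1919"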

if __name__ == "__main__":
    json_file = {}
    out_dir_stats = "./artemis-download-stats"
    csv_file = "./artemis-download-stats/all_paintings.txt"
    csv_entries = _get_csv_entries(csv_file)
    # os.cpu_count() equals 128 on the gpu server -> triggers too many requests simultaneously
    # NOTE: you can increase the number of workers, but you might encounter
    # request errors (connection reset by peer)
    with Pool(1) as p:
        json_array = p.map(crawl, csv_entries)
    json_list = list(filter(None, [tup[0] for tup in json_array]))
    json_file["entries"] = json_list
    not_found_list = list(filter(None, [tup[1] for tup in json_array]))
    with open(out_dir_stats + "/enhanced_annotations.json", "w") as outfile:
        json.dump(json_file, outfile, indent=2)
    print(f"number of successfully scraped painting links: {len(json_list)}")
    with open(
        out_dir_stats + "/additional_annotation_not_found_list.txt", "a+"
    ) as outfile:
        for name in not_found_list:
            outfile.write(f"{name}\n")
    print(f"number of defective painting links: {len(not_found_list)}")