This repository has been archived by the owner on Dec 15, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
newspaper_search.py
69 lines (52 loc) · 1.92 KB
/
newspaper_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import urllib
import json
class Search(object):
    """Download newspaper full text matching a set of search terms.

    The single entry point, :meth:`search`, queries the search endpoint at
    ``newspapers.eanadev.org``, follows each hit to its first annotation page
    on ``iiif.europeana.eu`` and writes the text of every annotation found
    there to ``fulltext.txt``.
    """

    # Placeholder credential sent as the ``wskey`` query parameter; replace it
    # (or override the attribute) with a real key before use.
    API_KEY = "your_key"
    SEARCH_URL = "https://newspapers.eanadev.org/api/v2/search.json?query="
    PRESENTATION_URL = "https://iiif.europeana.eu/presentation"
    OUTPUT_PATH = "fulltext.txt"

    @staticmethod
    def _fetch_json(url):
        """Download *url* and return its body parsed as JSON.

        Network and JSON decoding errors propagate to the caller.
        """
        import contextlib

        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        # closing() releases the connection on both Python 2 and 3, where the
        # response object is not always a context manager itself.
        with contextlib.closing(urlopen(url)) as response:
            return json.loads(response.read().decode("utf-8"))

    @staticmethod
    def _pluck(document, list_key, getter):
        """Return ``getter(entry)`` for every entry of ``document[list_key]``.

        Best effort: a missing or non-iterable list yields ``[]`` and an entry
        the getter cannot handle is skipped; in both cases the error is
        printed rather than raised.
        """
        try:
            entries = list(document[list_key])
        except (KeyError, TypeError) as exc:
            print(repr(exc))
            return []

        values = []
        for entry in entries:
            try:
                values.append(getter(entry))
            except (KeyError, TypeError) as exc:
                print(repr(exc))
        return values

    @staticmethod
    def search(*args):
        """Search for *args* and save the matching full text to disk.

        Args:
            *args: One or more search terms (strings). They are joined with
                ``+`` and inserted into the query string verbatim, so each
                term must already be URL-safe.

        Returns:
            None. The collected texts are written to ``fulltext.txt`` in the
            current working directory, one per line, encoded as UTF-8. The
            request URL and the intermediate link lists are printed.

        Raises:
            ValueError: If no search term is given.
            TypeError: If a search term is not a string.
        """
        import io

        if not args:
            # Without this guard the query string would never be built and the
            # failure would surface later as an obscure unbound-name error.
            raise ValueError("search() requires at least one search term")

        query = "+".join(args)
        url = (Search.SEARCH_URL + query
               + "&profile=hits&wskey=" + Search.API_KEY)
        print(url)

        # Step 1: ids of the items that matched the query.
        hits = Search._fetch_json(url)
        item_ids = Search._pluck(hits, 'items', lambda item: item['id'])

        # Step 2: first annotation page of every matching item.
        full_links = [Search.PRESENTATION_URL + item_id + "/annopage/1"
                      for item_id in item_ids]
        print(full_links)

        # Step 3: links to the text resources listed on each annotation page.
        text_links = []
        for link in full_links:
            page = Search._fetch_json(link)
            text_links.extend(Search._pluck(
                page, 'resources', lambda entry: entry['resource']['@id']))
        print(text_links)

        # Step 4: the text itself; resources without a 'value' are skipped.
        fulltext = []
        for txt in text_links:
            resource = Search._fetch_json(txt)
            try:
                fulltext.append(resource['value'])
            except (KeyError, TypeError) as exc:
                print(repr(exc))

        # io.open encodes once at the file boundary and behaves the same on
        # Python 2 and 3, so the text never passes through bytes formatting.
        with io.open(Search.OUTPUT_PATH, 'w', encoding='utf-8') as outfile:
            for item in fulltext:
                outfile.write(u"%s\n" % item)