-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimporter.py
129 lines (108 loc) · 4.22 KB
/
importer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import logging, json, importio, latch, csv
# Module-level result buffers filled by the query callbacks.
dataRows = []   # rows appended by `callback`
dataRows2 = []  # rows appended by `callback2`
class AnkiCard:
    """A single flashcard holding an encoded front and back text."""

    # Class-level defaults; __init__ overwrites both on every instance.
    front = ""
    back = ""

    def __init__(self, q, a):
        """Store the question *q* and the answer *a*, each UTF-8 encoded.

        Both arguments must provide an ``encode`` method. The ``'ignore'``
        error handler drops characters that cannot be encoded.
        """
        codec, on_error = 'utf8', 'ignore'
        self.front = q.encode(codec, on_error)
        self.back = a.encode(codec, on_error)
def GenerateAnkiCardsFromWikipediaCategory(url,deckName,user_id,api_key):
    """Build Anki cards from the pages linked on a (Wikipedia category) page.

    Two rounds of import.io queries are run through a single client:

    1. ``url`` is sent to the first connector; the rows it yields are
       collected by ``callback`` into the module-level ``dataRows``.
    2. Every collected row that has a ``'url'`` key is sent to the second
       connector; those rows are collected by ``callback2`` into
       ``dataRows2``.

    Each second-round row that has both ``"title"`` and ``"first_par"``
    becomes an ``AnkiCard`` (title on the front, ``first_par`` on the back).

    The client is always disconnected and the module-level buffers are always
    reset before this function returns or raises.

    Args:
        url: Address of the page handed to the first connector.
        deckName: Accepted for interface compatibility; not used here.
        user_id: import.io user id passed to the client.
        api_key: import.io API key passed to the client.

    Returns:
        list of AnkiCard, in the order the rows appear in ``dataRows2``.
    """
    global queryLatch

    # Connector GUIDs; what each connector extracts is defined on import.io.
    listing_connector = "68b4b6ac-25ce-434d-923d-7cc9661216ff"
    article_connector = "7fc7daa2-25a4-4649-b48c-be1d7fd8756e"

    client = importio.importio(
        user_id=user_id, api_key=api_key, host="https://query.import.io"
    )
    client.connect()
    try:
        # Round 1: a single query, so a latch of one.
        queryLatch = latch.latch(1)
        client.query({
            "connectorGuids": [listing_connector],
            "input": {"webpage/url": url},
        }, callback)
        print("Queries dispatched, now waiting for results")
        # `await` is a reserved word from Python 3.7 on; going through getattr
        # calls the very same method while keeping this file parseable there.
        getattr(queryLatch, "await")()
        print(json.dumps(dataRows, indent=4))

        # Round 2: size the latch by the number of queries actually sent.
        # Rows without a 'url' dispatch nothing, so counting them would leave
        # the wait below blocked forever.
        article_urls = [row['url'] for row in dataRows if 'url' in row]
        queryLatch = latch.latch(len(article_urls))
        for article_url in article_urls:
            client.query({
                "connectorGuids": [article_connector],
                "input": {"webpage/url": article_url},
            }, callback2)
        getattr(queryLatch, "await")()
        print(json.dumps(dataRows2, indent=4))

        return [
            AnkiCard(row["title"], row["first_par"])
            for row in dataRows2
            if "title" in row and "first_par" in row
        ]
    finally:
        # Runs on success and on failure alike, so a failed run neither leaks
        # the connection nor leaves stale rows behind for the next call.
        client.disconnect()
        reinitGlobalVariables()
def reinitGlobalVariables():
    """Rebind both module-level result buffers to fresh, empty lists."""
    global dataRows, dataRows2
    dataRows, dataRows2 = [], []
def _handle_query_message(query, message, rows):
    """Shared body of ``callback`` and ``callback2``.

    Prints what arrived, appends the ``"results"`` of a non-error data message
    to *rows*, and counts the module-level ``queryLatch`` down once *query*
    reports that it has finished.

    Args:
        query: Object exposing ``finished()``.
        message: Mapping with a ``"type"`` and a ``"data"`` entry.
        rows: List that receives the result rows.
    """
    message_type = message["type"]

    # Disconnect messages happen if we disconnect the client library while a
    # query is in progress.
    if message_type == "DISCONNECT":
        print("Query in progress when library disconnected")
        print(json.dumps(message["data"], indent=4))

    # Check the message we receive actually has some data in it.
    if message_type == "MESSAGE":
        payload = message["data"]
        if "errorType" in payload:
            # We received a message, but it was an error from the external
            # service.
            print("Got an error!")
            print(json.dumps(payload, indent=4))
        else:
            # We got a message and it was not an error, so keep its rows.
            print("Got data!")
            print(json.dumps(payload, indent=4))
            rows.extend(payload["results"])

    # When the query is finished, count the latch down so the waiting caller
    # can continue once everything is done.
    if query.finished():
        queryLatch.countdown()


def callback2(query, message):
    """Message handler for the second query round; fills ``dataRows2``."""
    # The global is looked up on every call, so a rebinding done by
    # reinitGlobalVariables() is picked up.
    _handle_query_message(query, message, dataRows2)


def callback(query, message):
    """Message handler for the first query round; fills ``dataRows``."""
    # The global is looked up on every call, so a rebinding done by
    # reinitGlobalVariables() is picked up.
    _handle_query_message(query, message, dataRows)
def printCardsAsCsv(cards,filename):
    """Write *cards* to *filename* as a semicolon-delimited CSV file.

    One row is written per card: ``front`` first, then ``back``. Semicolons
    in the back text are turned into commas before writing; the front text is
    written unchanged. The file is opened in ``'wb'`` mode, so an existing
    file is overwritten.
    """
    with open(filename, 'wb') as out:
        rows = ([card.front, card.back.replace(";", ",")] for card in cards)
        csv.writer(
            out, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL
        ).writerows(rows)
# Issue queries to your data sources with your own inputs.
# You can modify the inputs and connectorGuids to query your own sources.
# Query for the title and first paragraph of Wikipedia articles (crawler).
if __name__ == "__main__":
    # Start the run with empty result buffers.
    dataRows, dataRows2 = [], []

    # import.io credentials.
    user_id = "93e6a27f-a52e-4ecc-8c70-79b1df692285"
    api_key = ""  # complete this line

    # Page to query and the deck name, which also names the output file.
    url = "http://en.wikipedia.org/wiki/Category:Statistical_paradoxes"
    deckName = "StatParadoxes"

    cards = GenerateAnkiCardsFromWikipediaCategory(
        url=url, deckName=deckName, user_id=user_id, api_key=api_key
    )
    printCardsAsCsv(cards=cards, filename=deckName + ".csv")