forked from natanelia/arkintel-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawlIntelProcessor.py
executable file
·134 lines (105 loc) · 4.22 KB
/
crawlIntelProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
__author__ = 'Natan Elia'
import sys, os
sys.path.append(os.path.abspath('src'))
import CrawlUtil
import json
import re
import requests
import time
import threading
from queue import Queue
from pprint import pprint
import copy
PROCESSOR_API_URL = "https://www.kimonolabs.com/api/cd7sg4yq?apikey=b8BQTunaAccOVZAG9lpyTg1HLy4hkKXN"
PROCESSOR_EXPECTED_LIMIT = 100000
def mergeResult(urls):
    """Fetch JSON from each URL in order and merge everything into one dict.

    Fetching stops at the first response whose 'results' field is an empty
    dict — the paginated API signals exhaustion that way. Pages are merged
    with mergeDict, so list-valued fields are concatenated across pages.
    """
    session = requests.Session()
    retrying = requests.adapters.HTTPAdapter(max_retries=10)
    session.mount("http://", retrying)
    session.mount("https://", retrying)

    pages = []
    for url in urls:
        print("[PROCESSING] ", url)
        payload = CrawlUtil.getJSON(session, url)
        if payload['results'] == {}:
            break
        pages.append(payload)

    combined = {}
    for page in pages:
        combined = mergeDict(combined, page)
    return combined
def mergeDict(a, b, path=None):
    """Recursively merge dict *b* into dict *a* in place and return *a*.

    Rules, in order:
    - keys missing from *a* are copied from *b*;
    - nested dicts are merged recursively;
    - equal leaf values are left untouched;
    - two unequal lists are concatenated (a's elements first);
    - any other mismatch is resolved by letting *b* win.

    *path* tracks the key path for recursion; callers normally omit it.

    Note: the original carried a ``raise Exception('Conflict ...')`` arm that
    was unreachable — the preceding ``elif a[key] != b[key]`` always matched
    once equality had been ruled out — so it has been removed.
    """
    if path is None:
        path = []
    for key in b:
        if key not in a:
            a[key] = b[key]
        elif isinstance(a[key], dict) and isinstance(b[key], dict):
            mergeDict(a[key], b[key], path + [str(key)])
        elif a[key] == b[key]:
            pass  # same leaf value — nothing to do
        elif isinstance(a[key], list) and isinstance(b[key], list):
            a[key] = a[key] + b[key]
        else:
            # values differ and are not both dicts/lists: b takes precedence
            a[key] = b[key]
    return a
def process(phoneUrl, num, numTotal):
    """Retarget the kimono phone API at *phoneUrl*, crawl it, and save the
    result to results/<device name>.json.

    *num*/*numTotal* are used only for the progress message. Nothing is
    written (and nothing is printed) if either the retarget or the crawl
    start reports success == False.
    """
    startTime = time.time()
    s = requests.Session()
    s.mount("http://", requests.adapters.HTTPAdapter(max_retries=10))
    s.mount("https://", requests.adapters.HTTPAdapter(max_retries=10))
    phoneAPIUrl = "https://www.kimonolabs.com/api/7ltn7gno?apikey=b8BQTunaAccOVZAG9lpyTg1HLy4hkKXN&kimmodify=1"
    resp = CrawlUtil.postJSON('https://ws.kimonolabs.com/ws/updateapi/', {'apiid': '7ltn7gno', 'updateObj': {'targeturl' : phoneUrl}})
    if resp['success']:
        resp = CrawlUtil.postJSON('https://ws.kimonolabs.com/ws/startcrawl/', {'apiid': '7ltn7gno'})
        if resp['success']:
            # Poll crawl status until it finishes; sleep between polls so we
            # don't hammer the status endpoint in a tight busy-wait loop.
            resp = CrawlUtil.getJSON(s, 'https://ws.kimonolabs.com/ws/crawlstats/?apiid=7ltn7gno')
            while resp['isCrawling'] != False:
                time.sleep(1)
                resp = CrawlUtil.getJSON(s, 'https://ws.kimonolabs.com/ws/crawlstats/?apiid=7ltn7gno')
            phoneJson = CrawlUtil.getJSON(s, phoneAPIUrl)
            phoneResult = phoneJson['results']
            deviceName = phoneResult['main']['device_name']
            os.makedirs('results', exist_ok=True)
            # Device names may contain '/', which would break the file path.
            with open('results/' + deviceName.replace("/", "-", 100) + '.json', 'w') as f:
                json.dump(phoneResult, f, sort_keys=True, indent=4, separators=(',', ': '))
            print('[PROCESSED] {:s} ({:d}/{:d}) in {:f} secs'.format(deviceName, num, numTotal, (time.time() - startTime)))
def reformatRawData(r):
    """Flatten a raw crawl response into a list of per-device records.

    Each entry of r['results']['main'] becomes one dict whose 'Name' is the
    entry's title with newlines stripped; every row of r['results']['data']
    sharing the same 'url' contributes a category1 -> info field (the row's
    'info' may be either a plain string or a {'text': ...} dict).

    Mutates *r* in place — r['results'] is replaced by the flat list and
    r['count'] is updated to its length — and returns *r*.
    """
    # Index data rows by url once, instead of rescanning the whole 'data'
    # list for every title (the original was O(len(main) * len(data))).
    rows_by_url = {}
    for row in r['results']['data']:
        rows_by_url.setdefault(row['url'], []).append(row)

    flattened = []
    for entry in r['results']['main']:
        record = {'Name': entry['title'].replace('\n', '')}
        for row in rows_by_url.get(entry['url'], []):
            info = row['info']
            record[row['category1']] = info if isinstance(info, str) else info['text']
        flattened.append(record)

    r['results'] = flattened
    r['count'] = len(flattened)
    return r
def main(argv):
    """Entry point.

    With '-p' in *argv*: page through the processor API (2500 records per
    page, up to PROCESSOR_EXPECTED_LIMIT), merge all pages, and cache the
    merged payload to processors/raw-data.json. Without '-p': reuse that
    cached file. Either way, reformat the merged data and write
    processors/formatted-data.json.
    """
    if '-p' in argv:
        # One URL per page; mergeResult stops early when a page comes back empty.
        urls = []
        offset = 0
        while offset < PROCESSOR_EXPECTED_LIMIT:
            urls.append(PROCESSOR_API_URL + "&kimoffset=" + str(offset))
            offset += 2500
        merged = mergeResult(urls)
        os.makedirs('processors', exist_ok=True)
        with open('processors/raw-data.json', 'w') as f:
            f.write(json.dumps(merged, indent=4))
    else:
        # Original leaked this read handle (never closed); 'with' fixes that.
        with open('processors/raw-data.json', 'r') as f:
            merged = json.load(f)

    reformatted = reformatRawData(merged)
    with open('processors/formatted-data.json', 'w') as f:
        f.write(json.dumps(reformatted, indent=4))


# Guard so importing this module (e.g. for reuse of mergeDict) no longer
# triggers a full crawl/reformat run as the original unconditional call did.
if __name__ == '__main__':
    main(sys.argv[1:])