#!/usr/bin/env python3
import skim_utils
import subprocess
from time import perf_counter as pc
from typing import List, Dict


class SkimContentCheck:
    '''
    Once all sites have been processed, we take the current list of content hashes
    and compare them to the relevant hashes from the last pass to detect any changes.
    If the hashes of a site do not match, we take the current content and compare it
    to the content from the last pass to indicate exactly what was modified.
    These details are then emailed as an alert for further investigation.
    '''

    def __init__(self):
        # Per-method timings, recorded with time.perf_counter.
        self.perf_get_hash_results = "n/a"
        self.perf_get_dir_name_cmd_builder = "n/a"
        self.perf_get_dir_name = "n/a"
        self.perf_get_file_name = "n/a"
        self.perf_compare_hashes = "n/a"
        self.perf_get_content = "n/a"
        self.perf_search_content = "n/a"

    def get_hash_results(self, path: str) -> Dict:
        '''
        Read a hash file from disk into a Dictionary.
        Split each line into URL and hash digest using "~" as the delimiter,
        then map URL to hash in the Dict.
        '''
        try:
            p1 = pc()
            hash_dict: Dict = {}
            print("\n\nget_hash_results file to open: " + str(path))
            path = str(path.rstrip("\n"))
            with open(path, "r") as file:
                for line in file:
                    site, content_hash = line.rstrip("\n").split("~")
                    hash_dict[str(site)] = str(content_hash)
            p2 = pc()
            self.perf_get_hash_results = p2 - p1
            return hash_dict
        except Exception as e:
            print("Error! in SkimContentCheck.get_hash_results: " + str(e))

    def get_dir_name_cmd_builder(self, which_results: str) -> str:
        '''
        Return a command string to be run by "get_dir_name" according to
        the "which_results" parameter.
        '''
        try:
            p1 = pc()
            # Imported here to avoid importing skim_controller at module load.
            import skim_controller
            basepath: str = skim_controller.SkimController().basepath
            # Newest-first listing of the timestamped (20xx...) result directories.
            cmd1 = "ls -td " + str(basepath) + "20* | head -n 1"
            cmd2 = "ls -td " + str(basepath) + "20* | head -n 2 | tail -n 1"
            if which_results == "most_recent":
                cmd = cmd1
            elif which_results == "second":
                cmd = cmd2
            else:
                raise ValueError("unknown which_results: " + str(which_results))
            p2 = pc()
            self.perf_get_dir_name_cmd_builder = p2 - p1
            return str(cmd)
        except Exception as e:
            print("Error! in SkimContentCheck.get_dir_name_cmd_builder: " + str(e))

    def get_dir_name(self, cmd: str) -> str:
        '''
        Return the name of the directory after running the "cmd" parameter.
        '''
        try:
            p1 = pc()
            lint = skim_utils.SkimUitls().lint
            lint("CMD for dir name: " + str(cmd))
            result = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True)
            if result.returncode != 0:
                raise IOError("command failed: " + str(cmd))
            dir_out = result.stdout.decode("utf-8").rstrip("\n")
            lint(str(dir_out))
            p2 = pc()
            self.perf_get_dir_name = p2 - p1
            return str(dir_out)
        except IOError as i:
            print("Error! in SkimContentCheck.get_dir_name: IOError: " + str(i))
        except Exception as e:
            print("Error! in SkimContentCheck.get_dir_name: " + str(e))

    def get_file_name(self, dir_name: str, file_to_return: str) -> str:
        '''
        Return the name of the matching file ("hash" or "content") in the
        directory calculated by get_dir_name.
        '''
        try:
            p1 = pc()
            lint = skim_utils.SkimUitls().lint
            file_cmd = "ls -td " + str(dir_name) + "/*" + str(file_to_return) + "*"
            lint("File cmd: " + str(file_cmd))
            result = subprocess.run(file_cmd, stdout=subprocess.PIPE, shell=True)
            if result.returncode != 0:
                raise IOError("command failed: " + str(file_cmd))
            file_name_out = result.stdout.decode("utf-8")
            lint("File name: " + str(file_name_out))
            p2 = pc()
            self.perf_get_file_name = p2 - p1
            return str(file_name_out)
        except IOError as i:
            print("Error! in SkimContentCheck.get_file_name: IOError: " + str(i))
        except Exception as e:
            print("Error! in SkimContentCheck.get_file_name: " + str(e))

    def compare_hashes(self, dict1: Dict, dict2: Dict) -> List:
        '''
        Use URLs as keys, get the corresponding hash from both dictionaries
        and compare. Return a List of URLs with non-matching hashes.
        '''
        try:
            p1 = pc()
            lint = skim_utils.SkimUitls().lint
            mismatches = []
            apd = mismatches.append
            for key in dict1:
                hash_latest = dict1.get(key)
                hash_second = dict2.get(key)
                # Skip URLs that are missing from either pass.
                if (not hash_latest) or (not hash_second):
                    continue
                if hash_latest != hash_second:
                    lint("ALERT!! - Content has changed on: " + str(key) + "\n")
                    lint(str(key))
                    lint(str(hash_latest))
                    lint(str(hash_second))
                    apd(str(key))
            p2 = pc()
            self.perf_compare_hashes = p2 - p1
            return mismatches
        except Exception as e:
            print("Error! in SkimContentCheck.compare_hashes: " + str(e))

    def get_content(self, file_name: str) -> List:
        '''
        Read a content file into a List, one entry per line.
        '''
        try:
            p1 = pc()
            with open(file_name.rstrip("\n"), "r") as file:
                content = file.readlines()
            p2 = pc()
            self.perf_get_content = p2 - p1
            return content
        except Exception as e:
            print("Error! in SkimContentCheck.get_content: " + str(e))

    def search_content(self, domain: str, content_list: List) -> List:
        '''
        Find and retrieve domain-specific content from the List: the lines
        from the starter marker up to (but not including) the ender marker.
        '''
        try:
            p1 = pc()
            starter = ("$$$$$$$$$$~~~~~~~~~~$$$$$$$$$$" + str(domain) +
                       "$$$$$$$$$$~~~~~~~~~~$$$$$$$$$$").rstrip("\n").rstrip(" ")
            ender = ("%%%%%%%%%%~~~~~~~~~~~%%%%%%%%%%" + str(domain) +
                     "%%%%%%%%%%~~~~~~~~~~~%%%%%%%%%%").rstrip("\n").rstrip(" ")
            site_content = []
            apnd = site_content.append
            write_flag = False
            for line in content_list:
                if not write_flag:
                    if starter in line:
                        write_flag = True
                elif ender in line:
                    # Stop before the ender line; it is not part of the content.
                    break
                if write_flag:
                    apnd(line)
            p2 = pc()
            self.perf_search_content = p2 - p1
            return site_content
        except Exception as e:
            print("Error! in SkimContentCheck.search_content: " + str(e))

    def print_perf_values(self):
        '''
        Display the performance timings for each method.
        '''
        lint = skim_utils.SkimUitls().lint
        lint(
            "\nperf_get_hash_results: " + str(self.perf_get_hash_results) +
            "\nperf_get_dir_name_cmd_builder: " + str(self.perf_get_dir_name_cmd_builder) +
            "\nperf_get_dir_name: " + str(self.perf_get_dir_name) +
            "\nperf_get_file_name: " + str(self.perf_get_file_name) +
            "\nperf_compare_hashes: " + str(self.perf_compare_hashes) +
            "\nperf_get_content: " + str(self.perf_get_content) +
            "\nperf_search_content: " + str(self.perf_search_content))


def main():
    '''
    Content checking execution flow is driven from here.
    '''
    try:
        lint = skim_utils.SkimUitls().lint
        check = SkimContentCheck()
        most_recent = check.get_dir_name_cmd_builder("most_recent")
        second = check.get_dir_name_cmd_builder("second")
        most_recent_dir = check.get_dir_name(most_recent)
        second_dir = check.get_dir_name(second)
        last_hash_results_file_name = check.get_file_name(most_recent_dir, "hash")
        second_last_hash_results_file_name = check.get_file_name(second_dir, "hash")
        last_content_file_name = check.get_file_name(most_recent_dir, "content")
        second_last_content_file_name = check.get_file_name(second_dir, "content")
        latest_hash_results = check.get_hash_results(last_hash_results_file_name)
        second_last_hash_results = check.get_hash_results(second_last_hash_results_file_name)
        lint("\nlast_hash_results_file_name: " + str(last_hash_results_file_name) +
             " Number of results: " + str(len(latest_hash_results)))
        lint("\nsecond_last_hash_results_file_name: " + str(second_last_hash_results_file_name) +
             " Number of results: " + str(len(second_last_hash_results)))
        lint("\nlast_content_file_name: " + str(last_content_file_name))
        lint("\nsecond_last_content_file_name: " + str(second_last_content_file_name))
        mismatches = check.compare_hashes(latest_hash_results, second_last_hash_results)
        if not mismatches:
            lint("\nAll URLs have matching hashes.\n")
        else:
            lint("\n\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Number of mismatches: " + str(len(mismatches))
                 + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
            # Lazy imports: only needed when a mismatch is found.
            import gmail
            import toolbag
            col = toolbag.Toolbag().color
            # Read each content file once; search_content then runs per domain.
            last_content_list = check.get_content(last_content_file_name)
            second_last_content_list = check.get_content(second_last_content_file_name)
            for domain in mismatches:
                cont_last = check.search_content(domain, last_content_list)
                cont_second = check.search_content(domain, second_last_content_list)
                # Symmetric difference: lines that appear in exactly one pass.
                diff = list(set(cont_second) ^ set(cont_last))
                df = "\n".join(diff)
                log_mess = col("\nContent Warning - " + str(domain) + "\nThese elements are different:\n", "red")
                lint("\n" + log_mess + "\n" + col(df, "yellow"))
                mail_mess = "Content Warning - " + str(domain)
                gmail.Gmail().sendText(mail_mess, df)
        lint("Performance Timings per method:")
        check.print_perf_values()
    except Exception as e:
        print("Error! in skim_content_check.main(): " + str(e))


if __name__ == '__main__':
    main()
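

# A minimal usage sketch of the flow main() drives, with hypothetical file
# paths (real ones come from SkimController().basepath and the ls pipelines):
#
#   check = SkimContentCheck()
#   latest = check.get_hash_results("/var/skim/results/run_a/hash_results")
#   previous = check.get_hash_results("/var/skim/results/run_b/hash_results")
#   for domain in check.compare_hashes(latest, previous):
#       print(domain)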