-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.py
134 lines (111 loc) · 3.54 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
import datetime
import json
import re
import smtplib
import time
from collections import Counter
from email.mime.text import MIMEText

import schedule

from media_headers import TheEconomist, Bloomberg, WallStreetJournal, Others
# Path of the JSON file containing the personal mail data.
# It must first be configured using personal.py.
PERSONAL_FILE = 'personal_info.json'
def load_json(json_file):
    """Read the JSON document stored at *json_file* and return the decoded object.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (dict, list, ...).

    Raises:
        FileNotFoundError: If *json_file* does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which would garble non-ASCII text on non-UTF-8 locales.
    with open(json_file, 'r', encoding='utf-8') as data:
        return json.load(data)
def dump_json(dictionary, fname):
    """Serialize *dictionary* as JSON into the file *fname*.

    The file is opened in write mode, so any previous content is replaced.
    """
    with open(fname, mode='w') as out_file:
        json.dump(obj=dictionary, fp=out_file)
# Mail settings, read once at import time from PERSONAL_FILE.
personal_data = load_json(PERSONAL_FILE)
FROM, PASSWORD, SMTP_SERVER, TO = (
    personal_data[field]
    for field in ('from', 'password', 'smtp_server', 'to')
)

# CUSTOM GLOBALS
TXT_FILENAME = 'headers.txt'        # file the scraped headers are written to
JSON_FILENAME = 'words_rating.json'  # file the word ratings are stored in
TIME = '19:41'                       # time of day passed to the scheduler
def extract_data():
    """Scrape the headers and write them to the TXT file TXT_FILENAME.

    The file is rewritten on every call. It starts with the current local
    timestamp followed by an empty line; the open file handle is then handed
    to each media source in turn (The Economist, Bloomberg, The Wall Street
    Journal, and finally the remaining sources).
    """
    # The context manager closes (and flushes) the file even when one of the
    # scrapers raises, which the previous explicit close() did not guarantee.
    with open(TXT_FILENAME, 'w', encoding='utf-8') as handler:
        handler.write(str(datetime.datetime.now()) + '\n\n')
        TheEconomist(handler).process_headers()
        Bloomberg(handler).process_headers()
        WallStreetJournal(handler).process_headers()
        Others(handler).walk_through()
def send_email(filename):
    """Send the contents of *filename* as a plain-text e-mail.

    The message is sent from FROM to TO through SMTP_SERVER on port 587,
    using STARTTLS and logging in with FROM / PASSWORD. The subject is
    ``'HEADERS '`` followed by the current local timestamp.

    Args:
        filename: Path of the UTF-8 text file whose content becomes the body.

    Raises:
        OSError: If the file cannot be read or the connection fails.
        smtplib.SMTPException: If the server rejects any step of the exchange.
    """
    with open(filename, 'r', encoding="utf8") as handler:
        msg = MIMEText(handler.read())
    msg['Subject'] = 'HEADERS ' + str(datetime.datetime.now())
    msg['From'] = FROM
    msg['To'] = TO
    # Used as a context manager, SMTP issues QUIT and closes the connection
    # on exit, including when starttls/login/sendmail raises.
    with smtplib.SMTP(SMTP_SERVER, 587) as server:
        server.ehlo()
        server.starttls()
        server.login(FROM, PASSWORD)
        server.sendmail(msg['From'], msg['To'], msg.as_string())
def parse_txt(source, min_count=3):
    """Find the most common words in a headers TXT file.

    The first line of *source* is skipped, as are lines starting with
    ``'http'``. Every other line is split into lower-cased words (runs of
    word characters and apostrophes); words listed in
    'words_to_exclude.json' are ignored.

    Args:
        source: Path of the UTF-8 text file to analyse.
        min_count: Minimum number of occurrences a word needs to be reported.
            The default of 3 matches the previous hard-coded threshold.

    Returns:
        A dict mapping each reported word to its count, ordered from most to
        least frequent; words with equal counts keep first-seen order.
        Unlike the previous implementation, no placeholder ``''`` entry is
        added once the qualifying words run out.

    Raises:
        FileNotFoundError: If *source* or 'words_to_exclude.json' is missing.
    """
    # A set makes each membership test O(1).
    # NOTE(review): assumes the JSON file holds a flat collection of words.
    exclude = set(load_json('words_to_exclude.json'))
    word_pattern = re.compile(r"[\w']+")
    counts = Counter()
    with open(source, 'r', encoding='utf-8') as handler:
        next(handler, None)  # skip the first line of the file
        for line in handler:
            if line.startswith('http'):
                continue
            for word in word_pattern.findall(line):
                word = word.lower()
                if word not in exclude:
                    counts[word] += 1
    # most_common() sorts by descending count and is stable for ties.
    return {
        word: count
        for word, count in counts.most_common()
        if count >= min_count
    }
def handle_json(fname, result):
    """Update or create the JSON file *fname* with a new word-rating entry.

    *result* is stored under a key holding the current UTC time formatted as
    a string. Entries already present in the file are kept; when the file
    does not exist yet it is created with this single entry.

    Args:
        fname: Path of the JSON file holding the ratings.
        result: JSON-serializable value to store under the new timestamp key.
    """
    # Same naive-UTC string that utcnow() produced, without calling the
    # deprecated datetime.utcnow().
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    utc = str(now_utc.replace(tzinfo=None))
    # Only the load is expected to fail on a first run, so only it is guarded.
    try:
        data = load_json(fname)
    except FileNotFoundError:
        data = {}
    data[utc] = result
    dump_json(data, fname)
def deliver_headers():
    """Run the scheduled actions in order.

    Extracts the headers into TXT_FILENAME, e-mails that file, then parses
    it and stores the resulting word summary in JSON_FILENAME.
    """
    extract_data()
    send_email(TXT_FILENAME)
    handle_json(JSON_FILENAME, parse_txt(TXT_FILENAME))
if __name__ == '__main__':
    # Hourly alternative: schedule.every().hour.do(deliver_headers)
    daily_job = schedule.every().day.at(TIME)
    daily_job.do(deliver_headers)
    # Poll the scheduler once per second, forever.
    while True:
        schedule.run_pending()
        time.sleep(1)