-
Notifications
You must be signed in to change notification settings - Fork 3
/
old_main.py
478 lines (427 loc) · 23.2 KB
/
old_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
"""
@author: Elias Kordoulas, python's butcher
@license: MIT
"""
import json
import datetime
import plotme
#This file is the same as main.py but it's missing the 'CLI', thus you can call the functions directly in a simpler fashion
#Instructions are at the bottom of the file
"""
#time is stored in epochs ['timestamp_ms'] so we have to make it into a date %Y %m %d
@returns a datetime.datetime object of which values can be accessed like
.year
.month
.day
"""
def getDateFromTimestamp(timestamp):
    """Convert a millisecond timestamp into a datetime.datetime object.

    @timestamp anything float() accepts (int, float or numeric string); the value is divided
               by 1e3 before conversion, i.e. it is treated as milliseconds
    @returns a naive datetime.datetime (local time, as produced by datetime.fromtimestamp)
             whose .year / .month / .day / .hour attributes can be read directly
    """
    seconds = float(timestamp) / 1e3
    return datetime.datetime.fromtimestamp(seconds)
#helper function for the month-based statistics
def getMonthName(month):
    """Return the English name of a month.

    @month 1-based month number (1 is January, 12 is December)
    @returns a string with the month's name
    """
    names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    position = month - 1  # the tuple is 0-based, month numbers are 1-based
    return names[position]
def getParticipants():
    """Collect the name of everyone who sent at least one message.

    @WHY the export has a 'participants' field, but it does not account for people who were in the
         conversation and have since left, so the senders are gathered from the messages themselves
    @returns a list of sender names, in order of first appearance in messagesList
    """
    # dict keys are unique and keep insertion order, which gives the de-duplicated list
    firstSeen = dict.fromkeys(message['sender_name'] for message in messagesList)
    return list(firstSeen)
def getTotalMessages():
    """Count every message in the conversation.

    @returns an int, the number of entries in the global messagesList
    """
    messageCount = len(messagesList)
    return messageCount
def getSpanOfConversation():
    """Describe how long the conversation lasted.

    Uses the @global 'diff' (last message date minus first message date) that is set when the
    message files are loaded at the bottom of the file.
    @returns a string such as '42 days' (diff.days plus one)
    """
    spanInDays = diff.days + 1
    return "%s days" % spanInDays
def getTotalMessagesPerParticipant(includeTotal=False):
    """Count how many messages each participant sent.

    @includeTotal when true, an extra 'Total' key holding the number of all messages is added.
                  Defaults to False so the function can also be called without arguments.
    @returns a dict { participantName : messages } with one key per entry of participantsList
    """
    counts = dict.fromkeys(participantsList, 0)
    for message in messagesList:
        counts[message['sender_name']] += 1
    if includeTotal:
        counts['Total'] = len(messagesList)
    return counts
def getMonthWithMostMessages(this):
    """Pick the busiest month out of a per-month message count.

    @this the dict returned by getMessagesPerMonth(), { 'year monthName' : messages }
    @why a separate function: sorting inside getMessagesPerMonth() would make its data
         less useful for plotting
    @returns a tuple (str, str) with the month's key and its message count, or None when the
             dict is empty or no month has more than 0 messages. Ties go to the earliest key.
    """
    if not this:
        return None
    # max() keeps the first of several equal maxima, i.e. the earliest month in the dict
    busiestMonth, busiestCount = max(this.items(), key=lambda pair: pair[1])
    if busiestCount > 0:
        return (busiestMonth, str(busiestCount))
    return None
def getMessagesPerMonth():
    """Count the messages of all participants for every month that has messages.

    @returns a dict where keys are 'year monthName' and values are ints,
             ex { '2020 February' : 2469 }; keys appear in the order they are first met
    """
    monthlyCounts = {}
    for message in messagesList:
        sentAt = getDateFromTimestamp(message['timestamp_ms'])
        label = '%s %s' % (sentAt.year, getMonthName(sentAt.month))  #'2020 February'
        monthlyCounts[label] = monthlyCounts.get(label, 0) + 1
    return monthlyCounts
def getMessagesPerMonthPerParticipant():
    """Count, for every month, how many messages each participant sent.

    @structure is:
        {
            date(y m)  : { p1 : v, p2 : v },
            date2(y m) : { p1 : v, p2 : v }
        }
    @access by : dict[date][participant] gives the messages of that participant for that date
    @returns a dict of dicts; every month holds all participants, with 0 for those who were silent
    """
    breakdown = {}
    for monthKey in getMessagesPerMonth():
        # a fresh inner dict per month so the months never share counters
        breakdown[monthKey] = dict.fromkeys(participantsList, 0)
    for message in messagesList:
        sentAt = getDateFromTimestamp(message['timestamp_ms'])
        monthKey = str(sentAt.year) + ' ' + str(getMonthName(sentAt.month))
        sender = message['sender_name']
        breakdown[monthKey][sender] += 1
    return breakdown
def getMessagesPerDay(**kwargs):
    """Count the messages sent on every calendar day of the conversation.

    @**kwargs 'words'=[str1, str2, str3, etc] (any iterable of strings): when given, only messages
              whose text contains at least one of these words (whitespace-split, exact match)
              are counted. Other keyword arguments are ignored.
    @returns a dict in form of { date(y m d) : messages } with one key for every day from the day
             of the first message (@global sdate) to the day of the last one (sdate + @global diff),
             days without (matching) messages hold 0
    """
    keywords = set(kwargs['words']) if 'words' in kwargs else None
    # Work with calendar dates: diff.days alone depends on the time of day of the first and last
    # message, so using it directly can yield a day too few or (with +2) a day too many.
    firstDay = sdate.date()
    lastDay = (sdate + diff).date()
    perDay = dict()
    for offset in range((lastDay - firstDay).days + 1):
        day = firstDay + datetime.timedelta(days=offset)
        perDay[day.strftime('%Y %m %d')] = 0
    for m in messagesList:
        currentDay = getDateFromTimestamp(m['timestamp_ms']).strftime('%Y %m %d')
        if keywords is None:
            perDay[currentDay] += 1
        elif 'content' in m and not keywords.isdisjoint(m['content'].split()):
            # only text messages can match; photos, calls etc. carry no 'content'
            perDay[currentDay] += 1
    return perDay
def getMessagesPerDayPerParticipant():
    """Count, for every calendar day of the conversation, the messages of each participant.

    @representation { date : {'participant' : messages} }
    final structure should be e.g. :
        {
            '2018 01 15' : {'p1' : 30, 'p2' : 39},
            '2019 09 12' : {'p1' : 39, 'p2' : 30}
        }
    @returns a dict with one key for every day from the day of the first message (@global sdate)
             to the day of the last one (sdate + @global diff); silent days hold 0 for everyone
    """
    # Work with calendar dates: diff.days alone depends on the time of day of the first and last
    # message, so range(diff.days+2) sometimes created an extra day after the last message.
    firstDay = sdate.date()
    lastDay = (sdate + diff).date()
    perDay = dict()
    for offset in range((lastDay - firstDay).days + 1):
        day = firstDay + datetime.timedelta(days=offset)
        perDay[day.strftime('%Y %m %d')] = dict.fromkeys(participantsList, 0)
    for m in messagesList:
        currentDay = getDateFromTimestamp(m['timestamp_ms']).strftime('%Y %m %d')
        perDay[currentDay][m['sender_name']] += 1
    return perDay
def getReponseTimePerMessage(**kwargs):
    """Measure how long every response took.

    A response is a message whose sender differs from the sender of the message right before it;
    the time measured is the gap between those two messages.
    @note does not care for the amount of participants in the conversation
    @**kwargs 'time' = 'Seconds' | 'Minutes' | 'Hours', the unit of the result (default is seconds)
    @alternative the first message of someone until the first message of another could be used, but
        that creates situations where it's not really a response but a new starting point for the
        conversation (e.g. after 'goodbye' or 'talk to you then'). Telling those apart would
        require contextual analysis.
    @also the sender of the last message a week ago may be the one who starts talking again; such
        a gap is never counted because the sender did not change.
    @returns a list with one value per response: the gap floor-divided by the unit's seconds
    """
    secondsPerUnit = {'Seconds': 1, 'Minutes': 60, 'Hours': 3600}
    divisor = secondsPerUnit[kwargs['time']] if kwargs else 1
    gaps = []
    for earlier, later in zip(messagesList, messagesList[1:]):
        if later['sender_name'] == earlier['sender_name']:
            continue  # same person kept talking, not a response
        elapsed = getDateFromTimestamp(later['timestamp_ms']) - getDateFromTimestamp(earlier['timestamp_ms'])
        gaps.append(elapsed.total_seconds() // divisor)
    return gaps
def getGlobalAverageResponseTimePerParticipant(**kwargs):
    """Average how long each participant takes to respond.

    A message counts as a response when its sender differs from the sender of the message right
    before it; the response is credited to the sender of the later message.
    @caution in group conversations where multiple people respond to a message sent by one it's
             difficult to filter out, that would need context awareness
    @**kwargs 'time' = 'Seconds' | 'Minutes' | 'Hours', the unit of the result (default 'Seconds')
    @returns a dict { participant : average }, where average is (total seconds waited / responses)
             rounded to 2 decimals and then floor-divided by the unit's seconds.
             Participants who never responded get 0.0 instead of raising ZeroDivisionError.
    """
    times = {'Seconds' : 1, 'Minutes' : 60, 'Hours' : 60*60}
    divisor = times[kwargs.get('time', 'Seconds')]
    secondsWaited = dict.fromkeys(participantsList, 0)
    responses = dict.fromkeys(participantsList, 0)
    for current, nuxt in zip(messagesList, messagesList[1:]):
        if current['sender_name'] != nuxt['sender_name']:
            difference = getDateFromTimestamp(nuxt['timestamp_ms']) - getDateFromTimestamp(current['timestamp_ms'])
            responder = nuxt['sender_name']
            secondsWaited[responder] += difference.total_seconds()
            responses[responder] += 1
    averages = dict()
    for participant in participantsList:
        if responses[participant] == 0:
            # e.g. the only sender of the conversation, nothing to average
            averages[participant] = 0.0
        else:
            averages[participant] = round(secondsWaited[participant] / responses[participant], 2) // divisor
    return averages
def getAverageResponseTimePerMonth(**kwargs):
    """Average the response time of all participants for every month.

    Same idea as the per-participant average, only everything is grouped per month and the
    participants are ignored, as this is a global average.
    @reminder the month of the later message is used, because that message is the response
    @**kwargs 'time' = 'Seconds' | 'Minutes' | 'Hours', the unit of the result (default 'Seconds')
    @returns a dict { 'year monthName' : average } where average is (total seconds / responses)
             of that month, rounded to 2 decimals and then floor-divided by the unit's seconds.
             Months without any response are not present.
    """
    times = {'Seconds' : 1, 'Minutes' : 60, 'Hours' : 60*60}
    divisor = times[kwargs.get('time', 'Seconds')]
    # Both totals are keyed by month, so a response is always added to its own month
    # regardless of the order the messages are visited in.
    secondsPerMonth = dict()
    responsesPerMonth = dict()
    for current, nuxt in zip(messagesList, messagesList[1:]):
        if current['sender_name'] != nuxt['sender_name']:
            date = getDateFromTimestamp(nuxt['timestamp_ms'])
            difference = date - getDateFromTimestamp(current['timestamp_ms'])
            key = str(date.year) + ' ' + str(getMonthName(date.month))
            secondsPerMonth[key] = secondsPerMonth.get(key, 0) + difference.total_seconds()
            responsesPerMonth[key] = responsesPerMonth.get(key, 0) + 1
    return {
        key: round(secondsPerMonth[key] / responsesPerMonth[key], 2) // divisor
        for key in secondsPerMonth
    }
def getMessagesPerDayOfTheWeek():
    """Count the messages of all participants per day of the week.

    @returns a dict { weekday : value } with the keys 'Monday' through 'Sunday'
    """
    dayNames = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
    tally = {dayName: 0 for dayName in dayNames}
    for message in messagesList:
        dayIndex = getDateFromTimestamp(message['timestamp_ms']).weekday()  # 0 is Monday
        tally[dayNames[dayIndex]] += 1
    return tally
def getMessagesPerTimeOfDay():
    """Count the messages of all participants per hour of the day.

    @returns a dict { time : value } where the key is the hour as an int (13:34 is 13) and
             every hour from 0 to 23 is present
    """
    hourly = {hour: 0 for hour in range(24)}
    for message in messagesList:
        sentAt = getDateFromTimestamp(message['timestamp_ms'])
        hourly[sentAt.hour] += 1
    return hourly
def getAverageWordsPerMessage():
    """Average the number of words of a text message.

    Only messages that carry a 'content' key are considered (text messages, not photos,
    audio etc.); words are whatever str.split() yields.
    @returns a float rounded to 2 decimals, 0.0 when there is no text message at all
    """
    wordCounts = [len(m['content'].split()) for m in messagesList if 'content' in m]
    if not wordCounts:
        # nothing to average, avoid dividing by zero
        return 0.0
    return round(sum(wordCounts) / len(wordCounts), 2)
def getAverageWordsPerMessagePerMonth():
    """Average the number of words of a text message for every month.

    Only messages that carry a 'content' key are considered; words are whatever str.split() yields.
    @returns a dict { 'year monthName' : totalWordsOfMonth / textMessagesOfMonth } rounded to
             2 decimals. Months without any text message are not present; an empty
             conversation gives an empty dict.
    """
    wordsPerMonth = dict()  #holds the amount of words for each month
    textsPerMonth = dict()  #holds the amount of text messages for each month, the divisor
    for m in messagesList:
        if 'content' not in m:  #not a text message
            continue
        date = getDateFromTimestamp(m['timestamp_ms'])
        # the year is part of the key, so the same month of different years is kept apart
        key = str(date.year) + ' ' + str(getMonthName(date.month))
        wordsPerMonth[key] = wordsPerMonth.get(key, 0) + len(m['content'].split())
        textsPerMonth[key] = textsPerMonth.get(key, 0) + 1
    # every month is divided by its own amount of text messages
    return {key: round(wordsPerMonth[key] / textsPerMonth[key], 2) for key in wordsPerMonth}
def getAverageWordsPerMessagePerParticipant():
    """Average the number of words of a text message for every participant.

    Only messages that carry a 'content' key are considered; words are whatever str.split() yields.
    @returns a dict { participant : averageWords } with one key per entry of participantsList,
             value is totalWordsOfParticipant / textMessagesOfParticipant rounded to 2 decimals.
             Participants without any text message get 0.0 instead of raising ZeroDivisionError.
    """
    wordTotals = dict.fromkeys(participantsList, 0)
    textCounts = dict.fromkeys(participantsList, 0)
    for m in messagesList:
        if 'content' in m:
            sender = m['sender_name']
            wordTotals[sender] += len(m['content'].split())
            textCounts[sender] += 1
    pDict = dict()
    for participant in participantsList:
        if textCounts[participant] == 0:
            # e.g. someone who only sent photos
            pDict[participant] = 0.0
        else:
            pDict[participant] = round(wordTotals[participant] / textCounts[participant], 2)
    return pDict
def getMessagesPerType():
    """Count the messages of all participants per type of message.

    The type of a message is recognised by the first of its keys that is one of:
    audio_files | photos | call_duration | videos | share | gifs. A message with none of
    these keys is a simple text message.
    @returns a dict { typeOfMessage : value } with the keys 'Text messages', 'Photos', 'Videos',
             'Voice recordings', 'Voice calls', 'Shares' and 'gifs'
    """
    #maps the key found in a message to the name used in the returned dict
    labelFor = {
        "simple": 'Text messages',
        "photos": 'Photos',
        "videos": 'Videos',
        "audio_files": 'Voice recordings',
        "call_duration": 'Voice calls',
        "share": 'Shares',
        "gifs": 'gifs',
    }
    tally = {label: 0 for label in labelFor.values()}
    special = 0
    for message in messagesList:
        found = next((field for field in message if field in labelFor), None)
        if found is not None:
            tally[labelFor[found]] += 1
            special += 1
    #all the messages minus the special ones are the simple text ones
    tally['Text messages'] = getTotalMessages() - special
    return tally
def getMessagesPerTypePerParticipant():
    """Count, for every participant, their messages per type of message.

    @structure
        {
            'participant1' : {'Text messages' : 1, 'Photos' : 145, ...},
            'participant2' : {'Text messages' : 4, 'Photos' : 123, ...}
        }
    The type of a message is recognised by the first of its keys that names a type; not every
    message has such a key, and those without one are counted as text messages.
    @returns a dict of dicts, each inner dict has the keys 'Text messages', 'Photos', 'Videos',
             'Voice recordings', 'Voice calls', 'Shares' and 'gifs'
    """
    #maps the key found in a message to the name used in the returned dicts
    labelFor = {
        "simple": 'Text messages',
        "photos": 'Photos',
        "videos": 'Videos',
        "audio_files": 'Voice recordings',
        "call_duration": 'Voice calls',
        "share": 'Shares',
        "gifs": 'gifs',
    }
    #every participant gets their own inner dict, a shared one would mix up the counters
    perParticipant = {}
    for participant in participantsList:
        perParticipant[participant] = {label: 0 for label in labelFor.values()}
    for message in messagesList:
        found = next((field for field in message if field in labelFor), None)
        label = 'Text messages' if found is None else labelFor[found]
        perParticipant[message['sender_name']][label] += 1
    return perParticipant
def getMostCommonWords(**kwargs):
    """Count how many times every word was sent.

    Words are whatever str.split() yields for the 'content' of a message, compared exactly
    (case and punctuation are kept).
    @**kwargs 'range' = x (int or numeric string), keep only the x most sent words
    @returns a dict { word : timesSent } sorted from the most to the least sent word;
             words sent equally often keep the order they were first met in
    """
    from collections import Counter
    from operator import itemgetter
    frequency = Counter()
    for message in messagesList:
        if 'content' in message:
            frequency.update(message['content'].split())
    ranked = sorted(frequency.items(), key=itemgetter(1), reverse=True)
    if 'range' in kwargs:
        limit = int(kwargs['range'])
        #a limit outside of 0..len-1 can't cut anything off, so the full ranking is returned
        if 0 <= limit < len(ranked):
            return dict(ranked[:limit])
    return dict(ranked)
def getAmmountOfUniqueWords():
    """Count the different words sent in the conversation.

    Words are whatever str.split() yields for the 'content' of a message, compared exactly.
    @returns an int
    """
    uniqueWords = {
        word
        for message in messagesList
        if 'content' in message
        for word in message['content'].split()
    }
    return len(uniqueWords)
def getDaysWithMostMessages(**kwargs):
    """Rank the days of the conversation by the amount of messages sent.

    @**kwargs 'range' = x, keep only the x busiest days. When x is larger than the number of
              days, all days are returned (this used to return None); a negative x is treated
              as 0. Other keyword arguments are ignored.
    @returns a dict { date(y m d) : messages } built from getMessagesPerDay(), sorted from the
             busiest to the quietest day
    """
    ranked = sorted(getMessagesPerDay().items(), key=lambda item: item[1], reverse=True)
    if 'range' in kwargs:
        limit = max(int(kwargs['range']), 0)
        ranked = ranked[:limit]
    return dict(ranked)
def getGlobalAverageResponseTime():
    """Average the response time over the whole conversation.

    Sums the list returned by getReponseTimePerMessage() (in seconds) and divides it by the
    length of that list, since every entry in it is a response.
    @returns a float (response time in seconds, no rounding), 0.0 when the conversation has
             no response at all (e.g. a single sender) instead of raising ZeroDivisionError
    """
    responseTimes = getReponseTimePerMessage()  #returns the list in seconds
    if not responseTimes:
        return 0.0
    return sum(responseTimes) / len(responseTimes)
def _repairEncoding(text):
    """Undo the latin1/utf8 mix-up of a string, returning it untouched when it can't be undone."""
    try:
        return text.encode('latin1').decode('utf8')
    except UnicodeError:
        # the text is not mis-encoded in the expected way (e.g. it already is proper unicode)
        return text


def dictToList(messagesFile):
    """Strip everything but the messages from a loaded message file and repair their text.

    The 'sender_name' and 'content' of every message go through an encode('latin1') /
    decode('utf8') round trip, which is needed because of messenger's choice of text encoding.
    Text that can't take that round trip is left as it is instead of aborting the load.
    @messagesFile the dict loaded from a message json, must have a 'messages' key
    @note the message dicts are updated in place
    @returns a list of the message dicts instead of the whole dict
    """
    messages = list(messagesFile['messages'])
    for message in messages:
        for key in ('sender_name', 'content'):
            if key in message:  #not every message has text
                message[key] = _repairEncoding(message[key])
    return messages
def getWordAppearances(word):
    """Count the times a word is seen in all the messages.

    Every message with text is split with .split() and the pieces are compared to the word
    exactly, so 'hello' does not match 'Hello' or 'hello!'.
    @word the string to look for
    @returns an int
    """
    return sum(
        message['content'].split().count(word)
        for message in messagesList
        if 'content' in message  #not every message is a text
    )
#****************START****************#
messagesFile = None
messagesList = [] #Will be a list of dicts containing the content of each message
###Change the name variable to your folder containing the jsons inside MessagesSources folder.###
from os import listdir
import time
stime = time.time()
name = 'skg' #CHANGE THIS
dir = r'./MessagesSources/'+name+'/'
#Count only the message_N.json files: any other entry in the folder would otherwise raise the
#count and make the loop below try to open a message file that does not exist
messageFileCount = sum(1 for entry in listdir(dir) if entry.startswith('message_') and entry.endswith('.json'))
for i in range(1, messageFileCount+1):
    with open(dir+'message_'+str(i)+'.json', 'r', encoding="utf-8") as f:
        messagesFile = json.load(f)
    messagesList += dictToList(messagesFile) #Creating a list containing only the messages
messagesList.reverse() #the JSON files are newest first, we reverse the list to go from first to last sequentially later
messagesFile = None #the raw file content is not needed anymore
participantsList = getParticipants()
#the globals below are read by the per day / span functions above
ldate = getDateFromTimestamp(messagesList[-1]['timestamp_ms']) #date of the last message
sdate = getDateFromTimestamp(messagesList[0]['timestamp_ms']) #date of the first message
diff = ldate - sdate
print('---- loading took %s seconds to complete ----' %(time.time() - stime))
#################################### ATTENTION ####################################
#There is no networking in this thing, you are safe unless you have malware. Go check that.
""" How to call Functions and Plots
Every function here returns something; what it returns is documented alongside its definition.
Functions do not need the messages as arguments because messagesList contains them all in proper format
and is global.
If a function takes **kwargs check the declaration to find out why, it's probably time representation so not
important as there are already default values set.
To print anything, call the function ex.'getMessagesPerMonth()' in a print statement. Every function returns
an ordered data structure, so you need not worry about funky business
To plot something, check the functions inside the plotme.py file. All the needed plotting is there but
some are used for multiple things, examples have been left below but the naming scheme helps somewhat.
To plot something you call the plotting function through plotme and they take as arguments the data structures
provided by the functions in here.
e.g. 'plotme.plotLineGraph_MessagesPerDay(getMessagesPerDay())'
In this example there is a function that returns a structure with the days that messages were sent with specific
words. In that case you'd call:
kw = ['hello','world']
'plotme.plotLineGraph_MessagesPerDay(getMessagesPerDay(words=kw),kw)'
this is because messagesPerDay() has **kwargs and expects a list of words and
plotme has *args so it can set the proper title for the function.
Feel free to change anything and submit a pull request if you optimize or fix something.
@toMyDefence, yes many optimizations could be made, such as creating a perMonthList to iterate over instead
of going through all the messagesList and doing that dynamically every time. Why did I not? well
my bad
This thing is so fast anyway that it doesn't matter.
Average full analysis at saveAll() for a conversation with 300K messages is 5seconds so we'll survive.
Thank you!
"""
#print(getMessagesPerMonthPerParticipant())
#print(getDaysWithMostMessages(range=5))
#print(getMostCommonWords(range=5))
#print(getGlobalAverageResponseTimePerParticipant(time='Minutes'))
#getAverageResponseTimePerMonth()
#getAverageWordsPerMessagePerMonth()
#plotme.plotBarGraph_AveragePerMonth_General(getAverageWordsPerMessagePerMonth(),title='Words Per Message')
#plotme.plotBarGraph_MessagesPerMonth(getAverageResponseTimePerMonth(time='Minutes'),title='Average Response Time',time='Minutes')
#plotme.plotBarGraph_MessagesPerMonthPerParticipant(getMessagesPerMonthPerParticipant(), participantsList)
#plotme.plotBarGraph_MessagesPerMonth(getMessagesPerMonth())
#plotme.plotLineGraph_MessagesPerMonth(getMessagesPerMonth())
#plotme.plotLineGraph_MessagesPerDayPerParticipant(getMessagesPerDayPerParticipant(), participantsList, False)
#plotme.plotLineGraph_MessagesPerDay(getMessagesPerDay())
#plotme.plotBarGraph_TotalMessages_PerParticipant(getTotalMessagesPerParticipant())
#plotme.plotBarGraph_MessagesPerMonth(getMessagesPerMonth())
#plotme.plotSpiderGraph(getMessagesPerDayOfTheWeek())
#plotme.plotLineGraph_TimeOfResponsePerMessage(getReponseTimePerMessage(time='Minutes'),time='Minutes')
#print(getMostCommonWords())