# -*- coding: utf-8 -*-
"""
Created on Sat Jan 21 17:40:20 2017
@author: Edward Preble
"""
###############################
#Task: Exploration and analysis of NTSB Aviation Accident dataset
#
#Major Code Sections:
# Script Setup
# ETL
# XML import
# JSON import
# Exploration & Summary Stats
# Basic Visualizations
# Text Mining
# NLTK Setup and Preprocessing
# TF-IDF
# Kmeans
# Misc
# Unused/Uninteresting Code
# Ngram Frequency Analysis
###############################
###############################
#Script Setup
###############################
#REQUIRED FILE PATHS:
path_xml = r"C:\Users\Edward\Desktop\Github\data-scientist-exercise02\data\AviationData.xml"
path_json = r"C:\Users\Edward\Desktop\Github\data-scientist-exercise02\data"
kmeans_model_output_directory = r'C:\Users\Edward\Desktop\Github\data-scientist-exercise02'
#REQUIRED PACKAGES:
import pandas as pd
import xml.etree.ElementTree as ET
import json
from os import listdir
from os.path import isfile, join
from datetime import datetime
import nltk
from nltk.util import ngrams
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
###############################
#ETL
# XML Import
###############################
tree = ET.parse(path_xml)
root = tree.getroot()
len(root[0]) #How many records? 77257
len(root[0][0].attrib.keys()) #How many columns in each record? 31
aviationdata_column_names = root[0][0].attrib.keys() #Grab list of column names
aviationdata_df = pd.DataFrame(columns=aviationdata_column_names) #Create empty dataframe
#Load XML into a dataframe; iterfind walks each XML tree branch, collecting attribute keys/values
data = []
inner = {}
for el in tree.iterfind('./*'):
    for i in el.iterfind('*'):
        for k, v in i.items():
            inner[k] = v
        data.append(inner)
        inner = {}
aviationdata_df = pd.DataFrame(data)
#Cleanup
del data, inner, i, k, v
###############################
#ETL
# JSON Import
###############################
# Get list of JSONs in path_json, omitting AviationData.xml from the list
ListOfDataFiles = [f for f in listdir(path_json) if (isfile(join(path_json, f))) & (f!='AviationData.xml')]
#Load all files in directory, parse into a list of dicts, then load them into dataframe
i=0
for f in ListOfDataFiles:
    print i, f
    CurrentFileName = path_json + '\\' + f
    input_json = open(CurrentFileName, 'rb')
    content = input_json.read()
    input_json.close()
    parsed_json = json.loads(content) #results in a list of dictionaries
    del content #This line is critical, memory bogs down incredibly if content isn't deleted each time
    json_df = pd.DataFrame(parsed_json['data']) #3-column dataframe results from each json
    #Concatenate new dataframe to the first one
    if i>0:
        merge_list = [all_jsons_df, json_df]
        all_jsons_df = pd.concat(merge_list)
    else:
        all_jsons_df = json_df.copy()
    i+=1
basic_stats = all_jsons_df.describe() #All the EventId's are unique!
#Cleanup
del parsed_json, i, f, merge_list, json_df
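#OPTIONAL ALTERNATIVE (untested sketch): collect each per-file dataframe in a list and concatenate once;
#a single pd.concat at the end is usually faster and lighter on memory than concatenating inside the loop.
#"json_frames" is just an illustrative name, not used elsewhere in this script.
#json_frames = []
#for f in ListOfDataFiles:
#    with open(join(path_json, f), 'rb') as input_json:
#        json_frames.append(pd.DataFrame(json.loads(input_json.read())['data']))
#all_jsons_df = pd.concat(json_frames, ignore_index=True)
#del json_frames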
###############################
#ETL
# Make some conversions
# Then join text data with structured data, then slice out small and large plane data
###############################
#Some basic stats/checks
basic_stats = aviationdata_df.describe()
#NOTE: EventId values are not all unique, but AccidentNumber values are...
ids = aviationdata_df["EventId"]
dups_df = aviationdata_df[ids.isin(ids[ids.duplicated()])]
#UPDATE: This is OK; one EventId can involve two aircraft, which means two AccidentNumbers
#Convert various string columns into numeric
aviationdata_df.dtypes
aviationdata_df['TotalFatalInjuries'] = pd.to_numeric(aviationdata_df['TotalFatalInjuries'], errors='coerce')
aviationdata_df['TotalSeriousInjuries'] = pd.to_numeric(aviationdata_df['TotalSeriousInjuries'], errors='coerce')
aviationdata_df['TotalMinorInjuries'] = pd.to_numeric(aviationdata_df['TotalMinorInjuries'], errors='coerce')
aviationdata_df['TotalUninjured'] = pd.to_numeric(aviationdata_df['TotalUninjured'], errors='coerce')
aviationdata_df['NumberOfEngines'] = pd.to_numeric(aviationdata_df['NumberOfEngines'], errors='coerce')
aviationdata_df.dtypes
#Convert accident EventDate to datetime
aviationdata_df['datetimes']=""
aviationdata_df['datetimes'] = pd.to_datetime(aviationdata_df['EventDate'], format="%m/%d/%Y", errors='coerce')
#Calculate number of passengers on board (if known)
aviationdata_df['total_passengers']=""
aviationdata_df['total_passengers'] = aviationdata_df['TotalFatalInjuries'].fillna(0) + aviationdata_df['TotalSeriousInjuries'].fillna(0) + aviationdata_df['TotalMinorInjuries'].fillna(0) + aviationdata_df['TotalUninjured'].fillna(0)
#Outer Join the json data to the aviationdata
all_data_df = pd.merge(aviationdata_df, all_jsons_df, on='EventId', how='outer')
#OPTIONAL (save memory)
#If not mining the narratives, remove all rows with empty probable cause statements
all_data_df['probable_cause'].replace('', np.nan, inplace=True)
all_data_df.dropna(subset=['probable_cause'], inplace=True)
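#OPTIONAL sanity check (sketch): EventId repeats when one event involves two aircraft, so the outer merge
#can add rows; a quick before/after row count makes any unexpected blow-up obvious.
#print len(aviationdata_df), len(all_jsons_df), len(all_data_df)
#print all_data_df['AccidentNumber'].nunique(), len(all_data_df)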
###############################
#Exploration & Summary Stats
###############################
#Find Accident Date Range
min_date = aviationdata_df['datetimes'].min()
max_date = aviationdata_df['datetimes'].max()
#Find number of incidents based on number of passengers on board
print aviationdata_df[(aviationdata_df["total_passengers"]>0) & (aviationdata_df["total_passengers"]<5)].count()["AccidentNumber"]
print aviationdata_df[(aviationdata_df["total_passengers"]>=5) & (aviationdata_df["total_passengers"]<20)].count()["AccidentNumber"]
print aviationdata_df[aviationdata_df["total_passengers"]>=20].count()["AccidentNumber"]
print aviationdata_df[aviationdata_df["total_passengers"]==0].count()["AccidentNumber"]
#70148 accidents with <5 passengers
#4022 accidents with 5-19 passengers
#2395 accidents with >19 passengers
#692 accidents with unknown number of passengers
#Find number of fatal incidents based on number of passengers on board
print aviationdata_df[aviationdata_df["TotalFatalInjuries"]>0].count()["AccidentNumber"]
print aviationdata_df[(aviationdata_df["TotalFatalInjuries"]>0) & (aviationdata_df["total_passengers"]>0) & (aviationdata_df["total_passengers"]<5)].count()["AccidentNumber"]
print aviationdata_df[(aviationdata_df["TotalFatalInjuries"]>0) & (aviationdata_df["total_passengers"]>=5) & (aviationdata_df["total_passengers"]<20)].count()["AccidentNumber"]
print aviationdata_df[(aviationdata_df["TotalFatalInjuries"]>0) & (aviationdata_df["total_passengers"]>=20)].count()["AccidentNumber"]
print aviationdata_df[(aviationdata_df["TotalFatalInjuries"]>0) & (aviationdata_df["total_passengers"]==0)].count()["AccidentNumber"]
#13988 fatal accidents with <5 passengers
#1217 fatal accidents with 5-19 passengers
#223 fatal accidents with >19 passengers
#0 fatal accidents with unknown number of passengers
###############################
# Basic Visualizations
###############################
#Break down the accidents and fatal accidents by year
aviationdata_df['year']=pd.DatetimeIndex(aviationdata_df['datetimes']).year
aviationdata_df['year'] = aviationdata_df['year'].fillna(0.0).astype(int)
#Very little data before 1982, just use 1982-2015
accidents_over_time_column_names = range(1982,2016)
accidents_over_time_index = ['Accidents_Small','Accidents_Medium','Accidents_Large','Accidents_Unknown']
accidents_over_time_df = pd.DataFrame(columns=accidents_over_time_column_names, index=accidents_over_time_index)
fatal_accidents_over_time_index = ['Fatal_Accidents_Small','Fatal_Accidents_Medium','Fatal_Accidents_Large']
fatal_accidents_over_time_df = pd.DataFrame(columns=accidents_over_time_column_names, index=fatal_accidents_over_time_index)
#Determine number of accidents and fatal accidents for small, medium, large, and unknown size planes, by year
for year in accidents_over_time_column_names:
#    print year, aviationdata_df[aviationdata_df["year"]==year].count()["AccidentNumber"]
    accidents_over_time_df.iloc[accidents_over_time_df.index.get_loc('Accidents_Small'), accidents_over_time_df.columns.get_loc(year)] = aviationdata_df[(aviationdata_df["year"]==year) & (aviationdata_df["total_passengers"]>0) & (aviationdata_df["total_passengers"]<5)].count()["AccidentNumber"]
    accidents_over_time_df.iloc[accidents_over_time_df.index.get_loc('Accidents_Medium'), accidents_over_time_df.columns.get_loc(year)] = aviationdata_df[(aviationdata_df["year"]==year) & (aviationdata_df["total_passengers"]>=5) & (aviationdata_df["total_passengers"]<20)].count()["AccidentNumber"]
    accidents_over_time_df.iloc[accidents_over_time_df.index.get_loc('Accidents_Large'), accidents_over_time_df.columns.get_loc(year)] = aviationdata_df[(aviationdata_df["year"]==year) & (aviationdata_df["total_passengers"]>=20)].count()["AccidentNumber"]
    accidents_over_time_df.iloc[accidents_over_time_df.index.get_loc('Accidents_Unknown'), accidents_over_time_df.columns.get_loc(year)] = aviationdata_df[(aviationdata_df["year"]==year) & (aviationdata_df["total_passengers"]==0)].count()["AccidentNumber"]
    fatal_accidents_over_time_df.iloc[fatal_accidents_over_time_df.index.get_loc('Fatal_Accidents_Small'), fatal_accidents_over_time_df.columns.get_loc(year)] = aviationdata_df[(aviationdata_df["year"]==year) & (aviationdata_df["TotalFatalInjuries"]>0) & (aviationdata_df["total_passengers"]>0) & (aviationdata_df["total_passengers"]<5)].count()["AccidentNumber"]
    fatal_accidents_over_time_df.iloc[fatal_accidents_over_time_df.index.get_loc('Fatal_Accidents_Medium'), fatal_accidents_over_time_df.columns.get_loc(year)] = aviationdata_df[(aviationdata_df["year"]==year) & (aviationdata_df["TotalFatalInjuries"]>0) & (aviationdata_df["total_passengers"]>=5) & (aviationdata_df["total_passengers"]<20)].count()["AccidentNumber"]
    fatal_accidents_over_time_df.iloc[fatal_accidents_over_time_df.index.get_loc('Fatal_Accidents_Large'), fatal_accidents_over_time_df.columns.get_loc(year)] = aviationdata_df[(aviationdata_df["year"]==year) & (aviationdata_df["TotalFatalInjuries"]>0) & (aviationdata_df["total_passengers"]>=20)].count()["AccidentNumber"]
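#OPTIONAL ALTERNATIVE (untested sketch): the same per-year counts can be built in one pass with pd.cut
#and groupby; "size_bucket" and "accidents_by_year_and_size" are illustrative names only.
#size_bucket = pd.cut(aviationdata_df['total_passengers'], bins=[-1, 0, 4, 19, float('inf')],
#                     labels=['Unknown', 'Small', 'Medium', 'Large'])
#accidents_by_year_and_size = aviationdata_df.groupby([aviationdata_df['year'], size_bucket]).size().unstack()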
#Find total number of fatal accidents on large flights for the last 10 years (2006-2015)
#(this must run after the loop above has populated fatal_accidents_over_time_df)
recent_large_flight_fatalities_df = fatal_accidents_over_time_df.loc[:, 2006:].copy()
recent_large_flight_fatalities_df['Total'] = recent_large_flight_fatalities_df.sum(axis=1)
#62 large flights with fatalities
#Transpose the two dataframes for easy plotting
accidents_over_time_transpose_df = accidents_over_time_df.transpose()
fatal_accidents_over_time_transpose_df = fatal_accidents_over_time_df.transpose()
#Plot accidents over time for different size aircraft
#Plot.ly Syntax Reference: https://plot.ly/javascript-graphing-library/reference/
import plotly
import plotly.graph_objs
plotly.offline.plot({
    'data': [
        plotly.graph_objs.Scatter(
            x = fatal_accidents_over_time_transpose_df.index,
            y = fatal_accidents_over_time_transpose_df['Fatal_Accidents_Small'],
            mode = 'lines+markers',
            name = '1-4 Passenger Flights'
        ),
        plotly.graph_objs.Scatter(
            x = fatal_accidents_over_time_transpose_df.index,
            y = fatal_accidents_over_time_transpose_df['Fatal_Accidents_Medium'],
            mode = 'lines+markers',
            name = '5-19 Passenger Flights'
        ),
        plotly.graph_objs.Scatter(
            x = fatal_accidents_over_time_transpose_df.index,
            y = fatal_accidents_over_time_transpose_df['Fatal_Accidents_Large'],
            mode = 'lines+markers',
            name = '20+ Passenger Flights'
        )
    ],
    'layout':
        plotly.graph_objs.Layout(
            showlegend=True,
            legend=dict(
                x=0.6,
                y=0.98,
                traceorder='normal',
                font=dict(
                    family='Calibri',
                    size=24,
                    color='black'
                )
            ),
            xaxis=dict(
                title='Year',
                titlefont=dict(
                    family='Calibri',
                    size=30,
                    color='black'
                ),
                tickfont=dict(
                    family='Calibri',
                    size=24,
                    color='black'
                )
            ),
            yaxis=dict(
                title='Accidents Per Year',
                titlefont=dict(
                    family='Calibri',
                    size=30,
                    color='black'
                ),
                tickfont=dict(
                    family='Calibri',
                    size=24,
                    color='black'
                )
            ),
            height=600,
            width=600
        )
}, show_link=False, image='png')
###############################
# Text Mining
# NLTK Setup and Preprocessing
###############################
#OPTIONAL - Can drop all_jsons_df and aviationdata_df at this point to save memory
del all_jsons_df
del aviationdata_df
stop_words = nltk.corpus.stopwords.words( 'english' )
#Add more stop words that are not accident related (mostly investigation related)
stop_words.append('faa')
stop_words.append('federal')
stop_words.append('aviation')
stop_words.append('administration')
stop_words.append('investigators')
stop_words.append('investigation')
stop_words.append('ntsb')
stop_words.append('contributing')
# Use regular expressions to strip out non-a-zA-Z characters
# This leaves extra spaces and splits "pilot's" into two words, but that beats fusing "pilot.nextsentence" into one word
def remove_nonletters(s):
    letters_only = re.sub("[^a-zA-Z]", # The pattern to search for
                          " ",         # The pattern to replace it with
                          s)           # The text to search
    return letters_only
#Tokenize the text
def tokenize_field(s):
    term_vec = []
    s = s.lower()
    term_vec.append(nltk.word_tokenize(s))
    return term_vec
#Remove stop words from pre-tokenized list, which outputs a list
def remove_stops(s):
    s = [[word for word in text if word not in stop_words] for text in s]
    return s
#Function for TfidfVectorizer
porter = nltk.stem.porter.PorterStemmer()
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [porter.stem(t) for t in filtered_tokens]
    return stems
#Function for TfidfVectorizer
def tokenize_only(text):
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
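#Quick illustration (optional; assumes NLTK's 'punkt' tokenizer data has been downloaded):
#print tokenize_and_stem("The pilot failed to maintain airspeed")   #Porter-stemmed tokens
#print tokenize_only("The pilot failed to maintain airspeed")       #lowercased tokens, no stemming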
#Slice Data of Interest from main dataframe
#SF (small fatal)
all_small_fatal_data_df = all_data_df[(
    all_data_df["total_passengers"]>0) & (
    all_data_df["total_passengers"]<5) & (
    all_data_df["TotalFatalInjuries"]>0)].copy()
#SNF (small non-fatal)
all_small_nonfatal_data_df = all_data_df[(
    all_data_df["total_passengers"]>0) & (
    all_data_df["total_passengers"]<5) & (
    all_data_df["TotalFatalInjuries"]==0)].copy()
#LF (large fatal)
all_large_fatal_data_df = all_data_df[(
    all_data_df["total_passengers"]>=20) & (
    all_data_df["TotalFatalInjuries"]>0)].copy()
#LNF (large non-fatal)
all_large_nonfatal_data_df = all_data_df[(
    all_data_df["total_passengers"]>=20) & (
    all_data_df["TotalFatalInjuries"]==0)].copy()
#Copy data into corpus for text pre-processing
corpus_SF = all_small_fatal_data_df['probable_cause'].copy()
corpus_SNF = all_small_nonfatal_data_df['probable_cause'].copy()
corpus_LF = all_large_fatal_data_df['probable_cause'].copy()
corpus_LNF = all_large_nonfatal_data_df['probable_cause'].copy()
#Remove punctuation and numbers etc, leave only letters behind
corpus_SF = corpus_SF.apply(remove_nonletters)
corpus_SNF = corpus_SNF.apply(remove_nonletters)
corpus_LF = corpus_LF.apply(remove_nonletters)
corpus_LNF = corpus_LNF.apply(remove_nonletters)
#Remove stop_words from each dataset
i=0
for cause in corpus_SF.iloc[0:len(corpus_SF)]:
    filtered_line = " ".join([word for word in cause.lower().split() if word not in stop_words])
    corpus_SF.iloc[i] = filtered_line
    i+=1
i=0
for cause in corpus_SNF.iloc[0:len(corpus_SNF)]:
    filtered_line = " ".join([word for word in cause.lower().split() if word not in stop_words])
    corpus_SNF.iloc[i] = filtered_line
    i+=1
i=0
for cause in corpus_LF.iloc[0:len(corpus_LF)]:
    filtered_line = " ".join([word for word in cause.lower().split() if word not in stop_words])
    corpus_LF.iloc[i] = filtered_line
    i+=1
i=0
for cause in corpus_LNF.iloc[0:len(corpus_LNF)]:
    filtered_line = " ".join([word for word in cause.lower().split() if word not in stop_words])
    corpus_LNF.iloc[i] = filtered_line
    i+=1
#NOTE!!!
#CHOOSE WHICH CORPUS TO USE; THE REST OF THE CODE MUST BE RERUN MANUALLY FOR EACH CORPUS
#(uncomment the one in use; only one assignment should be active at a time)
corpus = corpus_SF.copy()
#Or
#corpus = corpus_SNF.copy()
#Or
#corpus = corpus_LF.copy()
#Or
#corpus = corpus_LNF.copy()
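#OPTIONAL ALTERNATIVE (untested sketch): rather than rerunning the pipeline by hand four times, the four
#corpora could be kept in a dict and iterated; "corpora" and "label" are illustrative names only.
#corpora = {'SF': corpus_SF, 'SNF': corpus_SNF, 'LF': corpus_LF, 'LNF': corpus_LNF}
#for label, corpus in corpora.items():
#    pass  #run the vocabulary, TF-IDF, and KMeans steps below for each corpus, saving outputs per label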
#Create vocabulary reference list, which will be used later to look up cluster topics
totalvocab_stemmed = []
totalvocab_tokenized = []
for wordlist in corpus.iloc[0:len(corpus)]:
    allwords_stemmed = tokenize_and_stem(wordlist) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    allwords_tokenized = tokenize_only(wordlist)
    totalvocab_tokenized.extend(allwords_tokenized)
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
###############################
# Text Mining
# TFIDF
###############################
#Lots of super helpful info here: http://brandonrose.org/clustering
#Saved as Document Clustering with Python.pdf
#Setup TFIDF Vectorizer
#Lots of optimization still possible with min/max/ngram/tokenizing variation
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, max_features=200000,
                                   min_df=2,
                                   use_idf=True, tokenizer=tokenize_only, ngram_range=(4,7))
#Create TFIDF Matrices
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus) #fit the vectorizer to text
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
features = tfidf_vectorizer.get_feature_names()
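#OPTIONAL (sketch): a quick look at the vocabulary, using the idf ordering computed above;
#the first entries are the rarest ngrams, the last entries the most common ones.
#print [features[i] for i in indices[:10]]    #highest idf (rarest phrases)
#print [features[i] for i in indices[-10:]]   #lowest idf (most common phrases)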
#OPTIONAL
#Cosine Similarity Matrix
#dist = 1 - cosine_similarity(tfidf_matrix)
###############################
# Text Mining
# KMeans
###############################
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
#Create New KMeans Model
km.fit(tfidf_matrix) #in IPython/Spyder, "%time km.fit(tfidf_matrix)" also reports the fit time
clusters = km.labels_.tolist()
#OPTIONAL TO RELOAD MODEL
#km = joblib.load('doc_cluster.pkl')
#clusters = km.labels_.tolist()
#NOTE!!!
#SAVE THE MODEL AND BUILD THE ACCIDENTS LIST MATCHING THE CORPUS CHOSEN ABOVE
#(uncomment the block that corresponds to the corpus in use; only one block should be active at a time)
joblib.dump(km, kmeans_model_output_directory + '\\kmeans_SF.pkl')
accidents = {'Accident': all_small_fatal_data_df['AccidentNumber'].tolist(),
             'EventID': all_small_fatal_data_df['EventId'].tolist(),
             'cluster': clusters}
#joblib.dump(km, kmeans_model_output_directory + '\\kmeans_SNF.pkl')
#accidents = {'Accident': all_small_nonfatal_data_df['AccidentNumber'].tolist(),
#             'EventID': all_small_nonfatal_data_df['EventId'].tolist(),
#             'cluster': clusters}
#joblib.dump(km, kmeans_model_output_directory + '\\kmeans_LF.pkl')
#accidents = {'Accident': all_large_fatal_data_df['AccidentNumber'].tolist(),
#             'EventID': all_large_fatal_data_df['EventId'].tolist(),
#             'cluster': clusters}
#joblib.dump(km, kmeans_model_output_directory + '\\kmeans_LNF.pkl')
#accidents = {'Accident': all_large_nonfatal_data_df['AccidentNumber'].tolist(),
#             'EventID': all_large_nonfatal_data_df['EventId'].tolist(),
#             'cluster': clusters}
#Kmeans Model Summary
frame = pd.DataFrame(accidents, index=[clusters], columns=['Accident', 'EventID', 'cluster'])
frame['cluster'].value_counts()
#Print out cluster topics
from __future__ import print_function #NOTE: valid here only when running blocks interactively; in a module, __future__ imports must be at the top of the file. The print statements above won't work after this import.
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("\nCluster %d words:" % i, end='')
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(terms[ind], sep=" ", end=",")
    print() #add whitespace
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['Accident'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
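#OPTIONAL (untested sketch): vocab_frame built above is only needed to turn stemmed cluster terms back into
#readable words, which applies if the vectorizer is switched to tokenizer=tokenize_and_stem;
#"stem_to_word" and "readable" are illustrative names only.
#stem_to_word = vocab_frame['words'].groupby(level=0).first() #first surface form seen for each stem
#for i in range(num_clusters):
#    print("Cluster %d words:" % i, end=' ')
#    for ind in order_centroids[i, :6]:
#        readable = ' '.join(stem_to_word.get(s, s) for s in terms[ind].split(' '))
#        print(readable, end=', ')
#    print()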
#SF Results
#1 7555
#0 519
#3 357
#2 238
#4 132
#SNF Results
#2 18629
#1 687
#4 488
#0 333
#3 264
#LF Results
#0 58
#2 4
#4 2
#3 2
#1 2
#LNF Results
#0 700
#3 13
#4 2
#2 2
#1 2
###############################
# Misc - unused/uninteresting
###############################
##This may also work for XML import, and be faster, but it is already super fast for this data, so not tested.
#for el in tree.iterfind('./*'):
# for i in el.iterfind('*'):
# data.append(dict(i.items()))
##Types of Aircraft
# #60737 values are blank, so not very interesting
#aviationdata_df["AircraftCategory"].unique()
#aviationdata_df["AircraftCategory"].value_counts()
##Types of Flights (commercial, general, etc)
# #60592 values are blank, so not very interesting
#aviationdata_df["FARDescription"].unique()
#aviationdata_df["FARDescription"].value_counts()
##Removing punctuation from unicode text via translate also needs an encode statement
#all_data_df['narrative'][5].encode('utf-8').translate(None, string.punctuation)
##Remove Punctuation
#def remove_punctuation(s):
# s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
# return s
#practice_df['narrative_punctuation_removed'] = practice_df['narrative'].apply(remove_punctuation)
#practice_df['cause_punctuation_removed'] = practice_df['probable_cause'].apply(remove_punctuation)
#def is_number(s):
# try:
# float(s)
# return True
# except ValueError:
# return False
#def remove_numbers(s):
# term_vec = [ ]
# for x in s:
# for y in x:
## print type(y)
# if not is_number(y):
# term_vec.append( y )
## s = [x for x in s if not is_number(x)]
# return term_vec
#CREATE SMALL PRACTICE DATASET
#practice_df = all_data_df[0:500].copy()
#
#practice_df['narrative_numbers_removed'] = practice_df['narrative_stopwords_removed'].apply(remove_numbers)
#practice_df['cause_numbers_removed'] = practice_df['cause_stopwords_removed'].apply(remove_numbers)
#
#Or, remove stop words from non-tokenized string, which outputs a string
#def remove_stops_two(s):
# list_of_words = s.split()
# resultwords = [word for word in list_of_words if word.lower() not in stop_words]
# result = ' '.join(resultwords)
# return result
#
#practice_df['narrative_stopwords_removed'] = practice_df['narrative_punctuation_removed'].apply(remove_stops_two)
#practice_df['cause_stopwords_removed'] = practice_df['cause_punctuation_removed'].apply(remove_stops_two)
##Combine all the text and run the vectorizer below to see what top words are:
#list_of_all_words_cause = []
#for all_lists in all_data_df['cause_stopwords_removed']:
# for second_list in all_lists:
# for all_words in second_list:
# list_of_all_words_cause.append(all_words)
##column wise df processing of text, memory intense and superseded
#all_data_df['narrative_nonletters_removed'] = all_data_df['narrative'].apply(remove_nonletters)
#all_data_df['cause_nonletters_removed'] = all_data_df['probable_cause'].apply(remove_nonletters)
##Remove original data to save resources
#all_data_df.drop('narrative', axis=1, inplace=True)
#all_data_df.drop('probable_cause', axis=1, inplace=True)
#
#all_data_df['narrative_tokenized'] = all_data_df['narrative_nonletters_removed'].apply(tokenize_field)
#all_data_df['cause_tokenized'] = all_data_df['cause_nonletters_removed'].apply(tokenize_field)
##Remove previous step data to save resources
#all_data_df.drop('narrative_nonletters_removed', axis=1, inplace=True)
#all_data_df.drop('cause_nonletters_removed', axis=1, inplace=True)
#
#all_data_df['narrative_stopwords_removed'] = all_data_df['narrative_tokenized'].apply(remove_stops)
#all_data_df['cause_stopwords_removed'] = all_data_df['cause_tokenized'].apply(remove_stops)
##Remove previous step data to save resources
#all_data_df.drop('narrative_tokenized', axis=1, inplace=True)
#all_data_df.drop('cause_tokenized', axis=1, inplace=True)
#
################################
## Text Mining
## Ngram Frequency Analysis
################################
##Add columns for narrative and probable cause text processing below
#all_data_df['narrative_nonletters_removed']=""
#all_data_df['cause_nonletters_removed']=""
#all_data_df['narrative_tokenized']=""
#all_data_df['cause_tokenized']=""
#all_data_df['narrative_stopwords_removed']=""
#all_data_df['cause_stopwords_removed']=""
#
##Determine main topics of each set of data (4 of them) with ngram frequency analysis
##Process each set of data into a final dataframe, then drop interim data to save memory
##Steps spelled out for first set, and repeat for rest
##Final sorted ngram results put into this dataframe
#cause_results_df = pd.DataFrame(columns=['small_fatal','small_nonfatal','large_fatal','large_nonfatal'])
#
##Slice off subsets of small and large planes, fatal and non-fatal accidents
#all_small_fatal_data_df = all_data_df[(
# all_data_df["total_passengers"]>0) & (
# all_data_df["total_passengers"]<5) & (
# all_data_df["TotalFatalInjuries"]>0)].copy()
#list_of_all_words_small_fatal_cause = []
##merge all words for analysis
#for all_lists in all_small_fatal_data_df['cause_stopwords_removed']:
# for second_list in all_lists:
# for all_words in second_list:
# list_of_all_words_small_fatal_cause.append(all_words)
##remove df to save memory
#del all_small_fatal_data_df
##Create lists of ngrams (3-7 words) for each category
#list_of_ngrams_small_fatal_cause = []
#for i in range(3,7):
# ngrams_list = []
# ngrams_list = list(ngrams(list_of_all_words_small_fatal_cause, i))
# list_of_ngrams_small_fatal_cause = list_of_ngrams_small_fatal_cause + ngrams_list
##remove large list to save memory
#del list_of_all_words_small_fatal_cause
##Make frequency counts of the ngrams to see which are most common
#counts_small_fatal_cause = Counter(list_of_ngrams_small_fatal_cause)
##remove large list to save memory
#del list_of_ngrams_small_fatal_cause
#cause_results_df['small_fatal'] =counts_small_fatal_cause.most_common(50)
##remove large list to save memory
#del counts_small_fatal_cause
#
##REPEAT ABOVE PROCEDURE FOR SMALL-NONFATAL ACCIDENTS
#all_small_nonfatal_data_df = all_data_df[(
# all_data_df["total_passengers"]>0) & (
# all_data_df["total_passengers"]<5) & (
# all_data_df["TotalFatalInjuries"]==0)].copy()
#list_of_all_words_small_nonfatal_cause = []
#for all_lists in all_small_nonfatal_data_df['cause_stopwords_removed']:
# for second_list in all_lists:
# for all_words in second_list:
# list_of_all_words_small_nonfatal_cause.append(all_words)
#del all_small_nonfatal_data_df
#list_of_ngrams_small_nonfatal_cause = []
#for i in range(3,7):
# ngrams_list = []
# ngrams_list = list(ngrams(list_of_all_words_small_nonfatal_cause, i))
# list_of_ngrams_small_nonfatal_cause = list_of_ngrams_small_nonfatal_cause + ngrams_list
#del list_of_all_words_small_nonfatal_cause
#counts_small_nonfatal_cause = Counter(list_of_ngrams_small_nonfatal_cause)
#del list_of_ngrams_small_nonfatal_cause
#cause_results_df['small_nonfatal'] =counts_small_nonfatal_cause.most_common(50)
#del counts_small_nonfatal_cause
#
##REPEAT ABOVE PROCEDURE FOR LARGE-FATAL ACCIDENTS
#all_large_fatal_data_df = all_data_df[(
# all_data_df["total_passengers"]>=20) & (
# all_data_df["TotalFatalInjuries"]>0)].copy()
##OPTIONAL
##remove 3 of 4 9/11 attack rows, repetition floods analysis
#all_large_fatal_data_df = all_large_fatal_data_df[all_large_fatal_data_df['EventId'] != '20020123X00106']
#all_large_fatal_data_df = all_large_fatal_data_df[all_large_fatal_data_df['EventId'] != '20020123X00105']
#all_large_fatal_data_df = all_large_fatal_data_df[all_large_fatal_data_df['EventId'] != '20020123X00104']
#
#list_of_all_words_large_fatal_cause = []
#for all_lists in all_large_fatal_data_df['cause_stopwords_removed']:
# for second_list in all_lists:
# for all_words in second_list:
# list_of_all_words_large_fatal_cause.append(all_words)
#del all_large_fatal_data_df
#list_of_ngrams_large_fatal_cause = []
#for i in range(3,7):
# ngrams_list = []
# ngrams_list = list(ngrams(list_of_all_words_large_fatal_cause, i))
# list_of_ngrams_large_fatal_cause = list_of_ngrams_large_fatal_cause + ngrams_list
#del list_of_all_words_large_fatal_cause
#counts_large_fatal_cause = Counter(list_of_ngrams_large_fatal_cause)
#del list_of_ngrams_large_fatal_cause
#cause_results_df['large_fatal'] =counts_large_fatal_cause.most_common(50)
#del counts_large_fatal_cause
#
##REPEAT ABOVE PROCEDURE FOR LARGE-NONFATAL ACCIDENTS
#all_large_nonfatal_data_df = all_data_df[(
# all_data_df["total_passengers"]>=20) & (
# all_data_df["TotalFatalInjuries"]==0)].copy()
#list_of_all_words_large_nonfatal_cause = []
#for all_lists in all_large_nonfatal_data_df['cause_stopwords_removed']:
# for second_list in all_lists:
# for all_words in second_list:
# list_of_all_words_large_nonfatal_cause.append(all_words)
#del all_large_nonfatal_data_df
#list_of_ngrams_large_nonfatal_cause = []
#for i in range(3,7):
# ngrams_list = []
# ngrams_list = list(ngrams(list_of_all_words_large_nonfatal_cause, i))
# list_of_ngrams_large_nonfatal_cause = list_of_ngrams_large_nonfatal_cause + ngrams_list
#del list_of_all_words_large_nonfatal_cause
#counts_large_nonfatal_cause = Counter(list_of_ngrams_large_nonfatal_cause)
#del list_of_ngrams_large_nonfatal_cause
#cause_results_df['large_nonfatal'] =counts_large_nonfatal_cause.most_common(50)
#del counts_large_nonfatal_cause
#
##Notes on results
# #Lots of deduplication still needed for top causes, many ngrams overlap somewhat