This repository has been archived by the owner on Jan 28, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordcount-summary.py
72 lines (51 loc) · 2.38 KB
/
wordcount-summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import sys, string, os
import csv as csv
from math import *
import operator
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import re
# Stop words: very common English words that are ignored when counting terms.
# Kept as a frozenset because the collection is only used for membership
# tests (`word in stop_words`), which are O(1) on a set instead of a linear
# scan over a list.
stop_words = frozenset((
    "a", "able", "about", "across", "after", "all", "almost", "also", "am",
    "among", "an", "and", "any", "are", "as", "at", "be", "because", "been",
    "but", "by", "can", "cannot", "could", "dear", "did", "do", "does",
    "either", "else", "ever", "every", "for", "from", "get", "got", "had",
    "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i",
    "if", "in", "into", "is", "it", "its", "just", "least", "let", "like",
    "likely", "may", "me", "might", "most", "must", "my", "neither", "no",
    "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our",
    "own", "rather", "said", "say", "says", "she", "should", "since", "so",
    "some", "than", "that", "the", "their", "them", "then", "there", "these",
    "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we",
    "were", "what", "when", "where", "which", "while", "who", "whom", "why",
    "will", "with", "would", "yet", "you", "your",
))
# Common words: frequent tokens (mostly HTML/markup residue such as "gt",
# "lt", "nofollow", "img") that are ignored when counting terms.
# Kept as a frozenset because the collection is only used for membership
# tests (`word in common_words`), which are O(1) on a set instead of a linear
# scan over a list.
common_words = frozenset((
    'text', 'page', 'gt', 'lt', 'code', 'pre', 'id', 'li', 'http', 'td',
    'strong', 'amp', 'want', 'width', 'ul', 'nofollow', 'tr', 'img', 'br',
    'one', 'height', 'image', 'need', 'here', 'link', 'way', 'first',
    'works', 'two',
))
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# File containing categories (tags) to be assigned to records.
# NOTE: not read by this script; kept so the setting stays with the others.
tagTerms = "subjects.txt"
# File containing training data (id, title, text, tags)
trainDocs = "data.csv"
# NOTE: not applied by this script; every counted term is printed.
numTerms = 10
# Records are selected when any of their tags contains this substring.
searchterm = "Middle-Aged"

# A token is a run of ASCII letters, digits, '#' or '-'.
TOKEN_RE = re.compile(r"[0-9a-zA-Z#-]+")


def open_training_file(path):
    """Open the CSV at *path* in the mode the csv module expects.

    Python 2's csv module wants a binary file; Python 3's wants a text file
    opened with ``newline=""``.
    """
    if sys.version_info[0] < 3:
        return open(path, "rb")
    return open(path, "r", newline="")


def count_terms(rows, term, ignored=None):
    """Count the words used by the records tagged with *term*.

    Args:
        rows: iterable of records laid out as (id, title, text, tags), where
            tags is a whitespace-separated string. Records with fewer than
            four fields (e.g. blank CSV lines) are skipped.
        term: substring looked for inside each individual tag.
        ignored: optional iterable of lower-case words to leave out. Defaults
            to the module-level ``stop_words`` plus ``common_words``.

    Returns:
        dict mapping each lower-cased word (longer than two characters and
        not ignored) to the number of times it occurs in the text and title
        of the matching records.
    """
    if ignored is None:
        ignored = set(stop_words).union(common_words)
    else:
        ignored = set(ignored)

    counts = {}
    for row in rows:
        if len(row) < 4:
            continue
        # any(): a record contributes once, even when several of its tags
        # contain the term (a per-tag loop would re-count the same words).
        if not any(term in tag for tag in row[3].split()):
            continue
        # Join with a space so the last word of the text and the first word
        # of the title are not fused into a single token.
        for token in TOKEN_RE.findall(row[2] + " " + row[1]):
            word = token.lower()
            if len(word) > 2 and word not in ignored:
                counts[word] = counts.get(word, 0) + 1
    return counts


def rank_terms(counts):
    """Return (word, count) pairs ordered from most to least frequent."""
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)


def format_report(term, ranked):
    """Render the report text for *term* and its ranked (word, count) pairs.

    The layout reproduces what the Python 2 statements ``print term,``
    followed by one ``print word + ": " + str(count)`` per pair emit: the
    term shares the first line with the first pair, separated by a single
    space, and every later pair is on its own line. With no pairs the report
    is just the term and a newline.
    """
    lines = [word + ": " + str(count) for word, count in ranked]
    if not lines:
        return term + "\n"
    return term + " " + "\n".join(lines) + "\n"


def main():
    """Print the term frequencies of the records tagged with ``searchterm``."""
    # The context manager closes the training file once it has been scanned.
    with open_training_file(trainDocs) as handle:
        counts = count_terms(csv.reader(handle), searchterm.strip())
    sys.stdout.write(format_report(searchterm, rank_terms(counts)))


if __name__ == "__main__":
    main()