SearchPacket.py
'''
SearchPacket! Made of individual attributes. This will be passed into the scorer for some
sweet scoring action. Radical, dude!
@author: Justin A. Middleton
@date: 12 March 2015
'''
from nltk import word_tokenize
from Attribute import Attribute
from Lemmatizer import Lemmatizer
from nltk.corpus import stopwords

class SearchPacket:
    '''
    Creates the search packet by passing in a list of attributes from
    the GUI interaction!

    Once the attributes are in, the packet sanitizes them to check for
    any that have errors (e.g. duplicate names, empty strings for words)
    and either ignores the bad words or ignores the attribute altogether.

    What I expect: a LIST of attributes.
    '''
    def __init__(self, attributes):
        self.attributes = []
        self.lemma = Lemmatizer()

        for attr in attributes:
            # If the attribute cannot be sanitized to an acceptable degree, skip it.
            try:
                sanitized = self.sanitizeAttribute(attr)
            except ValueError:
                continue
            self.attributes.append(sanitized)

        # Check self.attributes (not the raw input) so we also catch the case
        # where every attribute failed sanitization.
        if len(self.attributes) < 1:
            raise ValueError("__init__: No valid attributes to search.")

    '''
    Turns a rough attribute from the GUI into one that has exactly as many words
    as it needs.

    attr: dirty attribute from the GUI
    returns: clean attribute, without any invalid words
    '''
    def sanitizeAttribute(self, attr):
        if attr.get_name() is None or attr.get_name() == "":
            raise ValueError("sanitizeAttribute: Invalid name for attribute.")
        if attr.get_name() in [a.get_name() for a in self.attributes]:
            raise ValueError("sanitizeAttribute: Duplicate name for an attribute.")
        if attr.get_attr_weight_num() < 1 or attr.get_attr_weight_num() > 3:
            raise ValueError("sanitizeAttribute: Bad attribute weight.")

        dirtyWords = attr.get_words()
        dirtyWeights = attr.get_weights()
        dirtySents = attr.get_sentiments()

        if dirtyWords is None or dirtyWeights is None or dirtySents is None:
            raise ValueError("sanitizeAttribute: Unassigned values in attribute.")
        elif len(dirtyWords) != len(dirtyWeights) or len(dirtyWords) != len(dirtySents):
            raise ValueError("sanitizeAttribute: List length mismatch.")

        cleanWords, cleanWeights, cleanSents = self.cleanInfoLists(
            dirtyWords, dirtyWeights, dirtySents)

        if len(cleanWords) < 1:
            raise ValueError("sanitizeAttribute: No valid words in attribute.")

        return Attribute(attr.get_name(), attr.get_attr_weight_num(),
            cleanWords, cleanWeights, cleanSents)

    '''
    Removes any invalid combinations from the lists. This includes entries with
    out-of-range weights or sentiments, and words that are empty, duplicated,
    or stop words.

    dirtyWords: list of str
    dirtyWeights: list of int (between 1 and 3 inclusive)
    dirtySents: list of int (between -1 and 1 inclusive)

    Returns three lists with all bad combinations removed. All three must be
    processed at the same time so corresponding information can be discarded
    if any part of it is bad.
    '''
    def cleanInfoLists(self, dirtyWords, dirtyWeights, dirtySents):
        cleanWords = []
        cleanWeights = []
        cleanSents = []

        # Stop words: a list of the most commonly used words in the English
        # language. I remove them because keeping them will bloat the running
        # time, and the words probably won't give us any of the information
        # we want.
        stop = stopwords.words("english")

        for word, weight, sent in zip(dirtyWords, dirtyWeights, dirtySents):
            word = word.lower().strip()

            # Lemmatize it only if it's a single word; otherwise, preserve the phrase.
            if len(word.split()) == 1:
                word = self.lemma.lemmatizeTokens([word])[0]  # alternative: self.lemma.stem([word])[0]

            if word in cleanWords or word in stop or word == "":
                continue
            if weight < 1 or weight > 3:
                continue
            if sent < -1 or sent > 1:
                continue

            cleanWords.append(word)
            cleanWeights.append(weight)
            cleanSents.append(sent)

        return cleanWords, cleanWeights, cleanSents

    def getAttributes(self):
        return self.attributes

    '''
    Gets the query from all search terms inside. Just "OR"s them all together,
    quoting each term.

    Example: ["one", "two", "three"] and ["four", "five", "six"] will yield
    '"one" OR "two" OR "three" OR "four" OR "five" OR "six"'
    '''
    def getQuery(self):
        attributeQueries = []
        for attr in self.attributes:
            # Quote each word and join within the attribute...
            query = '"' + '" OR "'.join(attr.get_words()) + '"'
            attributeQueries.append(query)

        # ...then join across attributes.
        finalQuery = ' OR '.join(attributeQueries)
        return finalQuery
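
# A minimal usage sketch (an addition, not part of the original module). It
# assumes Attribute's constructor mirrors the call in sanitizeAttribute above,
# i.e. Attribute(name, attrWeight, words, weights, sentiments), and that the
# NLTK stopwords corpus is available (nltk.download("stopwords")).
if __name__ == "__main__":
    # Per-word weights must be 1-3 and sentiments -1 to 1; anything outside
    # those ranges is discarded by cleanInfoLists, as are stop words,
    # duplicates, and empty strings.
    color = Attribute("color", 2, ["red", "blue", "the"], [3, 2, 1], [1, 0, 0])
    size = Attribute("size", 1, ["large", ""], [2, 2], [0, 0])

    packet = SearchPacket([color, size])

    # "the" is a stop word and "" is empty, so both are dropped. Expect
    # something like: "red" OR "blue" OR "large" (modulo lemmatization).
    print(packet.getQuery())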