-
Notifications
You must be signed in to change notification settings - Fork 0
/
ParseText.py
135 lines (100 loc) · 4.55 KB
/
ParseText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import numpy as np
from scipy.sparse import bsr_matrix
class ParseText:
    """Convert "word,frequency" CSV rows into a sparse bag-of-words matrix.

    Every input row is expected to have the layout::

        <samplename>,<label>,<word>,<frequency>,<word>,<frequency>,...

    ``fit_transform`` learns the vocabulary and the label encoding from one
    file; ``vectorize`` encodes further files using that learned state.
    Words are matched case-insensitively (the vocabulary is lower-cased).
    """

    # Fixed column layout of a row.
    _NAME_POS = 0
    _LABEL_POS = 1
    _FIRST_WORD_POS = 2
    # Text after this character is ignored, as np.genfromtxt did by default.
    _COMMENT_CHAR = '#'

    def __init__(self):
        self.features = None        # sorted array of vocabulary words
        self.samplename = None      # sample name of every fitted row
        self.labels = None          # integer-encoded label of every fitted row
        self.featurevector = None   # sparse matrix returned by fit_transform
        self.numsamples = 0         # number of fitted rows
        self.fpos = None            # word -> column index
        self.labelnames = None      # sorted array of distinct label strings
        self.labelmapper = None     # label string -> integer code

    def featureidx(self):
        """Return the mapping from feature (word) to column index.

        The value is ``None`` until ``fit_transform`` has been called.
        """
        return self.fpos

    def getfeaturenames(self):
        """Return the sorted array of words used as features."""
        assert self.features is not None, "Use fit to set the features first"
        return self.features

    def getfilenames(self):
        """Return the sample names (first column) of the fitted data."""
        assert self.samplename is not None, \
            "Use fit to set the sample names first"
        return self.samplename

    def getlabels(self):
        """Return the integer-encoded labels of the fitted data."""
        assert self.labels is not None, "Use fit to set the labels first"
        return self.labels

    @classmethod
    def _read_rows(cls, source, delimiter):
        """Read ``source`` and return one list of fields per non-empty line.

        ``source`` is either a path (``str``, ``bytes`` or ``os.PathLike``)
        or an iterable of lines such as an open file or a list of strings.
        Comments are removed, surrounding spaces and line terminators are
        stripped, and lines that end up empty are skipped.
        """
        if isinstance(source, (str, bytes)) or hasattr(source, '__fspath__'):
            with open(source, encoding='utf-8') as handle:
                raw_lines = handle.readlines()
        else:
            raw_lines = list(source)

        rows = []
        for raw in raw_lines:
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8')
            text = raw.split(cls._COMMENT_CHAR, 1)[0].strip(' \r\n')
            if text:
                rows.append(text.split(delimiter))
        return rows

    def _count_matrix(self, rows):
        """Build the (row x vocabulary) frequency matrix for parsed ``rows``.

        Words that are not in the fitted vocabulary are ignored. If a word
        occurs several times in one row, the sparse constructor sums its
        frequencies.
        """
        data, row_idx, col_idx = [], [], []
        for row_number, fields in enumerate(rows):
            words = fields[self._FIRST_WORD_POS::2]
            counts = fields[self._FIRST_WORD_POS + 1::2]
            for word, count in zip(words, counts):
                # Look up the lower-cased word: the vocabulary is stored
                # lower-cased, so indexing with the raw word would fail.
                column = self.fpos.get(word.lower())
                if column is not None:
                    data.append(int(count))
                    row_idx.append(row_number)
                    col_idx.append(column)

        # Explicit integer dtypes keep the constructor happy even when no
        # word matched and all three lists are empty.
        triplets = (
            np.asarray(data, dtype=int),
            (np.asarray(row_idx, dtype=int), np.asarray(col_idx, dtype=int)),
        )
        return bsr_matrix(triplets, shape=(len(rows), len(self.features)))

    def fit_transform(self, filename, delimiter=','):
        """Learn vocabulary and labels from ``filename`` and vectorize it.

        Args:
            filename: path to the CSV file, or an iterable of lines.
            delimiter: field separator used inside each line.

        Returns:
            A tuple ``(featurevector, labels)``: the sparse frequency matrix
            with one row per sample and one column per vocabulary word, and
            the array of integer label codes (indices into the sorted
            distinct label strings).

        Raises:
            IndexError: if a row has fewer than two fields.
            ValueError: if a frequency of a vocabulary word is not an integer.
        """
        rows = self._read_rows(filename, delimiter)

        # Sample names, labels and their integer encoding.
        self.samplename = np.asarray(
            [fields[self._NAME_POS] for fields in rows], dtype=str)
        raw_labels = np.asarray(
            [fields[self._LABEL_POS] for fields in rows], dtype=str)
        self.labelnames, self.labels = np.unique(
            raw_labels, return_inverse=True)
        self.labelmapper = {
            name: code for code, name in enumerate(self.labelnames.tolist())
        }
        self.numsamples = len(rows)

        # Vocabulary: every purely alphabetic word, lower-cased.
        vocabulary = set()
        for fields in rows:
            vocabulary.update(
                word.lower()
                for word in fields[self._FIRST_WORD_POS::2]
                if word.isalpha()
            )
        ordered = sorted(vocabulary)
        self.features = np.asarray(ordered, dtype=str)
        self.fpos = {word: column for column, word in enumerate(ordered)}

        self.featurevector = self._count_matrix(rows)
        return self.featurevector, self.labels

    def vectorize(self, filename, delimiter=','):
        """Vectorize ``filename`` with the vocabulary learned by fit_transform.

        Args:
            filename: path to the CSV file, or an iterable of lines.
            delimiter: field separator used inside each line.

        Returns:
            A tuple ``(featurevector, labels)`` shaped like the result of
            ``fit_transform``; the fitted state of the object is not changed.

        Raises:
            AssertionError: if ``fit_transform`` has not been called yet.
            KeyError: if a row carries a label that was not seen during fit.
            IndexError: if a row has fewer than two fields.
        """
        assert self.features is not None, "Use fit to set the features first"
        rows = self._read_rows(filename, delimiter)
        featurevector = self._count_matrix(rows)
        labels = np.asarray(
            [self.labelmapper[fields[self._LABEL_POS]] for fields in rows],
            dtype=int)
        return featurevector, labels