-
Notifications
You must be signed in to change notification settings - Fork 28
/
2vw.py
118 lines (75 loc) · 2.3 KB
/
2vw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
'convert a combined train+test file to VW format'
import csv
import sys
import re
import math
from collections import defaultdict
def construct_vw_line( line, value_indexes, headers ):
    """Turn one parsed CSV row into a single output line.

    The line starts with the natural log of the target column, followed by
    one "|<column name> <payload>" section per configured column:

    * tokenized columns carry the de-duplicated words from get_words()
    * binarized columns carry the integer id looked up in value_indexes

    Which columns are used comes from the module-level names target_index,
    indexes2tokenize and indexes2binarize, which must be set before calling.

    line          -- list of field strings for one row
    value_indexes -- {column index: {raw value: integer id}}
    headers       -- list of column names, parallel to line

    Raises ValueError if the target field cannot be parsed as a float or is
    not positive (math.log), and KeyError if a binarized value has no id.
    """
    # the label goes first; the sections are appended after it
    parts = [ str( math.log( float( line[target_index] ))) ]

    for col in indexes2tokenize:
        parts.append( "|%s %s" % ( headers[col], get_words( line[col] )))

    for col in indexes2binarize:
        raw_value = line[col]
        parts.append( "|%s %s" % ( headers[col], value_indexes[col][raw_value] ))

    return " ".join( parts )
def get_words( text ):
    """Return the unique lowercase words of text as one space-joined string.

    Apostrophes are deleted outright (so "don't" becomes "dont"), every other
    run of non-word characters is replaced by a single space, and the result
    is lowercased and split on whitespace.  Each word is kept only the first
    time it appears, so the output preserves first-occurrence order.

    An empty or all-punctuation text yields the empty string.
    """
    text = text.replace( "'", "" )
    text = re.sub( r'\W+', ' ', text )

    seen = set()
    words = []
    for w in text.lower().split():
        # membership is tested against a set, not the growing words list,
        # so long descriptions are de-duplicated in linear time
        if w in seen:
            continue
        seen.add( w )
        words.append( w )

    return " ".join( words )
#################################################
# Script body (Python 2).
#
# usage: 2vw.py <input csv> <output file>
#
# Reads the input CSV twice: the first pass collects the distinct values of
# every column in cols2binarize and gives each an integer id, the second pass
# writes one construct_vw_line() result per data row to the output file.
#
# The names set here stay at module level on purpose: construct_vw_line()
# reads target_index, indexes2tokenize and indexes2binarize as globals.

# set the csv module's maximum field size to 1,000,000
csv.field_size_limit( 1000000 )

if len( sys.argv ) < 3:
    # fail with a usage message rather than a bare IndexError
    sys.exit( "usage: %s <input csv> <output file>" % sys.argv[0] )

input_file = sys.argv[1]
output_file = sys.argv[2]

target_col = 'SalaryNormalized'

cols2tokenize = [ 'Title', 'FullDescription', 'LocationRaw' ]
cols2binarize = [ 'LocationNormalized', 'ContractType', 'ContractTime', 'Company', 'Category', 'SourceName' ]
cols2drop = [ 'SalaryRaw' ]

###

print( "%s ---> %s" % ( input_file, output_file ))

# Both files are opened up front and closed by the with blocks even if a row
# fails to convert.  The Python 2 csv module expects its input file to be
# opened in binary mode.
with open( input_file, 'rb' ) as i_f:
    with open( output_file, 'wb' ) as o_f:

        reader = csv.reader( i_f )
        headers = next( reader )

        # headers.index() raises ValueError when a configured column is missing
        target_index = headers.index( target_col )
        indexes2tokenize = [ headers.index( c ) for c in cols2tokenize ]
        indexes2binarize = [ headers.index( c ) for c in cols2binarize ]
        # not used anywhere in this file; kept so that a missing column from
        # cols2drop is still reported right here, as before
        indexes2drop = [ headers.index( c ) for c in cols2drop ]

        # first pass: unique values

        unique_values = defaultdict( set )

        for line in reader:
            for i in indexes2binarize:
                unique_values[i].add( line[i] )

        # mapping values to indexes
        # ids start at 1 and follow the sorted order of each column's values,
        # so they are the same on every run over the same input

        value_indexes = defaultdict( dict )

        for i in unique_values:
            for index, value in enumerate( sorted( unique_values[i] )):
                value_indexes[i][value] = index + 1

        print( "second pass..." )

        # rewind and skip the header row again before re-reading the data rows
        i_f.seek( 0 )
        next( reader )

        n = 0

        for line in reader:
            new_line = construct_vw_line( line, value_indexes, headers )
            o_f.write( new_line + "\n" )

            # progress report every 10000 rows
            n += 1
            if n % 10000 == 0:
                print( n )