-
Notifications
You must be signed in to change notification settings - Fork 16
/
init.lua
372 lines (340 loc) · 12.7 KB
/
init.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
----------------------------------------------------------------------
--
-- Copyright (c) 2012 Roy Lowrance, Clement Farabet
--
-- Permission is hereby granted, free of charge, to any person obtaining
-- a copy of this software and associated documentation files (the
-- "Software"), to deal in the Software without restriction, including
-- without limitation the rights to use, copy, modify, merge, publish,
-- distribute, sublicense, and/or sell copies of the Software, and to
-- permit persons to whom the Software is furnished to do so, subject to
-- the following conditions:
--
-- The above copyright notice and this permission notice shall be
-- included in all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--
----------------------------------------------------------------------
-- description:
-- csvigo - a little package to handle CSV files (read/write)
--
-- history:
-- June 24, 2012 - create a complete API to make queries - C. Farabet
-- June 23, 2012 - made a pkg, and high-level functions - C. Farabet
-- June 1, 2012 - csvigo.File class - R. Lowrance
----------------------------------------------------------------------
-- dependencies: torch (torch.include is used below) and dok (dok.unpack is
-- used by csvigo.load / csvigo.save to parse their arguments)
require 'torch'
require 'dok'
-- create the global csvigo package table; the functions defined below are
-- attached to it:
csvigo = {}
-- csvigo.File manager: pull in File.lua from the csvigo package
-- (presumably this is where csvigo.File, used by load/save, is defined --
-- confirm against File.lua)
torch.include('csvigo', 'File.lua')
----------------------------------------------------------------------
-- functional API: simple shortcuts to serialize data using CSV files
-- this API is similar to the image.load/save, where the user doesn't
-- have to create a csvigo.File object, and handle it later on.
-- load
--- Load a CSV file and return it in the representation selected by `mode`.
-- Arguments are unpacked by dok.unpack from the specs listed in the body:
--   path         (string, required) file to read; a leading '~' is replaced
--                by $HOME when that variable is set
--   separator    (string, default ',') field separator handed to csvigo.File
--   mode         (string, default 'tidy') 'raw' | 'tidy' | 'query' | 'large'
--   header       (default 'auto') true/false, 'true'/'false', or 'auto';
--                'auto' treats the first row as a header when its first field
--                cannot be converted to a number
--   verbose      (boolean, default true) print progress messages
--   skip         (number, default 0) rows to skip before the header/data
--                (only applied in tidy and query modes)
--   column_order (boolean, default false) in tidy mode, also return the list
--                of column names in file order
-- Returns:
--   raw / large : the value produced by csvigo.File:readall(mode), untouched
--   tidy        : a table mapping each variable name to its list of values;
--                 with column_order, the list of names first, then that table
--   query       : a closure over the tidy table (call it with 'help' for usage)
-- Raises 'unknown mode' (after printing the usage) for any other mode.
function csvigo.load(...)
   -- usage
   local args, path, separator, mode, header, verbose, skip = dok.unpack(
      {...},
      'csvigo.load',
      'Load a CSV file, according to the specified mode:\n'
      .. ' - raw : no clean up, return a raw list of lists, a 1-to-1 mapping to the CSV file\n'
      .. ' - tidy : return a clean table, where each entry is a variable that points to its values\n'
      .. ' - query : return the tidy table, as well as query operators\n'
      .. ' - large : returns a table that decodes rows on the fly, on indexing ',
      {arg='path', type='string', help='path to file', req=true},
      {arg='separator', type='string', help='separator (one character)', default=','},
      {arg='mode', type='string', help='load mode: raw | tidy | query | large', default='tidy'},
      {arg='header', type='string', help='file has a header (variable names): true | false | auto', default='auto'},
      {arg='verbose', type='boolean', help='verbose load', default=true},
      {arg='skip', type='number', help='skip this many lines at start of file', default=0},
      {arg='column_order', type='boolean', help='return csv\'s column order in tidy mode', default=false}
   )

   -- reject unsupported modes up front, before the whole file gets parsed
   if mode ~= 'raw' and mode ~= 'large' and mode ~= 'tidy' and mode ~= 'query' then
      print(args.usage)
      error('unknown mode')
   end

   -- decide whether `firstline` is a header row
   local function checkheader(header, firstline)
      if type(header) == 'boolean' then
         return header
      end
      if type(header) == 'string' then
         if header == 'auto' then
            -- a first field that is not a number is taken to be a name
            return firstline ~= nil and tonumber(firstline[1]) == nil
         end
         -- the documented string forms; anything but 'true' means no header
         return header == 'true'
      end
      -- convert to boolean
      return not not header
   end

   -- check path: a function replacement leaves '~' alone when HOME is unset
   -- and keeps any '%' in HOME from being read as a capture reference
   path = path:gsub('^~', function() return os.getenv('HOME') end)

   -- verbose print
   local function vprint(...) if verbose then print('<csv>',...) end end

   -- load CSV; the file is closed even when parsing raises
   vprint('parsing file: ' .. path)
   local f = csvigo.File(path, 'r', separator)
   local ok, loaded = pcall(f.readall, f, mode)
   f:close()
   if not ok then
      error(loaded, 0)
   end

   if mode == 'raw' or mode == 'large' then
      -- simple, dont do anything
      vprint('parsing done')
      return loaded
   end

   -- tidy up results: one list of values per variable
   vprint('tidying up entries')
   local tidy = {}
   local i2key = {}
   local start = 1 + skip
   local first = loaded[start]
   if first ~= nil then            -- nothing to name when the file is empty
      if checkheader(header, first) then
         -- use header names
         i2key = first
         start = start + 1
      else
         -- generate names
         for col = 1, #first do
            i2key[col] = 'var_' .. col
         end
      end
   end
   for _, key in ipairs(i2key) do
      tidy[key] = {}
   end
   -- parse all
   for row = start, #loaded do
      for col, val in ipairs(loaded[row]) do
         table.insert(tidy[i2key[col]], val)
      end
   end

   -- return tidy table
   if mode == 'tidy' then
      vprint('returning tidy table')
      if args.column_order then
         return i2key, tidy
      end
      return tidy
   end

   -- query mode: build reverse index, revidx[var][value] = list of row numbers
   vprint('generating reversed index for fast queries')
   local revidx = {}
   for var, vals in pairs(tidy) do
      local byvalue = {}
      for row, val in ipairs(vals) do
         byvalue[val] = byvalue[val] or {}
         table.insert(byvalue[val], row)
      end
      revidx[var] = byvalue
   end

   -- rows holding `val` in column `var`; an empty list when the column or
   -- the value does not exist
   local function rowsfor(var, val)
      local byvalue = revidx[var]
      return (byvalue and byvalue[tostring(val)]) or {}
   end

   -- create a function/closure that can be used to query the table
   local function query(...)
      -- usage
      local args, op, varvals = dok.unpack(
         {...},
         'query',
         'This closure was automatically generated to query your data.\n'
         .. 'Example of query: query(\'union\', {var1={1}, var2={2,3,4}})\n'
         .. 'this query will return a subset of the original data, where var1 = 1 OR var2 = 2 or 3 or 4 \n'
         .. '\n'
         .. 'Other example of query: query(\'inter\', {var1={1}, var2={2,3,4}})\n'
         .. 'this query will return a subset of the original data, where var1 = 1 AND var2 = 2 or 3 or 4 \n'
         .. '\n'
         .. 'Other example of query: query(\'vars\')\n'
         .. 'this will return a list of the variable names\n'
         .. '\n'
         .. 'Other example of query: query() or query(\'all\')\n'
         .. 'this query will return the complete dataset'
         ,
         {arg='query', type='string', help='query: all | help | vars | inter | union', default='all'},
         {arg='vars', type='table', help='list of vars/vals'}
      )

      if op == 'help' then
         print(args.usage)
         return
      elseif op == 'vars' then
         -- return the variable names
         local vars = {}
         for k in pairs(tidy) do
            table.insert(vars, k)
         end
         return vars
      elseif op == 'all' then
         -- return the whole thing
         return tidy
      end

      -- 'union', or 'inter' (any other query string is handled as 'inter')
      if type(varvals) ~= 'table' then
         error('query \'' .. tostring(op) .. '\' requires a table of vars/vals')
      end

      -- accepted forms:
      --   { var1 = {'value1', 'value2'}, var2 = {'value1'} }
      --   { var1 = 'value1', var2 = 'value2' }
      -- normalise to the first form on a private copy, so the caller's table
      -- is left untouched
      local wanted = {}
      for var, vals in pairs(varvals) do
         wanted[var] = (type(vals) == 'table') and vals or {vals}
      end

      -- set of row numbers that satisfy the query
      local selected = {}
      if op == 'union' then
         for var, vals in pairs(wanted) do
            for _, val in ipairs(vals) do
               for _, row in ipairs(rowsfor(var, val)) do
                  selected[row] = true
               end
            end
         end
      else -- 'inter'
         -- count, per row, how many variables it satisfies; each variable
         -- counts at most once even if several of its values are listed
         local hits, nvars = {}, 0
         for var, vals in pairs(wanted) do
            local seen = {}
            for _, val in ipairs(vals) do
               for _, row in ipairs(rowsfor(var, val)) do
                  if not seen[row] then
                     seen[row] = true
                     hits[row] = (hits[row] or 0) + 1
                  end
               end
            end
            nvars = nvars + 1
         end
         for row, count in pairs(hits) do
            if count == nvars then
               selected[row] = true
            end
         end
      end

      -- rows in file order, each one once
      local indices = {}
      for row in pairs(selected) do
         indices[#indices + 1] = row
      end
      table.sort(indices)

      -- generate filtered table
      local filtered = {}
      for k, column in pairs(tidy) do
         local out = {}
         for _, row in ipairs(indices) do
            out[#out + 1] = column[row]
         end
         filtered[k] = out
      end
      return filtered
   end

   -- returning query closure
   vprint('returning query closure, type query(\'help\') to get help')
   return query
end
-- save
--- Save a table (or a query closure) as a CSV file.
-- Arguments are unpacked by dok.unpack from the specs listed in the body:
--   path           (string, required) file to write; a leading '~' is
--                  replaced by $HOME when that variable is set
--   data           (required) what to save, interpreted according to `mode`
--   separator      (string, default ',') field separator handed to csvigo.File
--   mode           (string, default 'autodetect') 'raw' | 'tidy' | 'query';
--                  autodetect picks 'query' for a function, 'tidy' for a
--                  table with #data == 0 and 'raw' for any other table
--   header         (boolean, default true) write the variable names as the
--                  first row (tidy and query modes only)
--   verbose        (boolean, default true) print progress messages
--   column_order   (table, optional) list of variable names giving the
--                  column order for tidy/query data
--   nan_as_missing (boolean, default false) forwarded to csvigo.File
-- Everything is validated and converted to rows before the file is opened,
-- so a bad mode or bad data does not truncate an existing file, and the file
-- is closed even when writing raises.
-- NOTE(review): tidy columns of unequal length yield shifted rows, since
-- values are appended to each row in turn; confirm inputs are rectangular.
function csvigo.save(...)
   -- usage
   local args, path, data, separator, mode, header, verbose = dok.unpack(
      {...},
      'csvigo.save',
      'Save a table as a CSV file, according to the specified mode:\n'
      .. ' - raw : data is a raw list of lists, a 1-to-1 mapping to the CSV file\n'
      .. ' - tidy : data is a clean table, where each entry is a variable that points to its values\n'
      .. ' - query : data is a query closure, as returned by csvigo.load in query mode',
      {arg='path', type='string', help='path to file', req=true},
      {arg='data', type='table', help='table to save as a CSV file', req=true},
      {arg='separator', type='string', help='separator (one character)', default=','},
      {arg='mode', type='string', help='table to save is represented as: raw | tidy | query', default='autodetect'},
      {arg='header', type='boolean', help='table has a header (variable names)', default=true},
      {arg='verbose', type='boolean', help='verbose load', default=true},
      {arg='column_order', type='table', help='Write csv according to given column order', default=nil},
      {arg='nan_as_missing', type='boolean', help='Save nan values (0/0) as missing', default=false}
   )

   -- check path: a function replacement leaves '~' alone when HOME is unset
   -- and keeps any '%' in HOME from being read as a capture reference
   path = path:gsub('^~', function() return os.getenv('HOME') end)

   -- verbose print
   local function vprint(...) if verbose then print('<csv>',...) end end

   -- autodetect mode?
   if mode == 'autodetect' then
      local kind = type(data)
      if kind == 'function' then
         mode = 'query'
      elseif kind == 'table' then
         mode = (#data == 0) and 'tidy' or 'raw'
      else
         error('cannot autodetect mode, incorrect data type')
      end
   end

   -- build the rows (and the optional header row) before touching the file
   local headers, rows
   if mode == 'raw' then
      -- simple, the table is written as is
      rows = data
   elseif mode == 'tidy' or mode == 'query' then
      if mode == 'query' then
         -- query all data:
         vprint('generating tidy table')
         data = data('all')
      end
      -- 'data' is a tidy table, export to raw mode
      vprint('exporting tidy table to raw CSV')

      -- one column list drives both the header row and the data rows, so the
      -- two can never disagree on ordering
      local columns = {}
      if args.column_order then
         for _, var in ipairs(args.column_order) do
            columns[#columns + 1] = var
         end
      else
         for var in pairs(data) do
            columns[#columns + 1] = var
         end
      end

      rows = {}
      for _, var in ipairs(columns) do
         local vals = data[var]
         if vals == nil then
            error('column_order refers to unknown column: ' .. tostring(var))
         end
         for i, val in ipairs(vals) do
            rows[i] = rows[i] or {}
            table.insert(rows[i], val)
         end
      end

      if header then
         headers = columns
      end
   else
      print(args.usage)
      error('unknown mode')
   end

   -- save CSV
   vprint('writing to file: ' .. path)
   local f = csvigo.File(path, 'w', separator, args.nan_as_missing)
   local ok, err = pcall(function()
      if headers then f:write(headers) end
      f:writeall(rows)
   end)
   -- done: always release the handle, then surface any write error unchanged
   f:close()
   if not ok then
      error(err, 0)
   end
   vprint('writing done')
end
-- hand the package table back to whoever loaded this chunk
return csvigo