-
Notifications
You must be signed in to change notification settings - Fork 2
/
jsonlines.py
334 lines (273 loc) · 10.6 KB
/
jsonlines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
"""
jsonlines implementation
"""
import numbers
import io
import json
import six
# Maps every value accepted for the ``type`` argument of ``Reader.read()``
# to the class (or tuple of classes) handed to ``isinstance()`` when the
# decoded value is validated. The ``six`` aliases cover the text and
# integer types of both Python 2 and Python 3.
TYPE_MAPPING = {
    dict: dict,
    list: list,
    str: six.text_type,
    int: six.integer_types,
    float: float,
    numbers.Number: numbers.Number,
    bool: bool,
}
class Error(Exception):
    """Base error class for the exceptions defined by this module."""
class InvalidLineError(Error, ValueError):
    """
    Error raised when an invalid line is encountered.

    The reader raises this when a line cannot be decoded as UTF-8, does
    not contain valid JSON, contains ``null`` while that is not allowed,
    or holds a value of a different data type than the one requested.

    The offending line (with trailing whitespace removed) is available
    as the ``.line`` attribute and its line number as ``.lineno``. The
    line number is also appended to the exception message.

    This class subclasses both ``jsonlines.Error`` and the built-in
    ``ValueError``.

    :param str msg: description of the problem
    :param line: the invalid line; must provide ``.rstrip()``
    :param int lineno: number of the invalid line
    """

    #: The invalid line
    line = None

    #: The line number
    lineno = None

    def __init__(self, msg, line, lineno):
        super(InvalidLineError, self).__init__(
            "{} (line {})".format(msg, lineno))
        self.lineno = lineno
        self.line = line.rstrip()
class ReaderWriterBase(object):
    """
    Base class with shared behaviour for both the reader and writer.

    Subclasses normally assign ``_fp``, ``_closed`` and
    ``_should_close_fp`` in their ``__init__``. The class-level defaults
    below keep ``close()``, ``repr()`` and the context manager protocol
    usable on an instance that has not (fully) been initialised, instead
    of failing with ``AttributeError``.
    """

    #: The wrapped file-like object or iterable; ``None`` until set.
    _fp = None

    #: Whether ``close()`` has already been called.
    _closed = False

    #: Whether ``close()`` must also close the wrapped ``_fp``.
    _should_close_fp = False

    def close(self):
        """
        Close this reader/writer.

        This closes the underlying file if that file has been opened by
        this reader/writer. When an already opened file-like object was
        provided, the caller is responsible for closing it.

        Calling this method more than once is harmless.
        """
        if self._closed:
            return
        self._closed = True
        if self._should_close_fp and self._fp is not None:
            self._fp.close()

    def __repr__(self):
        # Prefer the name of the wrapped object (e.g. a file name); fall
        # back to its type and identity when there is no usable name.
        name = getattr(self._fp, 'name', None)
        if name:
            wrapping = repr(name)
        else:
            wrapping = '<{} at 0x{:x}>'.format(
                type(self._fp).__name__,
                id(self._fp))
        return '<jsonlines.{} at 0x{:x} wrapping {}>'.format(
            type(self).__name__, id(self), wrapping)

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()
        # Never suppress an exception raised inside the ``with`` block.
        return False
class Reader(ReaderWriterBase):
    """
    Reader for the jsonlines format.

    The first argument must be an iterable that yields JSON encoded
    strings. Usually this will be a readable file-like object, such as
    an open file or an ``io.StringIO`` instance, but it can also be
    something else as long as it yields strings when iterated over.
    Byte strings are accepted as well and are decoded as UTF-8.

    The `loads` argument can be used to replace the standard json
    decoder. If specified, it must be a callable that accepts a
    (unicode) string and returns the decoded object.

    Instances are iterable and can be used as a context manager.

    :param file-like iterable: iterable yielding lines as strings
    :param callable loads: custom json decoder callable
    """

    def __init__(self, iterable, loads=None):
        self._fp = iterable
        self._should_close_fp = False
        self._closed = False
        self._loads = json.loads if loads is None else loads
        # Line numbers reported in errors are 1-based.
        self._line_iter = enumerate(iterable, 1)

    def read(self, type=None, allow_none=False, skip_empty=False):
        """
        Read and decode a line.

        The optional `type` argument specifies the expected data type.
        Supported types are ``dict``, ``list``, ``str``, ``int``,
        ``float``, ``numbers.Number`` (accepts both integers and
        floats), and ``bool``. When specified, non-conforming lines
        result in :py:exc:`InvalidLineError`.

        By default, input lines containing ``null`` (in JSON) are
        considered invalid, and will cause :py:exc:`InvalidLineError`.
        The `allow_none` argument can be used to change this behaviour,
        in which case ``None`` will be returned instead.

        If `skip_empty` is set to ``True``, empty lines and lines
        containing only whitespace are silently skipped.

        :raises RuntimeError: if the reader has been closed
        :raises ValueError: if `type` is not one of the supported types
        :raises EOFError: if there are no more lines to read
        :raises InvalidLineError: if the line is not valid UTF-8, is not
            valid JSON, is ``null`` while `allow_none` is false, or does
            not match the requested `type`
        """
        if self._closed:
            raise RuntimeError('reader is closed')
        if not (type is None or type in TYPE_MAPPING):
            raise ValueError("invalid type specified")

        # Fetch the next line, optionally stepping over blank ones.
        try:
            while True:
                lineno, line = next(self._line_iter)
                if not skip_empty or line.rstrip():
                    break
        except StopIteration:
            six.raise_from(EOFError, None)

        if isinstance(line, six.binary_type):
            try:
                line = line.decode('utf-8')
            except UnicodeDecodeError as decode_error:
                six.raise_from(
                    InvalidLineError(
                        "line is not valid utf-8: {}".format(decode_error),
                        line, lineno),
                    decode_error)

        try:
            value = self._loads(line)
        except ValueError as json_error:
            six.raise_from(
                InvalidLineError(
                    "line contains invalid json: {}".format(json_error),
                    line, lineno),
                json_error)

        if value is None:
            if not allow_none:
                raise InvalidLineError(
                    "line contains null value", line, lineno)
            return None

        if type is None:
            return value

        matches = isinstance(value, TYPE_MAPPING[type])
        # ``bool`` is a subclass of ``int``, but a JSON boolean must not
        # pass for a number.
        if matches and type in (int, numbers.Number) \
                and isinstance(value, bool):
            matches = False
        if not matches:
            raise InvalidLineError(
                "line does not match requested type", line, lineno)
        return value

    def iter(self, type=None, allow_none=False, skip_empty=False,
             skip_invalid=False):
        """
        Iterate over all lines.

        This is the iterator equivalent to repeatedly calling
        :py:meth:`~Reader.read()`. If no arguments are specified, this
        is the same as directly iterating over this :py:class:`Reader`
        instance.

        When `skip_invalid` is set to ``True``, invalid lines will be
        silently ignored.

        See :py:meth:`~Reader.read()` for a description of the other
        arguments.
        """
        read_options = dict(
            type=type, allow_none=allow_none, skip_empty=skip_empty)
        try:
            while True:
                try:
                    yield self.read(**read_options)
                except InvalidLineError:
                    if skip_invalid:
                        continue
                    raise
        except EOFError:
            # End of input simply ends the iteration.
            return

    def __iter__(self):
        """
        See :py:meth:`~Reader.iter()`.
        """
        return self.iter()
class Writer(ReaderWriterBase):
    """
    Writer for the jsonlines format.

    The `fp` argument must be a file-like object with a ``.write()``
    method accepting either text (unicode) or bytes.

    The `compact` argument can be used to produce smaller output.

    The `sort_keys` argument can be used to sort keys in json objects,
    and will produce deterministic output.

    For more control, provide a custom encoder callable using the
    `dumps` argument. The callable should produce (unicode) string
    output; UTF-8 encoded byte strings are accepted as well. If
    specified, the `compact` and `sort_keys` arguments will be ignored.

    When the `flush` argument is set to ``True``, the writer will call
    ``fp.flush()`` after each written line.

    Instances can be used as a context manager.

    :param file-like fp: writable file-like object
    :param bool compact: whether to use a compact output format
    :param bool sort_keys: whether to sort object keys
    :param callable dumps: custom encoder callable
    :param bool flush: whether to flush the file-like object after
      writing each line
    """

    def __init__(
            self, fp, compact=False, sort_keys=False, dumps=None,
            flush=False):
        self._closed = False

        # Detect whether the file wants text or bytes by writing an
        # empty text string: binary files reject it with a TypeError.
        try:
            fp.write(u'')
            self._fp_is_binary = False
        except TypeError:
            self._fp_is_binary = True

        if dumps is None:
            encoder_kwargs = dict(ensure_ascii=False, sort_keys=sort_keys)
            if compact:
                encoder_kwargs.update(separators=(',', ':'))
            dumps = json.JSONEncoder(**encoder_kwargs).encode

        self._fp = fp
        self._should_close_fp = False
        self._dumps = dumps
        self._flush = flush

    def write(self, obj):
        """
        Encode and write a single object.

        :param obj: the object to encode and write
        :raises RuntimeError: if the writer has been closed
        """
        if self._closed:
            raise RuntimeError('writer is closed')

        line = self._dumps(obj)

        # On Python 2, the JSON module has the nasty habit of returning
        # either a byte string or unicode string, depending on whether
        # the serialised structure can be encoded using ASCII only, and
        # a custom `dumps` callable may return either type as well, so
        # this code needs to handle all combinations.
        if self._fp_is_binary:
            if not isinstance(line, six.binary_type):
                line = line.encode('utf-8')
            self._fp.write(line)
            self._fp.write(b'\n')
        else:
            if not isinstance(line, six.text_type):
                # UTF-8 is a superset of ASCII, so this still covers the
                # Python 2 case while also accepting encoders that emit
                # UTF-8 bytes containing non-ASCII characters.
                line = line.decode('utf-8')
            self._fp.write(line)
            self._fp.write(u'\n')

        if self._flush:
            self._fp.flush()

    def write_all(self, iterable):
        """
        Encode and write multiple objects.

        :param iterable: an iterable of objects
        """
        for obj in iterable:
            self.write(obj)
def open(name, mode='r', **kwargs):
    r"""
    Open a jsonlines file for reading or writing.

    This is a convenience function that opens a file, and wraps it in
    either a :py:class:`Reader` or :py:class:`Writer` instance,
    depending on the specified `mode`. The file is opened in text mode
    using the UTF-8 encoding.

    Any additional keyword arguments will be passed on to the reader and
    writer: see their documentation for available options.

    The resulting reader or writer must be closed after use by the
    caller, which will also close the opened file. This can be done by
    calling ``.close()``, but the easiest way to ensure proper resource
    finalisation is to use a ``with`` block (context manager), e.g.

    ::

        with jsonlines.open('out.jsonl', mode='w') as writer:
            writer.write(...)

    :param name: name of the file to open
    :param str mode: whether to open the file for reading (``r``),
      writing (``w``) or appending (``a``).
    :param \*\*kwargs: additional arguments, forwarded to the reader or writer
    :raises ValueError: if `mode` is not one of the supported modes
    """
    if mode not in {'r', 'w', 'a'}:
        raise ValueError("'mode' must be either 'r', 'w', or 'a'")

    fp = io.open(name, mode=mode + 't', encoding='utf-8')
    wrapper_cls = Reader if mode == 'r' else Writer
    try:
        instance = wrapper_cls(fp, **kwargs)
    except Exception:
        # The caller never receives an object to close when construction
        # fails (e.g. on an unexpected keyword argument), so release the
        # file handle here instead of leaking it.
        fp.close()
        raise

    # The file was opened here, so closing the reader/writer closes it.
    instance._should_close_fp = True
    return instance