diff --git a/README.md b/README.md
index a171d8f..70cc285 100644
--- a/README.md
+++ b/README.md
@@ -136,11 +136,11 @@ data = {"foo": "bar", "baz": 123}
srsly.write_json("/path/to/file.json", data)
```
-| Argument | Type | Description |
-| ---------- | ------------ | ------------------------------------------------------ |
-| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
-| `data` | - | The JSON-serializable data to output. |
-| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
+| Argument | Type | Description |
+| -------- | ------------ | ------------------------------------------------------ |
+| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
+| `data` | - | The JSON-serializable data to output. |
+| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
#### function `srsly.read_json`
@@ -152,7 +152,7 @@ data = srsly.read_json("/path/to/file.json")
| Argument | Type | Description |
| ----------- | ------------ | ------------------------------------------ |
-| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
+| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
| **RETURNS** | dict / list | The loaded JSON content. |
#### function `srsly.write_gzip_json`
@@ -164,11 +164,27 @@ data = {"foo": "bar", "baz": 123}
srsly.write_gzip_json("/path/to/file.json.gz", data)
```
-| Argument | Type | Description |
-| ---------- | ------------ | ------------------------------------------------------ |
-| `path` | str / `Path` | The file path. |
-| `data` | - | The JSON-serializable data to output. |
-| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
+| Argument | Type | Description |
+| -------- | ------------ | ------------------------------------------------------ |
+| `path` | str / `Path` | The file path. |
+| `data` | - | The JSON-serializable data to output. |
+| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. |
+
+#### function `srsly.write_gzip_jsonl`
+
+Create a gzipped JSONL file and dump contents.
+
+```python
+data = [{"foo": "bar"}, {"baz": 123}]
+srsly.write_gzip_jsonl("/path/to/file.jsonl.gz", data)
+```
+
+| Argument | Type | Description |
+| ----------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | The file path. |
+| `lines`           | iterable     | The JSON-serializable contents of each line. |
+| `append` | bool | Whether or not to append to the location. Appending to .gz files is generally not recommended, as it doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly compressed. |
+| `append_new_line` | bool | Whether or not to write a new line before appending to the file. |
#### function `srsly.read_gzip_json`
@@ -180,9 +196,22 @@ data = srsly.read_gzip_json("/path/to/file.json.gz")
| Argument | Type | Description |
| ----------- | ------------ | ------------------------ |
-| `path` | str / `Path` | The file path. |
+| `path` | str / `Path` | The file path. |
| **RETURNS** | dict / list | The loaded JSON content. |
+#### function `srsly.read_gzip_jsonl`
+
+Load gzipped JSONL from a file.
+
+```python
+data = srsly.read_gzip_jsonl("/path/to/file.jsonl.gz")
+```
+
+| Argument | Type | Description |
+| ----------- | ------------ | ------------------------- |
+| `path` | str / `Path` | The file path. |
+| **YIELDS**  | -            | The loaded JSON contents of each line. |
+
#### function `srsly.write_jsonl`
Create a JSONL file (newline-delimited JSON) and dump contents line by line, or
@@ -195,7 +224,7 @@ srsly.write_jsonl("/path/to/file.jsonl", data)
| Argument | Type | Description |
| ----------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------- |
-| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
+| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
| `lines` | iterable | The JSON-serializable lines. |
| `append` | bool | Append to an existing file. Will open it in `"a"` mode and insert a newline before writing lines. Defaults to `False`. |
| `append_new_line` | bool | Defines whether a new line should first be written when appending to an existing file. Defaults to `True`. |
@@ -211,7 +240,7 @@ data = srsly.read_jsonl("/path/to/file.jsonl")
| Argument | Type | Description |
| ---------- | ---------- | -------------------------------------------------------------------- |
-| `path` | str / Path | The file path or `"-"` to read from stdin. |
+| `path` | str / Path | The file path or `"-"` to read from stdin. |
| `skip` | bool | Skip broken lines and don't raise `ValueError`. Defaults to `False`. |
| **YIELDS** | - | The loaded JSON contents of each line. |
@@ -272,10 +301,10 @@ data = {"foo": "bar", "baz": 123}
srsly.write_msgpack("/path/to/file.msg", data)
```
-| Argument | Type | Description |
-| ---------- | ------------ | ---------------------- |
-| `path` | str / `Path` | The file path. |
-| `data` | - | The data to serialize. |
+| Argument | Type | Description |
+| -------- | ------------ | ---------------------- |
+| `path` | str / `Path` | The file path. |
+| `data` | - | The data to serialize. |
#### function `srsly.read_msgpack`
@@ -287,7 +316,7 @@ data = srsly.read_msgpack("/path/to/file.msg")
| Argument | Type | Description |
| ----------- | ------------ | --------------------------------------------------------------------------------------- |
-| `path` | str / `Path` | The file path. |
+| `path` | str / `Path` | The file path. |
| `use_list` | bool | Don't use tuples instead of lists. Can make deserialization slower. Defaults to `True`. |
| **RETURNS** | - | The loaded and deserialized content. |
@@ -343,7 +372,7 @@ yaml_string = srsly.yaml_dumps(data)
| ----------------- | ---- | ------------------------------------------ |
| `data` | - | The JSON-serializable data to output. |
| `indent_mapping` | int | Mapping indentation. Defaults to `2`. |
-| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
+| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
| `indent_offset` | int | Indentation offset. Defaults to `2`. |
| `sort_keys` | bool | Sort dictionary keys. Defaults to `False`. |
| **RETURNS** | str | The serialized string. |
@@ -373,10 +402,10 @@ srsly.write_yaml("/path/to/file.yml", data)
| Argument | Type | Description |
| ----------------- | ------------ | ------------------------------------------ |
-| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
+| `path` | str / `Path` | The file path or `"-"` to write to stdout. |
| `data` | - | The JSON-serializable data to output. |
| `indent_mapping` | int | Mapping indentation. Defaults to `2`. |
-| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
+| `indent_sequence` | int | Sequence indentation. Defaults to `4`. |
| `indent_offset` | int | Indentation offset. Defaults to `2`. |
| `sort_keys` | bool | Sort dictionary keys. Defaults to `False`. |
@@ -390,7 +419,7 @@ data = srsly.read_yaml("/path/to/file.yml")
| Argument | Type | Description |
| ----------- | ------------ | ------------------------------------------ |
-| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
+| `path` | str / `Path` | The file path or `"-"` to read from stdin. |
| **RETURNS** | dict / list | The loaded YAML content. |
#### function `srsly.is_yaml_serializable`
diff --git a/srsly/_json_api.py b/srsly/_json_api.py
index 900e42b..24d25fd 100644
--- a/srsly/_json_api.py
+++ b/srsly/_json_api.py
@@ -1,4 +1,4 @@
-from typing import Union, Iterable, Sequence, Any, Optional
+from typing import Union, Iterable, Sequence, Any, Optional, Iterator
import sys
import json as _builtin_json
import gzip
@@ -56,14 +56,27 @@ def read_json(path: FilePath) -> JSONOutput:
def read_gzip_json(path: FilePath) -> JSONOutput:
"""Load JSON from a gzipped file.
- location (FilePath): The file path.
- RETURNS (JSONOutput): The loaded JSON content.
+    path (FilePath): The file path.
+ RETURNS (JSONOutput): The loaded JSON content.
"""
file_path = force_string(path)
with gzip.open(file_path, "r") as f:
return ujson.load(f)
+def read_gzip_jsonl(path: FilePath, skip: bool = False) -> Iterator[JSONOutput]:
+ """Read a gzipped .jsonl file and yield contents line by line.
+ Blank lines will always be skipped.
+
+ path (FilePath): The file path.
+ skip (bool): Skip broken lines and don't raise ValueError.
+ YIELDS (JSONOutput): The unpacked, deserialized Python objects.
+ """
+ with gzip.open(force_path(path), "r") as f:
+ for line in _yield_json_lines(f, skip=skip):
+ yield line
+
+
def write_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
"""Create a .json file and dump contents or write to standard
output.
@@ -94,6 +107,30 @@ def write_gzip_json(path: FilePath, data: JSONInput, indent: int = 2) -> None:
f.write(json_data.encode("utf-8"))
+def write_gzip_jsonl(
+ path: FilePath,
+ lines: Iterable[JSONInput],
+ append: bool = False,
+ append_new_line: bool = True,
+) -> None:
+ """Create a .jsonl.gz file and dump contents.
+
+    path (FilePath): The file path.
+    lines (Iterable[JSONInput]): The JSON-serializable contents of each line.
+ append (bool): Whether or not to append to the location. Appending to .gz files is generally not recommended, as it
+ doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly
+ compressed.
+ append_new_line (bool): Whether or not to write a new line before appending
+ to the file.
+ """
+ mode = "a" if append else "w"
+ file_path = force_path(path, require_exists=False)
+ with gzip.open(file_path, mode=mode) as f:
+ if append and append_new_line:
+ f.write("\n".encode("utf-8"))
+ f.writelines([(json_dumps(line) + "\n").encode("utf-8") for line in lines])
+
+
def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]:
"""Read a .jsonl file or standard input and yield contents line by line.
Blank lines will always be skipped.
diff --git a/srsly/tests/cloudpickle/cloudpickle_test.py b/srsly/tests/cloudpickle/cloudpickle_test.py
index 1d33369..b293c53 100644
--- a/srsly/tests/cloudpickle/cloudpickle_test.py
+++ b/srsly/tests/cloudpickle/cloudpickle_test.py
@@ -872,8 +872,10 @@ def test_builtin_classicmethod(self):
@pytest.mark.skipif(
(platform.machine() == "aarch64" and sys.version_info[:2] >= (3, 10))
or platform.python_implementation() == "PyPy"
- or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8)),
- reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8")
+ or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8))
+ # Skipping tests on 3.11 due to https://github.com/cloudpipe/cloudpickle/pull/486.
+ or sys.version_info[:2] == (3, 11),
+ reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8+ and 3.11")
def test_builtin_classmethod(self):
obj = 1.5 # float object
diff --git a/srsly/tests/test_json_api.py b/srsly/tests/test_json_api.py
index dc23952..89ce400 100644
--- a/srsly/tests/test_json_api.py
+++ b/srsly/tests/test_json_api.py
@@ -4,7 +4,14 @@
import gzip
import numpy
-from .._json_api import read_json, write_json, read_jsonl, write_jsonl
+from .._json_api import (
+ read_json,
+ write_json,
+ read_jsonl,
+ write_jsonl,
+ read_gzip_jsonl,
+ write_gzip_jsonl,
+)
from .._json_api import write_gzip_json, json_dumps, is_json_serializable
from .._json_api import json_loads
from ..util import force_string
@@ -204,3 +211,54 @@ def test_unsupported_type_error():
f = numpy.float32()
with pytest.raises(TypeError):
s = json_dumps(f)
+
+
+def test_write_jsonl_gzip():
+ """Tests writing data to a gzipped .jsonl file."""
+ data = [{"hello": "world"}, {"test": 123}]
+ expected = ['{"hello":"world"}\n', '{"test":123}\n']
+
+ with make_tempdir() as temp_dir:
+ file_path = temp_dir / "tmp.json"
+ write_gzip_jsonl(file_path, data)
+ with gzip.open(file_path, "r") as f:
+ assert [line.decode("utf8") for line in f.readlines()] == expected
+
+
+def test_write_jsonl_gzip_append():
+ """Tests appending data to a gzipped .jsonl file."""
+ data = [{"hello": "world"}, {"test": 123}]
+ expected = [
+ '{"hello":"world"}\n',
+ '{"test":123}\n',
+ "\n",
+ '{"hello":"world"}\n',
+ '{"test":123}\n',
+ ]
+ with make_tempdir() as temp_dir:
+ file_path = temp_dir / "tmp.json"
+ write_gzip_jsonl(file_path, data)
+ write_gzip_jsonl(file_path, data, append=True)
+ with gzip.open(file_path, "r") as f:
+ assert [line.decode("utf8") for line in f.readlines()] == expected
+
+
+def test_read_jsonl_gzip():
+ """Tests reading data from a gzipped .jsonl file."""
+ file_contents = [{"hello": "world"}, {"test": 123}]
+ with make_tempdir() as temp_dir:
+ file_path = temp_dir / "tmp.json"
+ with gzip.open(file_path, "w") as f:
+ f.writelines(
+ [(json_dumps(line) + "\n").encode("utf-8") for line in file_contents]
+ )
+ assert file_path.exists()
+ data = read_gzip_jsonl(file_path)
+ # Make sure this returns a generator, not just a list
+ assert not hasattr(data, "__len__")
+ data = list(data)
+ assert len(data) == 2
+ assert len(data[0]) == 1
+ assert len(data[1]) == 1
+ assert data[0]["hello"] == "world"
+ assert data[1]["test"] == 123