Skip to content

Commit

Permalink
Adding Python facing API for nested JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
cjermain committed May 22, 2022
1 parent 0b69a1f commit e682abc
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 14 deletions.
15 changes: 8 additions & 7 deletions examples/json_path/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,21 @@ fn main() -> Result<()> {
let s = Series::new(
"json",
[
r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#,
r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#,
Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#),
Some(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#),
None,
]
);
let ca = s.utf8()?;

dbg!(ca);
dbg!(ca.str_lengths().into_series());
dbg!(ca.json_path_select("$.a")?);
dbg!(ca.json_path_extract("$.a")?);
dbg!(ca.json_path_extract("$.a", None)?);
dbg!(ca.json_path_select("$.b")?);
dbg!(ca.json_path_extract("$.b")?);
dbg!(ca.json_path_extract("$.b")?.dtype());
dbg!(ca.json_path_extract("$.b[:].c")?);
dbg!(ca.json_path_extract("$.b[:].c")?.dtype());
dbg!(ca.json_path_extract("$.b", None)?);
dbg!(ca.json_path_extract("$.b", None)?.dtype());
dbg!(ca.json_path_extract("$.b[:].c", None)?);
dbg!(ca.json_path_extract("$.b[:].c", None)?.dtype());
Ok(())
}
18 changes: 11 additions & 7 deletions polars/polars-core/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ impl Utf8Chunked {
/// When None is passed for the number of rows, all rows are inspected.
pub fn json_infer(&self, number_of_rows: Option<usize>) -> Result<DataType> {
let values_iter = self
.into_no_null_iter()
.into_iter()
.map(|x| x.unwrap_or("null"))
.take(number_of_rows.unwrap_or(self.len()));

ndjson::read::infer_iter(values_iter)
Expand All @@ -69,12 +70,17 @@ impl Utf8Chunked {


/// Extracts a JSON value for each row in the Utf8Chunked
pub fn json_deserialize(&self, data_type: DataType) -> Result<Series> {
pub fn json_extract(&self, dtype: Option<DataType>) -> Result<Series> {
let dtype = match dtype {
Some(dt) => dt,
None => self.json_infer(None)?,
};

let iter = self
.into_iter()
.map(|x| x.unwrap_or("null"));

let array = ndjson::read::deserialize_iter(iter, data_type.to_arrow())
let array = ndjson::read::deserialize_iter(iter, dtype.to_arrow())
.map_err(|e| PolarsError::ComputeError(
format!("error deserializing JSON {:?}", e).into(),
))?;
Expand All @@ -91,10 +97,8 @@ impl Utf8Chunked {
}
}

pub fn json_path_extract(&self, json_path: &str) -> Result<Series> {
pub fn json_path_extract(&self, json_path: &str, dtype: Option<DataType>) -> Result<Series> {
let selected_json = self.json_path_select(json_path)?;

let data_type = selected_json.json_infer(None)?;
selected_json.json_deserialize(data_type)
selected_json.json_extract(dtype)
}
}
138 changes: 138 additions & 0 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3971,6 +3971,144 @@ def json_path_match(self, json_path: str) -> Series:
"""
return wrap_s(self._s.str_json_path_match(json_path))

def json_infer(self, number_of_rows: Optional[int] = None) -> DataType:
    """
    Infer the DataType of the JSON values stored in each row of the Series.

    The number of rows to inspect can optionally be limited. By default,
    all rows are inspected.

    Parameters
    ----------
    number_of_rows
        An integer number of rows to consider for inferring the type

    Returns
    -------
    DataType. Contains null if original value is null or values are not JSON format

    Examples
    --------
    >>> s = pl.Series("a", ['{"b": null}', '{"b": 5}', '{"b": 1}', None])
    >>> s.str.json_infer()
    Struct[Field("b": <class 'polars.datatypes.Int64'>)]

    """
    # The inference itself is delegated to the underlying native series.
    inferred = self._s.str_json_infer(number_of_rows)
    return inferred

def json_extract(self, dtype: Optional[Type[DataType]] = None) -> Series:
    """
    Extract a JSON value from each row of the Series.

    If the row does not contain JSON or is null, the value for that row will
    be null. Unless a data type is supplied, the common data type for all
    rows is inferred.

    Parameters
    ----------
    dtype
        An optional DataType matching the schema of the JSON

    Returns
    -------
    Series. Contains null if original value is null or values are not JSON format

    Examples
    --------
    >>> s = pl.Series("a", ['{"b": null}', '{"b": 5}', '{"b": 1}', None])
    >>> s.str.json_extract()
    shape: (4,)
    Series: '' [struct[1]]
    [
            {null}
            {5}
            {1}
            {null}
    ]

    """
    # Delegate to the native series, then wrap the result back into a Series.
    extracted = self._s.str_json_extract(dtype)
    return wrap_s(extracted)

def json_path_select(self, json_path: str) -> Series:
    """
    Select the JSON fields matching the provided JsonPath expression.

    The selected fields are returned as their string representation.
    Documentation on JSONPath standard: https://goessner.net/articles/JsonPath/

    Parameters
    ----------
    json_path
        A valid JSON path query string

    Returns
    -------
    Utf8 array. Contains null if original value is null or the json_path returns nothing.

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {"json_val": ['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}']}
    ... )
    >>> df.select(pl.col("json_val").str.json_path_select("$.a"))[:, 0]
    shape: (5,)
    Series: 'json_val' [str]
    [
        "1"
        null
        "2"
        "2.1"
        "true"
    ]

    """
    # Delegate to the native series, then wrap the result back into a Series.
    selected = self._s.str_json_path_select(json_path)
    return wrap_s(selected)

def json_path_extract(
    self, json_path: str, dtype: Optional[Type[DataType]] = None
) -> Series:
    """
    Extract the JSON fields matching the provided JsonPath expression.

    The result is an appropriately typed Series. A data type can optionally
    be provided to specify the schema for the JSON fields being extracted;
    by default the data type is inferred by inspecting all of the rows in
    the Series.
    Documentation on JSONPath standard: https://goessner.net/articles/JsonPath/

    Parameters
    ----------
    json_path
        A valid JSON path query string
    dtype
        A DataType for the JSON to be deserialized into

    Returns
    -------
    Series. Contains null if original value is null or the json_path returns nothing.

    Examples
    --------
    >>> s = pl.Series("a", ['{"b": null}', '{"b": [5]}', '{"b": [1, 2]}', None])
    >>> s.str.json_path_extract('$.b[0]')
    shape: (4,)
    Series: '' [i64]
    [
            null
            5
            1
            null
    ]
    >>> s.str.json_path_extract('$.b[-1]')
    shape: (4,)
    Series: '' [i64]
    [
            null
            5
            2
            null
    ]

    """
    # Delegate to the native series, then wrap the result back into a Series.
    extracted = self._s.str_json_path_extract(json_path, dtype)
    return wrap_s(extracted)

def extract(self, pattern: str, group_index: int = 1) -> Series:
r"""
Extract the target capture group from provided patterns.
Expand Down
35 changes: 35 additions & 0 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,41 @@ impl PySeries {
Ok(s.into())
}

// Infers the data type of the JSON strings held in this series and returns
// it as a Python object.
//
// `number_of_rows` is forwarded unchanged to `json_infer`. Errors from the
// Utf8 downcast or from inference are converted through `PyPolarsErr`.
pub fn str_json_infer(
    &self,
    py: Python,
    number_of_rows: Option<usize>,
) -> PyResult<PyObject> {
    let ca = self.series.utf8().map_err(PyPolarsErr::from)?;
    let dtype = ca.json_infer(number_of_rows).map_err(PyPolarsErr::from)?;
    // `dtype` is owned and not used afterwards, so it is moved into the
    // wrapper instead of being cloned.
    Ok(Wrap(dtype).to_object(py))
}

// Deserializes the JSON string in each row into a typed series.
//
// `dtype` is the optional target data type; it is unwrapped from the
// Python-facing `Wrap` and forwarded to `json_extract`. Errors from the Utf8
// downcast or from extraction are converted through `PyPolarsErr`.
pub fn str_json_extract(&self, dtype: Option<Wrap<DataType>>) -> PyResult<Self> {
    let utf8 = self.series.utf8().map_err(PyPolarsErr::from)?;
    let requested = dtype.map(|wrapped| wrapped.0);
    let extracted = utf8.json_extract(requested).map_err(PyPolarsErr::from)?;
    Ok(extracted.into_series().into())
}

// Applies the JSONPath expression `path` to each row and returns the
// selection as a new series.
//
// Errors from the Utf8 downcast or from the selection are converted through
// `PyPolarsErr`.
pub fn str_json_path_select(&self, path: &str) -> PyResult<Self> {
    let utf8 = self.series.utf8().map_err(PyPolarsErr::from)?;
    let selected = utf8.json_path_select(path).map_err(PyPolarsErr::from)?;
    Ok(selected.into_series().into())
}

// Applies the JSONPath expression `path` to each row and deserializes the
// selection into a typed series.
//
// `dtype` is the optional target data type; it is unwrapped from the
// Python-facing `Wrap` and forwarded to `json_path_extract`. Errors from the
// Utf8 downcast or from extraction are converted through `PyPolarsErr`.
pub fn str_json_path_extract(
    &self,
    path: &str,
    dtype: Option<Wrap<DataType>>,
) -> PyResult<Self> {
    let utf8 = self.series.utf8().map_err(PyPolarsErr::from)?;
    let requested = dtype.map(|wrapped| wrapped.0);
    let extracted = utf8
        .json_path_extract(path, requested)
        .map_err(PyPolarsErr::from)?;
    Ok(extracted.into_series().into())
}

pub fn str_extract(&self, pat: &str, group_index: usize) -> PyResult<Self> {
let ca = self.series.utf8().map_err(PyPolarsErr::from)?;
let s = ca
Expand Down

0 comments on commit e682abc

Please sign in to comment.