Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python,rust): Typed JsonPath implementation #3413

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ members = [
"examples/read_csv",
"examples/read_parquet",
"examples/python_rust_compiled_function",
"examples/json_path",
]

[patch.crates-io]
Expand Down
9 changes: 9 additions & 0 deletions examples/json_path/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[package]
name = "json_path"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
polars = { path = "../../polars", features = ["strings", "json", "extract_jsonpath", "dtype-struct"] }
24 changes: 24 additions & 0 deletions examples/json_path/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
use polars::prelude::*;

/// Example: runs the JSON string helpers on a small Utf8 series and prints each result.
fn main() -> Result<()> {
// Three rows of JSON text; the last row is a null entry.
let s = Series::new(
"json",
[
Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#),
Some(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#),
None,
],
);
// `utf8()` is fallible; `?` returns its error from `main`.
let ca = s.utf8()?;

// `dbg!` prints each expression together with its value.
dbg!(ca);
dbg!(ca.str_lengths().into_series());
// `json_path_select` keeps the matches as JSON text, while `json_path_extract`
// deserializes them into a typed Series; passing `None` as the dtype lets the
// data type be inferred from the selected values.
dbg!(ca.json_path_select("$.a")?);
dbg!(ca.json_path_extract("$.a", None)?);
dbg!(ca.json_path_select("$.b")?);
dbg!(ca.json_path_extract("$.b", None)?);
dbg!(ca.json_path_extract("$.b", None)?.dtype());
dbg!(ca.json_path_extract("$.b[:].c", None)?);
dbg!(ca.json_path_extract("$.b[:].c", None)?.dtype());
Ok(())
}
66 changes: 65 additions & 1 deletion polars/polars-core/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::prelude::*;
use arrow::io::ndjson;
use jsonpath_lib::PathCompiled;
use serde_json::Value;
use std::borrow::Cow;
Expand All @@ -18,10 +19,29 @@ fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a, st
})
}

/// Selects every node of `json_str` matched by `expr` and re-serializes the
/// selection as JSON text.
///
/// * no match: `None`
/// * exactly one match: that value serialized on its own
/// * several matches: the matches serialized as a JSON array
///
/// `None` is also returned when `json_str` is not valid JSON, when the
/// selection itself fails, or when the result cannot be serialized.
#[cfg(feature = "extract_jsonpath")]
fn select_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a, str>> {
    // TODO: a lot of heap allocations here. Improve json path by adding a take?
    let value: Value = serde_json::from_str(json_str).ok()?;
    let matches = expr.select(&value).ok()?;

    let serialized = match matches.as_slice() {
        [] => return None,
        [single] => serde_json::to_string(single),
        _ => serde_json::to_string(&matches),
    };

    // The serialized `String` is already owned, so it is moved into the `Cow`
    // instead of being cloned.
    serialized.ok().map(Cow::Owned)
}

#[cfg(feature = "extract_jsonpath")]
impl Utf8Chunked {
/// Extract json path, first match
/// Refer to <https://goessner.net/articles/JsonPath/>
#[cfg(feature = "extract_jsonpath")]
pub fn json_path_match(&self, json_path: &str) -> Result<Utf8Chunked> {
match PathCompiled::compile(json_path) {
Ok(pat) => Ok(self.apply_on_opt(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))),
Expand All @@ -30,4 +50,48 @@ impl Utf8Chunked {
)),
}
}

/// Infers the [`DataType`] described by the JSON text stored in this array.
///
/// At most `number_of_rows` leading rows are inspected; when `None` is
/// passed, every row is inspected. Null rows are handed to the inference as
/// the JSON literal `null`.
///
/// # Errors
/// Returns a `ComputeError` when the ndjson inference fails.
pub fn json_infer(&self, number_of_rows: Option<usize>) -> Result<DataType> {
    let limit = number_of_rows.unwrap_or(self.len());
    let rows = self
        .into_iter()
        .map(|row| row.unwrap_or("null"))
        .take(limit);

    match ndjson::read::infer_iter(rows) {
        Ok(arrow_dtype) => Ok(DataType::from(&arrow_dtype)),
        Err(e) => Err(PolarsError::ComputeError(
            format!("error infering JSON {:?}", e).into(),
        )),
    }
}

/// Deserializes the JSON text of every row into a typed [`Series`].
///
/// When `dtype` is `None`, the data type is first inferred from all rows via
/// [`Self::json_infer`]. Null rows are deserialized as the JSON literal
/// `null`. The returned series has an empty name.
///
/// # Errors
/// Returns a `ComputeError` when inference or deserialization fails.
pub fn json_extract(&self, dtype: Option<DataType>) -> Result<Series> {
    let target = if let Some(dt) = dtype {
        dt
    } else {
        self.json_infer(None)?
    };

    let rows = self.into_iter().map(|row| row.unwrap_or("null"));

    match ndjson::read::deserialize_iter(rows, target.to_arrow()) {
        Ok(array) => Series::try_from(("", array)),
        Err(e) => Err(PolarsError::ComputeError(
            format!("error deserializing JSON {:?}", e).into(),
        )),
    }
}

pub fn json_path_select(&self, json_path: &str) -> Result<Utf8Chunked> {
match PathCompiled::compile(json_path) {
Ok(pat) => Ok(self.apply_on_opt(|opt_s| opt_s.and_then(|s| select_json(&pat, s)))),
Err(e) => Err(PolarsError::ComputeError(
format!("error compiling JSONpath expression {:?}", e).into(),
)),
}
}

/// Selects `json_path` in every row and deserializes the selection into a
/// typed [`Series`].
///
/// This is `json_path_select` followed by `json_extract`; `dtype` is passed
/// through to the latter, so `None` means the data type is inferred.
pub fn json_path_extract(&self, json_path: &str, dtype: Option<DataType>) -> Result<Series> {
    self.json_path_select(json_path)?.json_extract(dtype)
}
}
140 changes: 140 additions & 0 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3971,6 +3971,146 @@ def json_path_match(self, json_path: str) -> Series:
"""
return wrap_s(self._s.str_json_path_match(json_path))

def json_infer(self, number_of_rows: Optional[int] = None) -> DataType:
    """
    Infer the data type described by the JSON documents stored in this Series.

    Null rows are treated as the JSON literal ``null`` during inference.

    Parameters
    ----------
    number_of_rows
        Maximum number of leading rows to inspect. When ``None`` (the default),
        every row is inspected.

    Returns
    -------
    The inferred DataType.

    Examples
    --------

    >>> s = pl.Series("a", ['{"b": null}', '{"b": 5}', '{"b": 1}', None])
    >>> s.str.json_infer()
    Struct[Field("b": <class 'polars.datatypes.Int64'>)]

    """
    inferred = self._s.str_json_infer(number_of_rows)
    return inferred

def json_extract(self, dtype: Optional[Type[DataType]] = None) -> Series:
    """
    Parse every row of the Series as JSON and return the values as a typed Series.

    A null row yields a null value. When no data type is given, the common
    data type is inferred from all rows.

    Parameters
    ----------
    dtype
        Optional DataType describing the schema of the JSON. Inferred when omitted.

    Returns
    -------
    Series holding the deserialized values; its name is empty.

    Examples
    --------

    >>> s = pl.Series("a", ['{"b": null}', '{"b": 5}', '{"b": 1}', None])
    >>> s.str.json_extract()
    shape: (4,)
    Series: '' [struct[1]]
    [
        {null}
        {5}
        {1}
        {null}
    ]

    """
    extracted = self._s.str_json_extract(dtype)
    return wrap_s(extracted)

def json_path_select(self, json_path: str) -> Series:
    """
    Select the JSON nodes matching a JsonPath expression and return them as
    JSON text.

    A single match is returned serialized on its own; several matches are
    returned serialized as a JSON array.
    Documentation on JSONPath standard: https://goessner.net/articles/JsonPath/

    Parameters
    ----------
    json_path
        A valid JSON path query string

    Returns
    -------
    Utf8 Series. A row is null if the original value is null or if the
    json_path selects nothing.

    Examples
    --------

    >>> df = pl.DataFrame(
    ...     {"json_val": ['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}']}
    ... )
    >>> df.select(pl.col("json_val").str.json_path_select("$.a"))[:, 0]
    shape: (5,)
    Series: 'json_val' [str]
    [
        "1"
        null
        "2"
        "2.1"
        "true"
    ]

    """
    # NOTE(review): the example output above looks copied from json_path_match.
    # The selection is re-serialized as JSON, so the string match in row 0
    # presumably keeps its own double quotes - confirm the rendered output.
    selected = self._s.str_json_path_select(json_path)
    return wrap_s(selected)

def json_path_extract(
    self, json_path: str, dtype: Optional[Type[DataType]] = None
) -> Series:
    """
    Select the JSON nodes matching a JsonPath expression and deserialize them
    into a typed Series.

    The data type of the selected values may be passed explicitly; when it is
    omitted, it is inferred by inspecting the selection of every row.
    Documentation on JSONPath standard: https://goessner.net/articles/JsonPath/

    Parameters
    ----------
    json_path
        A valid JSON path query string
    dtype
        Optional DataType the selected JSON is deserialized into

    Returns
    -------
    Series. A row is null if the original value is null or if the json_path
    selects nothing.

    Examples
    --------

    >>> s = pl.Series("a", ['{"b": null}', '{"b": [5]}', '{"b": [1, 2]}', None])
    >>> s.str.json_path_extract("$.b[0]")
    shape: (4,)
    Series: '' [i64]
    [
        null
        5
        1
        null
    ]
    >>> s.str.json_path_extract("$.b[-1]")
    shape: (4,)
    Series: '' [i64]
    [
        null
        5
        2
        null
    ]

    """
    extracted = self._s.str_json_path_extract(json_path, dtype)
    return wrap_s(extracted)

def extract(self, pattern: str, group_index: int = 1) -> Series:
r"""
Extract the target capture group from provided patterns.
Expand Down
37 changes: 37 additions & 0 deletions py-polars/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,43 @@ impl PySeries {
Ok(s.into())
}

// Infers the data type of the JSON text in this Utf8 series and returns it
// as a Python object. `number_of_rows` limits how many leading rows are
// inspected; `None` inspects all of them.
pub fn str_json_infer(&self, py: Python, number_of_rows: Option<usize>) -> PyResult<PyObject> {
    let ca = self.series.utf8().map_err(PyPolarsErr::from)?;
    let dtype = ca.json_infer(number_of_rows).map_err(PyPolarsErr::from)?;
    // `dtype` is owned and not used again, so it is moved into the wrapper
    // rather than cloned.
    Ok(Wrap(dtype).to_object(py))
}

// Deserializes the JSON text of every row into a typed series. When `dtype`
// is `None` the data type is inferred by `json_extract`.
pub fn str_json_extract(&self, dtype: Option<Wrap<DataType>>) -> PyResult<Self> {
    let ca = self.series.utf8().map_err(PyPolarsErr::from)?;
    // `json_extract` already returns a `Series`, so no further conversion
    // is needed before wrapping it.
    let s = ca
        .json_extract(dtype.map(|x| x.0))
        .map_err(PyPolarsErr::from)?;
    Ok(s.into())
}

// Applies the JSONPath expression `path` to every row of this Utf8 series
// and returns the selected JSON text as a new series.
pub fn str_json_path_select(&self, path: &str) -> PyResult<Self> {
    let utf8 = self.series.utf8().map_err(PyPolarsErr::from)?;
    let selected = utf8.json_path_select(path).map_err(PyPolarsErr::from)?;
    Ok(selected.into_series().into())
}

// Selects `path` in every row of this Utf8 series and deserializes the
// selection into a typed series. When `dtype` is `None` the data type is
// inferred by `json_path_extract`.
pub fn str_json_path_extract(
    &self,
    path: &str,
    dtype: Option<Wrap<DataType>>,
) -> PyResult<Self> {
    let ca = self.series.utf8().map_err(PyPolarsErr::from)?;
    // `json_path_extract` already returns a `Series`, so no further
    // conversion is needed before wrapping it.
    let s = ca
        .json_path_extract(path, dtype.map(|x| x.0))
        .map_err(PyPolarsErr::from)?;
    Ok(s.into())
}

pub fn str_extract(&self, pat: &str, group_index: usize) -> PyResult<Self> {
let ca = self.series.utf8().map_err(PyPolarsErr::from)?;
let s = ca
Expand Down