From e937aac16fadce52e60b9d59180456eabfc9dec7 Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Sat, 14 May 2022 15:05:48 -0400 Subject: [PATCH 1/7] Adding basic example of typed JsonPath matching --- Cargo.toml | 1 + examples/json_path/Cargo.toml | 10 +++ examples/json_path/src/main.rs | 28 ++++++++ .../src/chunked_array/strings/json_path.rs | 65 +++++++++++++++++-- 4 files changed, 98 insertions(+), 6 deletions(-) create mode 100644 examples/json_path/Cargo.toml create mode 100644 examples/json_path/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 62adc8822687..5a52323d6ce7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ members = [ "examples/read_csv", "examples/read_parquet", "examples/python_rust_compiled_function", + "examples/json_path", ] [patch.crates-io] diff --git a/examples/json_path/Cargo.toml b/examples/json_path/Cargo.toml new file mode 100644 index 000000000000..16ef66644378 --- /dev/null +++ b/examples/json_path/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "json_path" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +polars = { path = "../../polars", features = ["strings", "json", "extract_jsonpath", "dtype-struct"] } +serde_json = { version = "1" } diff --git a/examples/json_path/src/main.rs b/examples/json_path/src/main.rs new file mode 100644 index 000000000000..f33bcdd6e680 --- /dev/null +++ b/examples/json_path/src/main.rs @@ -0,0 +1,28 @@ +use polars::prelude::*; +use serde_json::json; + +fn main() -> Result<()> { + //let s: Series = [ + //&json!(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#).to_string(), + //&json!(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#).to_string(), + //].iter().collect(); + let s = Series::new( + "json", + [ + r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#, + r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#, + ] + ); + let ca = s.utf8()?; + + dbg!(ca); + dbg!(ca.str_lengths().into_series()); + 
dbg!(ca.json_path_match("$.a")?); + dbg!(ca.json_path_extract("$.a")?); + dbg!(ca.json_path_match("$.b")?); + dbg!(ca.json_path_extract("$.b")?); + dbg!(ca.json_path_extract("$.b")?.dtype()); + dbg!(ca.json_path_extract("$.b[:].c")?); + dbg!(ca.json_path_extract("$.b[:].c")?.dtype()); + Ok(()) +} diff --git a/polars/polars-core/src/chunked_array/strings/json_path.rs b/polars/polars-core/src/chunked_array/strings/json_path.rs index 734a6498e146..8adbf0237334 100644 --- a/polars/polars-core/src/chunked_array/strings/json_path.rs +++ b/polars/polars-core/src/chunked_array/strings/json_path.rs @@ -2,26 +2,35 @@ use crate::prelude::*; use jsonpath_lib::PathCompiled; use serde_json::Value; use std::borrow::Cow; +use indexmap::set::IndexSet as HashSet; +use arrow::io::{json, ndjson}; +use arrow::datatypes::DataType as ArrowDataType; + #[cfg(feature = "extract_jsonpath")] fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option> { serde_json::from_str(json_str).ok().and_then(|value| { // TODO: a lot of heap allocations here. Improve json path by adding a take? 
let result = expr.select(&value).ok()?; - let first = *result.get(0)?; - match first { - Value::String(s) => Some(Cow::Owned(s.clone())), - Value::Null => None, - v => Some(Cow::Owned(v.to_string())), + let result_str = match result.len() { + 0 => None, + 1 => serde_json::to_string(&result[0]).ok(), + _ => serde_json::to_string(&result).ok(), + }; + //let first = *result.get(0)?; + + match result_str { + Some(s) => Some(Cow::Owned(s.clone())), + None => None, } }) } +#[cfg(feature = "extract_jsonpath")] impl Utf8Chunked { /// Extract json path, first match /// Refer to <https://goessner.net/articles/JsonPath/> - #[cfg(feature = "extract_jsonpath")] pub fn json_path_match(&self, json_path: &str) -> Result { match PathCompiled::compile(json_path) { Ok(pat) => Ok(self.apply_on_opt(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))), @@ -30,4 +39,48 @@ impl Utf8Chunked { )), } } + + /// Returns the inferred DataType for JSON values for each row + /// in the Utf8Chunked, with an optional number of rows to inspect. + /// When None is passed for the number of rows, all rows are inspected. 
+ pub fn json_infer(&self, number_of_rows: Option) -> Result { + // rechunk to have a continuous array + self.rechunk(); + let chunk = &self.chunks()[0]; + let utf8_array = chunk.as_any().downcast_ref::>().unwrap(); + utf8_array + .json_infer(number_of_rows) + .map(|d| DataType::from(&d)) + .map_err(|e| PolarsError::ComputeError( + format!("error infering JSON {:?}", e).into(), + )) + } + + + /// Extracts a JSON value for each row in the Utf8Chunked + pub fn json_deserialize(&self, data_type: DataType) -> Result { + // rechunk to have a continuous array + self.rechunk(); + let chunk = &self.chunks()[0]; + let utf8_array = chunk.as_any().downcast_ref::>().unwrap(); + let array = utf8_array + .json_deserialize(data_type.to_arrow()) + .map_err(|e| PolarsError::ComputeError( + format!("error deserializing JSON {:?}", e).into(), + ))?; + + Series::try_from(("", array)) + } + + pub fn json_path_extract(&self, json_path: &str) -> Result { + let expr = Compiled::compile(json_path) + .map_err(|e| PolarsError::ComputeError( + format!("error compiling JSONpath expression {:?}", e).into(), + ))?; + + let selected_json = self.apply_on_opt(|opt_s| opt_s.and_then(|s| extract_json(&expr, s))); + + let data_type = selected_json.json_infer(None)?; + selected_json.json_deserialize(data_type) + } } From f8c82badc25a48f288a0e0ce48d3c457f9a20ea8 Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Mon, 16 May 2022 20:13:25 -0400 Subject: [PATCH 2/7] Updating based on arrow2 changes --- examples/json_path/src/main.rs | 4 --- .../src/chunked_array/strings/json_path.rs | 28 ++++++++----------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/examples/json_path/src/main.rs b/examples/json_path/src/main.rs index f33bcdd6e680..27ec7673087d 100644 --- a/examples/json_path/src/main.rs +++ b/examples/json_path/src/main.rs @@ -2,10 +2,6 @@ use polars::prelude::*; use serde_json::json; fn main() -> Result<()> { - //let s: Series = [ - //&json!(r#"{"a": 1, "b": [{"c": 0}, {"c": 
1}]}"#).to_string(), - //&json!(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#).to_string(), - //].iter().collect(); let s = Series::new( "json", [ diff --git a/polars/polars-core/src/chunked_array/strings/json_path.rs b/polars/polars-core/src/chunked_array/strings/json_path.rs index 8adbf0237334..3c7d18fd9c8d 100644 --- a/polars/polars-core/src/chunked_array/strings/json_path.rs +++ b/polars/polars-core/src/chunked_array/strings/json_path.rs @@ -1,10 +1,7 @@ use crate::prelude::*; use jsonpath_lib::PathCompiled; -use serde_json::Value; use std::borrow::Cow; -use indexmap::set::IndexSet as HashSet; -use arrow::io::{json, ndjson}; -use arrow::datatypes::DataType as ArrowDataType; +use arrow::io::ndjson; #[cfg(feature = "extract_jsonpath")] @@ -18,7 +15,6 @@ fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option serde_json::to_string(&result[0]).ok(), _ => serde_json::to_string(&result).ok(), }; - //let first = *result.get(0)?; match result_str { Some(s) => Some(Cow::Owned(s.clone())), @@ -44,12 +40,11 @@ impl Utf8Chunked { /// in the Utf8Chunked, with an optional number of rows to inspect. /// When None is passed for the number of rows, all rows are inspected. 
pub fn json_infer(&self, number_of_rows: Option) -> Result { - // rechunk to have a continuous array - self.rechunk(); - let chunk = &self.chunks()[0]; - let utf8_array = chunk.as_any().downcast_ref::>().unwrap(); - utf8_array - .json_infer(number_of_rows) + let values_iter = self + .into_no_null_iter() + .take(number_of_rows.unwrap_or(self.len())); + + ndjson::read::infer_iter(values_iter) .map(|d| DataType::from(&d)) .map_err(|e| PolarsError::ComputeError( format!("error infering JSON {:?}", e).into(), @@ -59,12 +54,11 @@ impl Utf8Chunked { /// Extracts a JSON value for each row in the Utf8Chunked pub fn json_deserialize(&self, data_type: DataType) -> Result { - // rechunk to have a continuous array - self.rechunk(); - let chunk = &self.chunks()[0]; - let utf8_array = chunk.as_any().downcast_ref::>().unwrap(); - let array = utf8_array - .json_deserialize(data_type.to_arrow()) + let iter = self + .into_iter() + .map(|x| x.unwrap_or("null")); + + let array = ndjson::read::deserialize_iter(iter, data_type.to_arrow()) .map_err(|e| PolarsError::ComputeError( format!("error deserializing JSON {:?}", e).into(), ))?; From 630d97f1e7f2b98916457ab89df370a300e56391 Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Sat, 21 May 2022 11:26:20 -0400 Subject: [PATCH 3/7] Fixing merge conflicts --- examples/json_path/Cargo.toml | 1 - examples/json_path/src/main.rs | 1 - polars/polars-core/src/chunked_array/strings/json_path.rs | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/json_path/Cargo.toml b/examples/json_path/Cargo.toml index 16ef66644378..20f17ee09842 100644 --- a/examples/json_path/Cargo.toml +++ b/examples/json_path/Cargo.toml @@ -7,4 +7,3 @@ edition = "2021" [dependencies] polars = { path = "../../polars", features = ["strings", "json", "extract_jsonpath", "dtype-struct"] } -serde_json = { version = "1" } diff --git a/examples/json_path/src/main.rs b/examples/json_path/src/main.rs index 27ec7673087d..cd8cc9df6e8d 100644 --- 
a/examples/json_path/src/main.rs +++ b/examples/json_path/src/main.rs @@ -1,5 +1,4 @@ use polars::prelude::*; -use serde_json::json; fn main() -> Result<()> { let s = Series::new( diff --git a/polars/polars-core/src/chunked_array/strings/json_path.rs b/polars/polars-core/src/chunked_array/strings/json_path.rs index 3c7d18fd9c8d..8a29c2cc3026 100644 --- a/polars/polars-core/src/chunked_array/strings/json_path.rs +++ b/polars/polars-core/src/chunked_array/strings/json_path.rs @@ -67,7 +67,7 @@ impl Utf8Chunked { } pub fn json_path_extract(&self, json_path: &str) -> Result { - let expr = Compiled::compile(json_path) + let expr = PathCompiled::compile(json_path) .map_err(|e| PolarsError::ComputeError( format!("error compiling JSONpath expression {:?}", e).into(), ))?; From 63d891a31873acf2d958f0fa0a3b10f6c54c1abb Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Sat, 21 May 2022 11:46:22 -0400 Subject: [PATCH 4/7] Updating function signatures --- examples/json_path/src/main.rs | 4 +-- .../src/chunked_array/strings/json_path.rs | 30 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/examples/json_path/src/main.rs b/examples/json_path/src/main.rs index cd8cc9df6e8d..038db682fd05 100644 --- a/examples/json_path/src/main.rs +++ b/examples/json_path/src/main.rs @@ -12,9 +12,9 @@ fn main() -> Result<()> { dbg!(ca); dbg!(ca.str_lengths().into_series()); - dbg!(ca.json_path_match("$.a")?); + dbg!(ca.json_path_select("$.a")?); dbg!(ca.json_path_extract("$.a")?); - dbg!(ca.json_path_match("$.b")?); + dbg!(ca.json_path_select("$.b")?); dbg!(ca.json_path_extract("$.b")?); dbg!(ca.json_path_extract("$.b")?.dtype()); dbg!(ca.json_path_extract("$.b[:].c")?); diff --git a/polars/polars-core/src/chunked_array/strings/json_path.rs b/polars/polars-core/src/chunked_array/strings/json_path.rs index 8a29c2cc3026..3061a10695ce 100644 --- a/polars/polars-core/src/chunked_array/strings/json_path.rs +++ b/polars/polars-core/src/chunked_array/strings/json_path.rs 
@@ -1,11 +1,27 @@ use crate::prelude::*; use jsonpath_lib::PathCompiled; +use serde_json::Value; use std::borrow::Cow; use arrow::io::ndjson; #[cfg(feature = "extract_jsonpath")] fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option> { + serde_json::from_str(json_str).ok().and_then(|value| { + // TODO: a lot of heap allocations here. Improve json path by adding a take? + let result = expr.select(&value).ok()?; + let first = *result.get(0)?; + + match first { + Value::String(s) => Some(Cow::Owned(s.clone())), + Value::Null => None, + v => Some(Cow::Owned(v.to_string())), + } + }) +} + +#[cfg(feature = "extract_jsonpath")] +fn select_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option> { serde_json::from_str(json_str).ok().and_then(|value| { // TODO: a lot of heap allocations here. Improve json path by adding a take? let result = expr.select(&value).ok()?; @@ -66,13 +82,17 @@ impl Utf8Chunked { Series::try_from(("", array)) } - pub fn json_path_extract(&self, json_path: &str) -> Result { - let expr = PathCompiled::compile(json_path) - .map_err(|e| PolarsError::ComputeError( + pub fn json_path_select(&self, json_path: &str) -> Result { + match PathCompiled::compile(json_path) { + Ok(pat) => Ok(self.apply_on_opt(|opt_s| opt_s.and_then(|s| select_json(&pat, s)))), + Err(e) => Err(PolarsError::ComputeError( format!("error compiling JSONpath expression {:?}", e).into(), - ))?; + )), + } + } - let selected_json = self.apply_on_opt(|opt_s| opt_s.and_then(|s| extract_json(&expr, s))); + pub fn json_path_extract(&self, json_path: &str) -> Result { + let selected_json = self.json_path_select(json_path)?; let data_type = selected_json.json_infer(None)?; selected_json.json_deserialize(data_type) From 5ac8a196e1de5ac1b9a6819586321a9a772e17c0 Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Sun, 22 May 2022 08:34:15 -0400 Subject: [PATCH 5/7] Improving Python DataType support for Struct and repr --- py-polars/polars/__init__.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index 07de1fd44277..a2e418a7dd72 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -34,6 +34,7 @@ def version() -> str: List, Null, Object, + Field, Struct, Time, UInt8, From a5a1008f249e1916e6eea446619f9beb525a3505 Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Sun, 22 May 2022 08:34:42 -0400 Subject: [PATCH 6/7] Adding Python facing API for nested JSON --- examples/json_path/src/main.rs | 15 +- .../src/chunked_array/strings/json_path.rs | 18 ++- py-polars/polars/internals/series.py | 138 ++++++++++++++++++ py-polars/src/series.rs | 35 +++++ 4 files changed, 192 insertions(+), 14 deletions(-) diff --git a/examples/json_path/src/main.rs b/examples/json_path/src/main.rs index 038db682fd05..246b4b5f90fd 100644 --- a/examples/json_path/src/main.rs +++ b/examples/json_path/src/main.rs @@ -4,8 +4,9 @@ fn main() -> Result<()> { let s = Series::new( "json", [ - r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#, - r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#, + Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#), + Some(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#), + None, ] ); let ca = s.utf8()?; @@ -13,11 +14,11 @@ fn main() -> Result<()> { dbg!(ca); dbg!(ca.str_lengths().into_series()); dbg!(ca.json_path_select("$.a")?); - dbg!(ca.json_path_extract("$.a")?); + dbg!(ca.json_path_extract("$.a", None)?); dbg!(ca.json_path_select("$.b")?); - dbg!(ca.json_path_extract("$.b")?); - dbg!(ca.json_path_extract("$.b")?.dtype()); - dbg!(ca.json_path_extract("$.b[:].c")?); - dbg!(ca.json_path_extract("$.b[:].c")?.dtype()); + dbg!(ca.json_path_extract("$.b", None)?); + dbg!(ca.json_path_extract("$.b", None)?.dtype()); + dbg!(ca.json_path_extract("$.b[:].c", None)?); + dbg!(ca.json_path_extract("$.b[:].c", None)?.dtype()); Ok(()) } diff --git a/polars/polars-core/src/chunked_array/strings/json_path.rs b/polars/polars-core/src/chunked_array/strings/json_path.rs index 
3061a10695ce..f11293479728 100644 --- a/polars/polars-core/src/chunked_array/strings/json_path.rs +++ b/polars/polars-core/src/chunked_array/strings/json_path.rs @@ -57,7 +57,8 @@ impl Utf8Chunked { /// When None is passed for the number of rows, all rows are inspected. pub fn json_infer(&self, number_of_rows: Option) -> Result { let values_iter = self - .into_no_null_iter() + .into_iter() + .map(|x| x.unwrap_or("null")) .take(number_of_rows.unwrap_or(self.len())); ndjson::read::infer_iter(values_iter) @@ -69,12 +70,17 @@ impl Utf8Chunked { /// Extracts a JSON value for each row in the Utf8Chunked - pub fn json_deserialize(&self, data_type: DataType) -> Result { + pub fn json_extract(&self, dtype: Option) -> Result { + let dtype = match dtype { + Some(dt) => dt, + None => self.json_infer(None)?, + }; + let iter = self .into_iter() .map(|x| x.unwrap_or("null")); - let array = ndjson::read::deserialize_iter(iter, data_type.to_arrow()) + let array = ndjson::read::deserialize_iter(iter, dtype.to_arrow()) .map_err(|e| PolarsError::ComputeError( format!("error deserializing JSON {:?}", e).into(), ))?; @@ -91,10 +97,8 @@ impl Utf8Chunked { } } - pub fn json_path_extract(&self, json_path: &str) -> Result { + pub fn json_path_extract(&self, json_path: &str, dtype: Option) -> Result { let selected_json = self.json_path_select(json_path)?; - - let data_type = selected_json.json_infer(None)?; - selected_json.json_deserialize(data_type) + selected_json.json_extract(dtype) } } diff --git a/py-polars/polars/internals/series.py b/py-polars/polars/internals/series.py index 858a4bf6350f..e052320a4ba1 100644 --- a/py-polars/polars/internals/series.py +++ b/py-polars/polars/internals/series.py @@ -3971,6 +3971,144 @@ def json_path_match(self, json_path: str) -> Series: """ return wrap_s(self._s.str_json_path_match(json_path)) + def json_infer(self, number_of_rows: Optional[int] = None) -> DataType: + """ + Returns the inferred DataType for JSON values for each row in the Series, + 
with an optional number of rows to inspect. By default, all rows are + inspected. + + Parameters + ---------- + number_of_rows + An integer number of rows to consider for inferring the type + + Returns + ------- + DataType. Contains null if original value is null or values are not JSON format + + Examples + -------- + + >>> s = pl.Series("a", ['{"b": null}', '{"b": 5}', '{"b": 1}', None]) + >>> s.str.json_infer() + Struct[Field("b": )] + + """ + return self._s.str_json_infer(number_of_rows) + + def json_extract(self, dtype: Optional[Type[DataType]] = None) -> Series: + """ + Extracts a JSON value for each row in the Series. If the row does not + contain JSON or is null, the value for that row will be null. The + common data type for all rows is inferred by default, unless an optional + data type is provided. + + Parameters + ---------- + dtype + An optional DataType matching the schema of the JSON + + Returns + ------- + Series. Contains null if original value is null or values are not JSON format + + Examples + -------- + + >>> s = pl.Series("a", ['{"b": null}', '{"b": 5}', '{"b": 1}', None]) + >>> s.str.json_extract() + shape: (4,) + Series: '' [struct[1]] + [ + {null} + {5} + {1} + {null} + ] + + """ + return wrap_s(self._s.str_json_extract(dtype)) + + def json_path_select(self, json_path: str) -> Series: + """ + Selects the JSON fields with provided JsonPath expression, returning a + string representation of the selected fields. + Documentation on JSONPath standard: https://goessner.net/articles/JsonPath/ + + Parameters + ---------- + json_path + A valid JSON path query string + + Returns + ------- + Utf8 array. Contains null if original value is null or the json_path returns nothing. + + Examples + -------- + + >>> df = pl.DataFrame( + ... {"json_val": ['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}']} + ... 
) + >>> df.select(pl.col("json_val").str.json_path_select("$.a"))[:, 0] + shape: (5,) + Series: 'json_val' [str] + [ + "1" + null + "2" + "2.1" + "true" + ] + + """ + return wrap_s(self._s.str_json_path_select(json_path)) + + def json_path_extract(self, json_path: str, dtype: Optional[Type[DataType]] = None) -> Series: + """ + Extracts the JSON fields with provided JsonPath expression, returning an + appropriately typed Series. The data type can optionally be provided to + specify the schema for the JSON fields being extracted, or by default the + data type is inferred by inspecting all of the rows in the Series. + Documentation on JSONPath standard: https://goessner.net/articles/JsonPath/ + + Parameters + ---------- + json_path + A valid JSON path query string + dtype + A DataType for the JSON to be deserialized into + + Returns + ------- + Series. Contains null if original value is null or the json_path returns nothing. + + Examples + -------- + + >>> s = pl.Series("a", ['{"b": null}', '{"b": [5]}', '{"b": [1, 2]}', None]) + >>> s.str.json_path_extract('$.b[0]') + shape: (4,) + Series: '' [i64] + [ + null + 5 + 1 + null + ] + >>> s.str.json_path_extract('$.b[-1]') + shape: (4,) + Series: '' [i64] + [ + null + 5 + 2 + null + ] + + """ + return wrap_s(self._s.str_json_path_extract(json_path, dtype)) + def extract(self, pattern: str, group_index: int = 1) -> Series: r""" Extract the target capture group from provided patterns. 
diff --git a/py-polars/src/series.rs b/py-polars/src/series.rs index 3ab0c3a17e4a..1f800c7dc845 100644 --- a/py-polars/src/series.rs +++ b/py-polars/src/series.rs @@ -1075,6 +1075,41 @@ impl PySeries { Ok(s.into()) } + pub fn str_json_infer(&self, py: Python, number_of_rows: Option) -> PyResult { + let ca = self.series.utf8().map_err(PyPolarsErr::from)?; + let dtype = ca + .json_infer(number_of_rows) + .map_err(PyPolarsErr::from)?; + Ok(Wrap(dtype.clone()).to_object(py)) + } + + pub fn str_json_extract(&self, dtype: Option>) -> PyResult { + let ca = self.series.utf8().map_err(PyPolarsErr::from)?; + let s = ca + .json_extract(dtype.map(|x| x.0)) + .map_err(PyPolarsErr::from)? + .into_series(); + Ok(s.into()) + } + + pub fn str_json_path_select(&self, path: &str) -> PyResult { + let ca = self.series.utf8().map_err(PyPolarsErr::from)?; + let s = ca + .json_path_select(path) + .map_err(PyPolarsErr::from)? + .into_series(); + Ok(s.into()) + } + + pub fn str_json_path_extract(&self, path: &str, dtype: Option>) -> PyResult { + let ca = self.series.utf8().map_err(PyPolarsErr::from)?; + let s = ca + .json_path_extract(path, dtype.map(|x| x.0)) + .map_err(PyPolarsErr::from)? 
+ .into_series(); + Ok(s.into()) + } + pub fn str_extract(&self, pat: &str, group_index: usize) -> PyResult { let ca = self.series.utf8().map_err(PyPolarsErr::from)?; let s = ca From 71e31dcdfb8c86de8e4779137bd0751030ff054f Mon Sep 17 00:00:00 2001 From: Colin Jermain Date: Sun, 22 May 2022 13:21:36 -0400 Subject: [PATCH 7/7] Fixing formatting --- examples/json_path/src/main.rs | 2 +- .../src/chunked_array/strings/json_path.rs | 19 ++++++------------- py-polars/polars/__init__.py | 1 - py-polars/polars/internals/series.py | 8 +++++--- py-polars/src/series.rs | 10 ++++++---- 5 files changed, 18 insertions(+), 22 deletions(-) diff --git a/examples/json_path/src/main.rs b/examples/json_path/src/main.rs index 246b4b5f90fd..dae2accc0fcd 100644 --- a/examples/json_path/src/main.rs +++ b/examples/json_path/src/main.rs @@ -7,7 +7,7 @@ fn main() -> Result<()> { Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#), Some(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#), None, - ] + ], ); let ca = s.utf8()?; diff --git a/polars/polars-core/src/chunked_array/strings/json_path.rs b/polars/polars-core/src/chunked_array/strings/json_path.rs index f11293479728..fd337cb82b34 100644 --- a/polars/polars-core/src/chunked_array/strings/json_path.rs +++ b/polars/polars-core/src/chunked_array/strings/json_path.rs @@ -1,9 +1,8 @@ use crate::prelude::*; +use arrow::io::ndjson; use jsonpath_lib::PathCompiled; use serde_json::Value; use std::borrow::Cow; -use arrow::io::ndjson; - #[cfg(feature = "extract_jsonpath")] fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option> { @@ -63,12 +62,9 @@ impl Utf8Chunked { ndjson::read::infer_iter(values_iter) .map(|d| DataType::from(&d)) - .map_err(|e| PolarsError::ComputeError( - format!("error infering JSON {:?}", e).into(), - )) + .map_err(|e| PolarsError::ComputeError(format!("error infering JSON {:?}", e).into())) } - /// Extracts a JSON value for each row in the Utf8Chunked pub fn json_extract(&self, dtype: Option) -> Result { let dtype = match 
dtype { @@ -76,14 +72,11 @@ impl Utf8Chunked { None => self.json_infer(None)?, }; - let iter = self - .into_iter() - .map(|x| x.unwrap_or("null")); + let iter = self.into_iter().map(|x| x.unwrap_or("null")); - let array = ndjson::read::deserialize_iter(iter, dtype.to_arrow()) - .map_err(|e| PolarsError::ComputeError( - format!("error deserializing JSON {:?}", e).into(), - ))?; + let array = ndjson::read::deserialize_iter(iter, dtype.to_arrow()).map_err(|e| { + PolarsError::ComputeError(format!("error deserializing JSON {:?}", e).into()) + })?; Series::try_from(("", array)) } diff --git a/py-polars/polars/__init__.py b/py-polars/polars/__init__.py index a2e418a7dd72..07de1fd44277 100644 --- a/py-polars/polars/__init__.py +++ b/py-polars/polars/__init__.py @@ -34,7 +34,6 @@ def version() -> str: List, Null, Object, - Field, Struct, Time, UInt8, diff --git a/py-polars/polars/internals/series.py b/py-polars/polars/internals/series.py index e052320a4ba1..d494e232dab3 100644 --- a/py-polars/polars/internals/series.py +++ b/py-polars/polars/internals/series.py @@ -4064,7 +4064,9 @@ def json_path_select(self, json_path: str) -> Series: """ return wrap_s(self._s.str_json_path_select(json_path)) - def json_path_extract(self, json_path: str, dtype: Optional[Type[DataType]] = None) -> Series: + def json_path_extract( + self, json_path: str, dtype: Optional[Type[DataType]] = None + ) -> Series: """ Extracts the JSON fields with provided JsonPath expression, returning an appropriately typed Series. 
The data type can optionally be provided to @@ -4087,7 +4089,7 @@ def json_path_extract(self, json_path: str, dtype: Optional[Type[DataType]] = No -------- >>> s = pl.Series("a", ['{"b": null}', '{"b": [5]}', '{"b": [1, 2]}', None]) - >>> s.str.json_path_extract('$.b[0]') + >>> s.str.json_path_extract("$.b[0]") shape: (4,) Series: '' [i64] [ @@ -4096,7 +4098,7 @@ def json_path_extract(self, json_path: str, dtype: Optional[Type[DataType]] = No 1 null ] - >>> s.str.json_path_extract('$.b[-1]') + >>> s.str.json_path_extract("$.b[-1]") shape: (4,) Series: '' [i64] [ diff --git a/py-polars/src/series.rs b/py-polars/src/series.rs index 1f800c7dc845..2b172ab7e50d 100644 --- a/py-polars/src/series.rs +++ b/py-polars/src/series.rs @@ -1077,9 +1077,7 @@ impl PySeries { pub fn str_json_infer(&self, py: Python, number_of_rows: Option) -> PyResult { let ca = self.series.utf8().map_err(PyPolarsErr::from)?; - let dtype = ca - .json_infer(number_of_rows) - .map_err(PyPolarsErr::from)?; + let dtype = ca.json_infer(number_of_rows).map_err(PyPolarsErr::from)?; Ok(Wrap(dtype.clone()).to_object(py)) } @@ -1101,7 +1099,11 @@ impl PySeries { Ok(s.into()) } - pub fn str_json_path_extract(&self, path: &str, dtype: Option>) -> PyResult { + pub fn str_json_path_extract( + &self, + path: &str, + dtype: Option>, + ) -> PyResult { let ca = self.series.utf8().map_err(PyPolarsErr::from)?; let s = ca .json_path_extract(path, dtype.map(|x| x.0))