From 72b65508fffd9392d130748cc9b6f41121bd8cd6 Mon Sep 17 00:00:00 2001 From: ac <4184070+MrCurtis@users.noreply.github.com> Date: Thu, 10 Aug 2023 16:39:10 +0000 Subject: [PATCH 1/7] Add tests for quoted comma case --- vcf/src/headers.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/vcf/src/headers.rs b/vcf/src/headers.rs index b4d086f..4e8f292 100644 --- a/vcf/src/headers.rs +++ b/vcf/src/headers.rs @@ -48,4 +48,44 @@ mod tests { ], ); } + + #[test] + fn can_parse_when_quoted_text_contains_comma_in_last_key_value_pair() { + let input = "##FORMAT="; + let header = Header::parse(input); + + assert_eq!( + header, + Ok( + Header { + key: "FORMAT", + value: HeaderValue::Nested(HashMap::from([ + ("abc", "123"), + ("xyz", "3125"), + ("sfh", "1,574"), + ])), + } + ) + ); + } + + #[test] + fn can_parse_when_quoted_text_contains_comma_in_first_key_value_pair() { + let input = "##FORMAT="; + let header = Header::parse(input); + + assert_eq!( + header, + Ok( + Header { + key: "FORMAT", + value: HeaderValue::Nested(HashMap::from([ + ("abc", "1,233"), + ("xyz", "3125"), + ("sfh", "157"), + ])), + } + ) + ); + } } From 8d5597ca47eecd364dfb7862451d7d44750e6fe9 Mon Sep 17 00:00:00 2001 From: ac <4184070+MrCurtis@users.noreply.github.com> Date: Tue, 15 Aug 2023 20:58:47 +0000 Subject: [PATCH 2/7] Add regex dependency --- vcf/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/vcf/Cargo.toml b/vcf/Cargo.toml index 9c16fd4..85c4f0c 100644 --- a/vcf/Cargo.toml +++ b/vcf/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +regex = "1.9.3" From 1b5bba53ee47cfef53880dfbc2dcfe45fdcb9455 Mon Sep 17 00:00:00 2001 From: ac <4184070+MrCurtis@users.noreply.github.com> Date: Tue, 15 Aug 2023 20:07:29 +0000 Subject: [PATCH 3/7] Split using regex --- vcf/src/parse.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vcf/src/parse.rs b/vcf/src/parse.rs index 786ddd3..cf5685a 100644 --- a/vcf/src/parse.rs +++ b/vcf/src/parse.rs @@ -1,5 +1,7 @@ use std::collections::HashMap; +use regex::Regex; + use crate::{Header, HeaderValue}; impl<'src> Header<'src> { @@ -15,10 +17,12 @@ impl<'src> Header<'src> { impl<'src> HeaderValue<'src> { pub fn parse(input: &'src str) -> Result { + let re = Regex::new(r#"((?:[^,"]+|(?:"[^"]*"))+)"#).unwrap(); match input.strip_prefix('<').and_then(|input| input.strip_suffix('>')) { None => Ok(Self::Flat(input)), Some(pairs) => { - pairs.split(',') + re.captures_iter(pairs) + .map(|c| c.get(0).unwrap().as_str()) .map(|pair| pair.split_once('=').ok_or(ParseError)) .collect::, _>>() .map(HeaderValue::Nested) From cf12c1dcd81602636ed1ed7ac770e8ff309ace86 Mon Sep 17 00:00:00 2001 From: ac <4184070+MrCurtis@users.noreply.github.com> Date: Wed, 16 Aug 2023 12:33:36 +0000 Subject: [PATCH 4/7] Remove unnecessary capture from regex --- vcf/src/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf/src/parse.rs b/vcf/src/parse.rs index cf5685a..89af9a6 100644 --- a/vcf/src/parse.rs +++ b/vcf/src/parse.rs @@ -17,7 +17,7 @@ impl<'src> Header<'src> { impl<'src> HeaderValue<'src> { pub fn parse(input: &'src str) -> Result { - let re = Regex::new(r#"((?:[^,"]+|(?:"[^"]*"))+)"#).unwrap(); + let re = Regex::new(r#"(?:[^,"]+|(?:"[^"]*"))+"#).unwrap(); match input.strip_prefix('<').and_then(|input| input.strip_suffix('>')) { None => Ok(Self::Flat(input)), Some(pairs) => { From a8164a730e2fe9987431c425a721e41aaf970635 Mon Sep 17 00:00:00 2001 From: ac <4184070+MrCurtis@users.noreply.github.com> Date: Tue, 15 Aug 2023 21:00:41 +0000 Subject: [PATCH 5/7] Trim off quotes --- vcf/src/parse.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vcf/src/parse.rs b/vcf/src/parse.rs index 89af9a6..bb45be1 100644 --- a/vcf/src/parse.rs +++ b/vcf/src/parse.rs @@ -24,6 +24,12 @@ impl<'src> HeaderValue<'src> { re.captures_iter(pairs) .map(|c| c.get(0).unwrap().as_str()) .map(|pair| pair.split_once('=').ok_or(ParseError)) + .map( + |r| match r { + Ok((k, v)) => Ok((k, v.trim_matches('\"'))), + x => x, + } + ) .collect::, _>>() .map(HeaderValue::Nested) } From 9f378132a6724b648abb8d2c111cad99a4f768c6 Mon Sep 17 00:00:00 2001 From: ac <4184070+MrCurtis@users.noreply.github.com> Date: Wed, 16 Aug 2023 12:32:50 +0000 Subject: [PATCH 6/7] Add explanation for regex --- vcf/src/parse.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vcf/src/parse.rs b/vcf/src/parse.rs index bb45be1..c9b0fce 100644 --- a/vcf/src/parse.rs +++ b/vcf/src/parse.rs @@ -17,6 +17,9 @@ impl<'src> Header<'src> { impl<'src> HeaderValue<'src> { pub fn parse(input: &'src str) -> Result { + // Repeatedly match either non-comma/non-quote characters or blocks of text enclosed in + // quotes, until we can't, in which case we're either at a non-quote-enclosed comma or the + // end of the string. let re = Regex::new(r#"(?:[^,"]+|(?:"[^"]*"))+"#).unwrap(); match input.strip_prefix('<').and_then(|input| input.strip_suffix('>')) { None => Ok(Self::Flat(input)), From 41431891e29611da6be4cf30ae897be59278e6bf Mon Sep 17 00:00:00 2001 From: ac <4184070+MrCurtis@users.noreply.github.com> Date: Wed, 16 Aug 2023 18:46:59 +0000 Subject: [PATCH 7/7] Make regex static ref. This avoids repeatedly compiling the regex each call to parse. --- vcf/Cargo.toml | 1 + vcf/src/parse.rs | 14 +++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/vcf/Cargo.toml b/vcf/Cargo.toml index 85c4f0c..a26ba6e 100644 --- a/vcf/Cargo.toml +++ b/vcf/Cargo.toml @@ -7,3 +7,4 @@ edition = "2021" [dependencies] regex = "1.9.3" +lazy_static = "1.4.0" diff --git a/vcf/src/parse.rs b/vcf/src/parse.rs index c9b0fce..dfa33dd 100644 --- a/vcf/src/parse.rs +++ b/vcf/src/parse.rs @@ -1,9 +1,17 @@ use std::collections::HashMap; use regex::Regex; +use lazy_static::lazy_static; use crate::{Header, HeaderValue}; +lazy_static! { + // Repeatedly match either non-comma/non-quote characters or blocks of text enclosed in + // quotes, until we can't, in which case we're either at a non-quote-enclosed comma or the + // end of the string. + static ref HEADER_VALUE_REGEX: Regex = Regex::new(r#"(?:[^,"]+|(?:"[^"]*"))+"#).unwrap(); +} + impl<'src> Header<'src> { pub fn parse(input: &'src str) -> Result { let line = input.trim(); @@ -17,14 +25,10 @@ impl<'src> Header<'src> { impl<'src> HeaderValue<'src> { pub fn parse(input: &'src str) -> Result { - // Repeatedly match either non-comma/non-quote characters or blocks of text enclosed in - // quotes, until we can't, in which case we're either at a non-quote-enclosed comma or the - // end of the string. - let re = Regex::new(r#"(?:[^,"]+|(?:"[^"]*"))+"#).unwrap(); match input.strip_prefix('<').and_then(|input| input.strip_suffix('>')) { None => Ok(Self::Flat(input)), Some(pairs) => { - re.captures_iter(pairs) + HEADER_VALUE_REGEX.captures_iter(pairs) .map(|c| c.get(0).unwrap().as_str()) .map(|pair| pair.split_once('=').ok_or(ParseError)) .map(