From d2559b8387725a7429650bf543ef090e14da7d88 Mon Sep 17 00:00:00 2001
From: John Kerl
Date: Mon, 1 Jan 2024 16:39:27 -0700
Subject: [PATCH] Have `clean_whitespace` re-run type inference (#1464)

* Have `clean_whitespace` re-infer types
* make dev output
* unit-test files
* drive-by typofix
* make dev output
---
 docs/src/manpage.md | 4 ++--
 docs/src/manpage.txt | 4 ++--
 docs/src/reference-dsl-builtin-functions.md | 4 ++--
 man/manpage.txt | 4 ++--
 man/mlr.1 | 4 ++--
 pkg/bifs/strings.go | 3 ++-
 pkg/dsl/cst/builtin_function_manager.go | 4 ++--
 test/cases/dsl-clean-whitespace/0010/cmd | 1 +
 test/cases/dsl-clean-whitespace/0010/experr | 0
 test/cases/dsl-clean-whitespace/0010/expout | 18 ++++++++++++++++++
 test/cases/dsl-clean-whitespace/0010/input.csv | 3 +++
 test/cases/dsl-clean-whitespace/0010/mlr | 2 ++
 12 files changed, 38 insertions(+), 13 deletions(-)
 create mode 100644 test/cases/dsl-clean-whitespace/0010/cmd
 create mode 100644 test/cases/dsl-clean-whitespace/0010/experr
 create mode 100644 test/cases/dsl-clean-whitespace/0010/expout
 create mode 100644 test/cases/dsl-clean-whitespace/0010/input.csv
 create mode 100644 test/cases/dsl-clean-whitespace/0010/mlr

diff --git a/docs/src/manpage.md b/docs/src/manpage.md
index c4a65ea40a..8d25329699 100644
--- a/docs/src/manpage.md
+++ b/docs/src/manpage.md
@@ -2312,7 +2312,7 @@ MILLER(1)                                                            MILLER(1)
         (class=math #args=1) Ceiling: nearest integer at or above.
 
    1mclean_whitespace0m
-        (class=string #args=1) Same as collapse_whitespace and strip.
+        (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.
 
    1mcollapse_whitespace0m
         (class=string #args=1) Strip repeated whitespace from string.
@@ -3011,7 +3011,7 @@ MILLER(1)                                                            MILLER(1)
        strmatch(12345, "34") is true
 
    1mstrmatchx0m
-        (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in constrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
+        (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
        Examples:
        strmatchx("a", "abc") returns:
          {
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
index fe77c56722..915a1b7274 100644
--- a/docs/src/manpage.txt
+++ b/docs/src/manpage.txt
@@ -2291,7 +2291,7 @@ MILLER(1)                                                            MILLER(1)
         (class=math #args=1) Ceiling: nearest integer at or above.
 
    1mclean_whitespace0m
-        (class=string #args=1) Same as collapse_whitespace and strip.
+        (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.
 
    1mcollapse_whitespace0m
         (class=string #args=1) Strip repeated whitespace from string.
@@ -2990,7 +2990,7 @@ MILLER(1)                                                            MILLER(1)
        strmatch(12345, "34") is true
 
    1mstrmatchx0m
-        (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in constrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
+        (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
        Examples:
        strmatchx("a", "abc") returns:
          {
diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md
index f3b8efdef4..3a55821f3f 100644
--- a/docs/src/reference-dsl-builtin-functions.md
+++ b/docs/src/reference-dsl-builtin-functions.md
@@ -1209,7 +1209,7 @@ capitalize  (class=string #args=1) Convert string's first character to uppercase
 ### clean_whitespace
-clean_whitespace  (class=string #args=1) Same as collapse_whitespace and strip.
+clean_whitespace  (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.
 
@@ -1364,7 +1364,7 @@ strmatch(12345, "34") is true
 ### strmatchx
-strmatchx  (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in constrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
+strmatchx  (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
 Examples:
 strmatchx("a", "abc") returns:
   {
diff --git a/man/manpage.txt b/man/manpage.txt
index fe77c56722..915a1b7274 100644
--- a/man/manpage.txt
+++ b/man/manpage.txt
@@ -2291,7 +2291,7 @@ MILLER(1)                                                            MILLER(1)
         (class=math #args=1) Ceiling: nearest integer at or above.
 
    1mclean_whitespace0m
-        (class=string #args=1) Same as collapse_whitespace and strip.
+        (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.
 
    1mcollapse_whitespace0m
         (class=string #args=1) Strip repeated whitespace from string.
@@ -2990,7 +2990,7 @@ MILLER(1)                                                            MILLER(1)
        strmatch(12345, "34") is true
 
    1mstrmatchx0m
-        (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in constrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
+        (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \1 through \9, an arbitrary number are supported here.
        Examples:
        strmatchx("a", "abc") returns:
          {
diff --git a/man/mlr.1 b/man/mlr.1
index c6c5c540f2..28940393c9 100644
--- a/man/mlr.1
+++ b/man/mlr.1
@@ -3100,7 +3100,7 @@ Map example: apply({"a":1, "b":3, "c":5}, func(k,v) {return {toupper(k): v ** 2}
 .RS 0
 .\}
 .nf
- (class=string #args=1) Same as collapse_whitespace and strip.
+ (class=string #args=1) Same as collapse_whitespace and strip, followed by type inference.
 .fi
 .if n \{\
 .RE
@@ -4675,7 +4675,7 @@ strmatch(12345, "34") is true
 .RS 0
 .\}
 .nf
- (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \e1, \e2, etc. are not set, in constrast to the `=~` operator. As well, while the `=~` operator limits matches to \e1 through \e9, an arbitrary number are supported here.
+ (class=string #args=2) Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \e1, \e2, etc. are not set, in contrast to the `=~` operator. As well, while the `=~` operator limits matches to \e1 through \e9, an arbitrary number are supported here.
 Examples:
 strmatchx("a", "abc") returns:
   {
diff --git a/pkg/bifs/strings.go b/pkg/bifs/strings.go
index cd68ee4808..e77de7c684 100644
--- a/pkg/bifs/strings.go
+++ b/pkg/bifs/strings.go
@@ -344,11 +344,12 @@ func BIF_capitalize(input1 *mlrval.Mlrval) *mlrval.Mlrval {
 
 // ----------------------------------------------------------------
 func BIF_clean_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval {
-	return BIF_strip(
+	mv := BIF_strip(
 		BIF_collapse_whitespace_regexp(
 			input1, _whitespace_regexp,
 		),
 	)
+	return mlrval.FromInferredType(mv.String())
 }
 
 // ================================================================
diff --git a/pkg/dsl/cst/builtin_function_manager.go b/pkg/dsl/cst/builtin_function_manager.go
index c55f9edd9c..965c9529bf 100644
--- a/pkg/dsl/cst/builtin_function_manager.go
+++ b/pkg/dsl/cst/builtin_function_manager.go
@@ -355,7 +355,7 @@ used within subsequent DSL statements. See also "Regular expressions" at ` + lib
 		{
 			name:  "strmatchx",
 			class: FUNC_CLASS_STRING,
-			help:  `Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in constrast to the ` + "`=~` operator. As well, while the `=~` operator limits matches to \\1 through \\9, an arbitrary number are supported here.",
+			help:  `Extended information for whether the stringable first argument matches the regular-expression second argument. Regex captures are provided in the return-value map; \1, \2, etc. are not set, in contrast to the ` + "`=~` operator. As well, while the `=~` operator limits matches to \\1 through \\9, an arbitrary number are supported here.",
 			examples: []string{
 				`strmatchx("a", "abc") returns:`,
 				`  {`,
@@ -444,7 +444,7 @@ used within subsequent DSL statements. See also "Regular expressions" at ` + lib
 		{
 			name:      "clean_whitespace",
 			class:     FUNC_CLASS_STRING,
-			help:      "Same as collapse_whitespace and strip.",
+			help:      "Same as collapse_whitespace and strip, followed by type inference.",
 			unaryFunc: bifs.BIF_clean_whitespace,
 		},
 
diff --git a/test/cases/dsl-clean-whitespace/0010/cmd b/test/cases/dsl-clean-whitespace/0010/cmd
new file mode 100644
index 0000000000..2fd915d023
--- /dev/null
+++ b/test/cases/dsl-clean-whitespace/0010/cmd
@@ -0,0 +1 @@
+mlr --icsv --ojson clean-whitespace then put -f ${CASEDIR}/mlr ${CASEDIR}/input.csv
diff --git a/test/cases/dsl-clean-whitespace/0010/experr b/test/cases/dsl-clean-whitespace/0010/experr
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/cases/dsl-clean-whitespace/0010/expout b/test/cases/dsl-clean-whitespace/0010/expout
new file mode 100644
index 0000000000..db3fe878d9
--- /dev/null
+++ b/test/cases/dsl-clean-whitespace/0010/expout
@@ -0,0 +1,18 @@
+[
+{
+  "a": 1,
+  "b": 2,
+  "c": 3,
+  "d": 4,
+  "e": 9,
+  "t": "int"
+},
+{
+  "a": 5,
+  "b": 6,
+  "c": 7,
+  "d": 8,
+  "e": 13,
+  "t": "int"
+}
+]
diff --git a/test/cases/dsl-clean-whitespace/0010/input.csv b/test/cases/dsl-clean-whitespace/0010/input.csv
new file mode 100644
index 0000000000..4320372396
--- /dev/null
+++ b/test/cases/dsl-clean-whitespace/0010/input.csv
@@ -0,0 +1,3 @@
+a, b, c, d
+1, 2, 3, 4
+5, 6, 7, 8
diff --git a/test/cases/dsl-clean-whitespace/0010/mlr b/test/cases/dsl-clean-whitespace/0010/mlr
new file mode 100644
index 0000000000..e51c30c8b6
--- /dev/null
+++ b/test/cases/dsl-clean-whitespace/0010/mlr
@@ -0,0 +1,2 @@
+$e = $d + 5;
+$t = typeof($d)