From 1a669ad7f817b142873df5112fdf061b5052c4b5 Mon Sep 17 00:00:00 2001 From: Michael Sharp Date: Thu, 7 May 2020 12:59:02 -0700 Subject: [PATCH 1/4] added in standard conversions from types to ReadOnlyMemory --- src/Microsoft.ML.Data/Data/Conversion.cs | 33 +++++++++++++++++++ .../Transformers/ConvertTests.cs | 19 +++++++++++ 2 files changed, 52 insertions(+) diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 82de43bb48..6c9d2b5dbe 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -114,6 +114,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -123,6 +124,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -132,6 +134,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -141,6 +144,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -151,6 +155,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -161,6 +166,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -171,6 +177,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -181,6 +188,7 @@ private Conversions() AddStd(Convert); AddAux(Convert); AddStd(Convert); + AddStd(Convert); AddStd(Convert); AddStd(Convert); @@ -188,16 +196,19 @@ private Conversions() AddStd(Convert); // REVIEW: Conversion from UG to R4/R8, should we? 
#region ToTX
// Conversions from primitive values to text (TX). Every overload formats
// src as a string and stores that string's memory in dst.

// Integer types: formatted with the parameterless ToString().
public void Convert(in I1 src, ref TX dst) => dst = src.ToString().AsMemory();
public void Convert(in I2 src, ref TX dst) => dst = src.ToString().AsMemory();
public void Convert(in I4 src, ref TX dst) => dst = src.ToString().AsMemory();
public void Convert(in I8 src, ref TX dst) => dst = src.ToString().AsMemory();
public void Convert(in U1 src, ref TX dst) => dst = src.ToString().AsMemory();
public void Convert(in U2 src, ref TX dst) => dst = src.ToString().AsMemory();
public void Convert(in U4 src, ref TX dst) => dst = src.ToString().AsMemory();
public void Convert(in U8 src, ref TX dst) => dst = src.ToString().AsMemory();

// UG: "0x" followed by the High and Low parts, each as 16 lowercase hex digits.
public void Convert(in UG src, ref TX dst) => dst = string.Format("0x{0:x16}{1:x16}", src.High, src.Low).AsMemory();

// Floating point: fixed precision (G7 / G17) with the invariant culture, so the
// text does not vary with the runtime version or the current culture.
public void Convert(in R4 src, ref TX dst) => dst = src.ToString("G7", System.Globalization.CultureInfo.InvariantCulture).AsMemory();
public void Convert(in R8 src, ref TX dst) => dst = src.ToString("G17", System.Globalization.CultureInfo.InvariantCulture).AsMemory();

// BL: parameterless ToString().
public void Convert(in BL src, ref TX dst) => dst = src.ToString().AsMemory();

// TS uses the "c" format; DT and DZ use the "o" format.
public void Convert(in TS src, ref TX dst) => dst = string.Format("{0:c}", src).AsMemory();
// The result must be assigned to dst: without "dst =" the formatted text is
// computed and then discarded, leaving dst unchanged.
public void Convert(in DT src, ref TX dst) => dst = string.Format("{0:o}", src).AsMemory();
public void Convert(in DZ src, ref TX dst) => dst = string.Format("{0:o}", src).AsMemory();
#endregion ToTX
+ var expectedValuesDataView = ML.Data.LoadFromEnumerable(expectedValuesData); + + CheckSameValues(expectedValuesDataView, convertedValues); + TestEstimatorCore(allInputTypesDataPipe, allInputTypesDataView); + Done(); } From 66c21a60e65ddf0fc248e504a7dbf0a2b4edb8b2 Mon Sep 17 00:00:00 2001 From: Michael Sharp Date: Fri, 8 May 2020 14:41:09 -0700 Subject: [PATCH 2/4] fixed issues with differences in tostring of .netcore 3 --- src/Microsoft.ML.Data/Data/Conversion.cs | 4 ++-- test/Microsoft.ML.Tests/Transformers/ConvertTests.cs | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Data/Data/Conversion.cs b/src/Microsoft.ML.Data/Data/Conversion.cs index 6c9d2b5dbe..ead72b3adb 100644 --- a/src/Microsoft.ML.Data/Data/Conversion.cs +++ b/src/Microsoft.ML.Data/Data/Conversion.cs @@ -937,8 +937,8 @@ public void Convert(in BL src, ref SB dst) public void Convert(in U4 src, ref TX dst) => dst = src.ToString().AsMemory(); public void Convert(in U8 src, ref TX dst) => dst = src.ToString().AsMemory(); public void Convert(in UG src, ref TX dst) => dst = string.Format("0x{0:x16}{1:x16}", src.High, src.Low).AsMemory(); - public void Convert(in R4 src, ref TX dst) => dst = src.ToString().AsMemory(); - public void Convert(in R8 src, ref TX dst) => dst = src.ToString().AsMemory(); + public void Convert(in R4 src, ref TX dst) => dst = src.ToString("G7", CultureInfo.InvariantCulture).AsMemory(); + public void Convert(in R8 src, ref TX dst) => dst = src.ToString("G17", CultureInfo.InvariantCulture).AsMemory(); public void Convert(in BL src, ref TX dst) => dst = src.ToString().AsMemory(); public void Convert(in TS src, ref TX dst) => dst = string.Format("{0:c}", src).AsMemory(); public void Convert(in DT src, ref TX dst) => string.Format("{0:o}", src).AsMemory(); diff --git a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs index c116612f6c..c7731b22d0 100644 --- 
a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs @@ -121,7 +121,7 @@ private sealed class TestStringClass public string A; } - [Fact] + [Fact, TestCategory("RunSpecificTest")] public void TestConvertWorkout() { var data = new[] { new TestClass() { A = 1, B = new int[2] { 1,4 } }, @@ -260,8 +260,9 @@ public void TestConvertWorkout() }); var convertedValues = allInputTypesDataPipe.Fit(allInputTypesDataView).Transform(allInputTypesDataView); + var expectedValuesData = new[] { new { A = (sbyte)sbyte.MinValue, B = (byte)byte.MinValue, C = double.MaxValue, D = float.MinValue, E = "already a string", F = false, - A1 = "-128", B1 = "0", C1 = "1.79769313486232E+308", D1 = "-3.402823E+38", E1 = "already a string", F1 = "False" } }; + A1 = "-128", B1 = "0", C1 = "1.7976931348623157E+308", D1 = "-3.402823E+38", E1 = "already a string", F1 = "False" } }; var expectedValuesDataView = ML.Data.LoadFromEnumerable(expectedValuesData); CheckSameValues(expectedValuesDataView, convertedValues); From decb5ff825e05fabaa3432af691bd89ba88c8718 Mon Sep 17 00:00:00 2001 From: Michael Sharp Date: Mon, 11 May 2020 14:40:02 -0700 Subject: [PATCH 3/4] removing RunSpecificTest test attribute --- test/Microsoft.ML.Tests/Transformers/ConvertTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs index c7731b22d0..6ef9214866 100644 --- a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs @@ -121,7 +121,7 @@ private sealed class TestStringClass public string A; } - [Fact, TestCategory("RunSpecificTest")] + [Fact] public void TestConvertWorkout() { var data = new[] { new TestClass() { A = 1, B = new int[2] { 1,4 } }, From efcdab0e29c3e0a33966c80c3c6b49c14bbe8dcd Mon Sep 17 00:00:00 2001 From: Michael Sharp Date: Mon, 11 May 2020 16:42:16 -0700 Subject: 
[PATCH 4/4] added comments into documentation about type changes --- docs/code/IDataViewTypeSystem.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/code/IDataViewTypeSystem.md b/docs/code/IDataViewTypeSystem.md index 76e32c22ca..cabf3dd54e 100644 --- a/docs/code/IDataViewTypeSystem.md +++ b/docs/code/IDataViewTypeSystem.md @@ -288,7 +288,7 @@ true/false values. The `BooleanDataViewType` class derives from The default value of `BL` is `false`, and it has no `NA` value. -There is a standard conversion from `TX` to `BL`. There are standard +There are standard conversions from `TX` to `BL`, and from `BL` to `TX`. There are standard conversions from `BL` to all signed integer and floating point numeric types, with `false` mapping to zero and `true` mapping to one. @@ -332,7 +332,8 @@ values being the canonical `NA` values. There are standard conversions from each floating-point type to the other floating-point type. There are also standard conversions from text to each -floating-point type and from each integer type to each floating-point type. +floating-point type, from each floating-point type to text, and from each +integer type to each floating-point type. ### Signed Integer Types @@ -342,8 +343,8 @@ default value of each of these is zero. There are standard conversions from each signed integer type to every other signed integer type. There are also standard conversions from text to each -signed integer type and from each signed integer type to each floating-point -type. +signed integer type, from each signed integer type to text, and from each +signed integer type to each floating-point type. Note that we have not defined standard conversions from floating-point types to signed integer types. @@ -357,8 +358,8 @@ have an `NA` value. There are standard conversions from each unsigned integer type to every other unsigned integer type. 
There are also standard conversions from text to each -unsigned integer type and from each unsigned integer type to each floating- -point type. +unsigned integer type, from each unsigned integer type to text, and from each unsigned +integer type to each floating-point type. Note that we have not defined standard conversions from floating-point types to unsigned integer types, or between signed integer types and unsigned @@ -541,6 +542,13 @@ case, it is simple to map implicit items (suppressed due to sparsity) to zero. In the former case, these items are first mapped to the empty text value. To get the same result, we need empty text to map to zero. +### To Text + +There are standard conversions to `TX` from the standard primitive types, +`R4`, `R8`, `I1`, `I2`, `I4`, `I8`, `U1`, `U2`, `U4`, `U8`, `BL`, `TS`, `DT`, and `DZ`. +`R4` uses the G7 format and `R8` uses the G17 format. `BL` converts to "True" or "False". +`TS` uses the "c" format specifier (`{0:c}`). `DT` and `DZ` use the "o" format specifier (`{0:o}`). + ### Floating Point There are standard conversions from `R4` to `R8` and from `R8` to `R4`. These