From e64cbad5436112090e5f2d065687ee83c95aef86 Mon Sep 17 00:00:00 2001 From: Eric Erhardt Date: Thu, 5 Dec 2019 16:07:23 -0600 Subject: [PATCH] DataFrame.LoadCsv throws an exception on projects targeting < netcore3.0 (#2797) Fixing by passing in an encoding and a default buffer size. Also, get our tests running on .NET Framework. Fix #2783 --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 6 +++-- .../BufferTests.cs | 2 ++ .../DataFrameTests.IDataView.cs | 4 ++-- .../DataFrameTests.cs | 23 ++++++++++--------- .../Microsoft.Data.Analysis.Tests.csproj | 3 ++- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index ed452c140e..b1ff654c27 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -5,12 +5,14 @@ using System; using System.Collections.Generic; using System.IO; +using System.Text; namespace Microsoft.Data.Analysis { - public partial class DataFrame { + private const int DefaultStreamReaderBufferSize = 1024; + private static Type GuessKind(int col, List read) { Type res = typeof(string); @@ -205,7 +207,7 @@ public static DataFrame LoadCsv(Stream csvStream, List columns; long streamStart = csvStream.Position; // First pass: schema and number of rows. - using (var streamReader = new StreamReader(csvStream, encoding: null, detectEncodingFromByteOrderMarks: true, bufferSize: -1, leaveOpen: true)) + using (var streamReader = new StreamReader(csvStream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true)) { string line = null; if (dataTypes == null) diff --git a/tests/Microsoft.Data.Analysis.Tests/BufferTests.cs b/tests/Microsoft.Data.Analysis.Tests/BufferTests.cs index 0b5a500100..67edb50d67 100644 --- a/tests/Microsoft.Data.Analysis.Tests/BufferTests.cs +++ b/tests/Microsoft.Data.Analysis.Tests/BufferTests.cs @@ -188,6 +188,7 @@ public void TestArrowStringColumnClone() Assert.Null(clone[i]); } +#if !NETFRAMEWORK // https://github.com/dotnet/corefxlab/issues/2796 [Fact] public void TestPrimitiveColumnGetReadOnlyBuffers() { @@ -275,5 +276,6 @@ public void TestArrowStringColumnGetReadOnlyBuffers() } } } +#endif //!NETFRAMEWORK } } diff --git a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs index 459c4a47ce..9ed4963b7f 100644 --- a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs +++ b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.IDataView.cs @@ -66,8 +66,8 @@ public void TestIDataView() Assert.Equal((ushort)1, preview.ColumnView[10].Values[1]); Assert.Equal("String", preview.ColumnView[11].Column.Name); - Assert.Equal("0".AsMemory(), preview.ColumnView[11].Values[0]); - Assert.Equal("1".AsMemory(), preview.ColumnView[11].Values[1]); + Assert.Equal("0".ToString(), preview.ColumnView[11].Values[0].ToString()); + Assert.Equal("1".ToString(), preview.ColumnView[11].Values[1].ToString()); Assert.Equal("Char", preview.ColumnView[12].Column.Name); Assert.Equal((ushort)65, preview.ColumnView[12].Values[0]); diff --git a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index f32983acdd..36bce6d754 100644 --- a/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -169,7 +169,7 @@ public DataFrame SplitTrainTest(DataFrame input, float testRatio, out DataFrame { IEnumerable randomIndices = Enumerable.Range(0, (int)input.Rows.Count); IEnumerable trainIndices = randomIndices.Take((int)(input.Rows.Count * testRatio)); - IEnumerable testIndices = randomIndices.TakeLast((int)(input.Rows.Count * (1 - testRatio))); + IEnumerable testIndices = randomIndices.Skip((int)(input.Rows.Count * testRatio)); Test = input[testIndices]; return input[trainIndices]; } @@ -1540,11 +1540,11 @@ public void TestPrefixAndSuffix() DataFrame prefix = df.AddPrefix("Prefix_"); IEnumerable prefixNames = ((IDataView)prefix).Schema; - foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(((IDataView)df).Schema)) + foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(((IDataView)df).Schema, (e1, e2) => (e1, e2))) { Assert.Equal(First.Name, Second.Name); } - foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in prefixNames.Zip(columnNames)) + foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in prefixNames.Zip(columnNames, (e1, e2) => (e1, e2))) { Assert.Equal(First.Name, "Prefix_" + Second.Name); } @@ -1552,18 +1552,18 @@ public void TestPrefixAndSuffix() // Inplace df.AddPrefix("Prefix_", true); prefixNames = ((IDataView)df).Schema; - foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(prefixNames)) + foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(prefixNames, (e1, e2) => (e1, e2))) { Assert.Equal("Prefix_" + First.Name, Second.Name); } DataFrame suffix = df.AddSuffix("_Suffix"); IEnumerable suffixNames = ((IDataView)suffix).Schema; - foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in ((IDataView)df).Schema.Zip(columnNames)) + foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in ((IDataView)df).Schema.Zip(columnNames, (e1, e2) => (e1, e2))) { Assert.Equal(First.Name, "Prefix_" + Second.Name); } - foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(suffixNames)) + foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(suffixNames, (e1, e2) => (e1, e2))) { Assert.Equal("Prefix_" + First.Name + "_Suffix", Second.Name); } @@ -1571,7 +1571,7 @@ public void TestPrefixAndSuffix() // InPlace df.AddSuffix("_Suffix", true); suffixNames = ((IDataView)df).Schema; - foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(suffixNames)) + foreach ((DataViewSchema.Column First, DataViewSchema.Column Second) in columnNames.Zip(suffixNames, (e1, e2) => (e1, e2))) { Assert.Equal("Prefix_" + First.Name + "_Suffix", Second.Name); } @@ -1942,19 +1942,20 @@ public void TestAppendRow() Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(3, df.Columns[1].NullCount); - df.Append(new List> { KeyValuePair.Create("Column1", (object)5), KeyValuePair.Create("Column2", (object)false) }); + df.Append(new Dictionary { { "Column1", (object)5 } , { "Column2", false } }); Assert.Equal(14, df.Rows.Count); Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(3, df.Columns[1].NullCount); - df.Append(new List> { KeyValuePair.Create("Column1", (object)5) }); + df.Append(new Dictionary { { "Column1", 5 } }); Assert.Equal(15, df.Rows.Count); + Assert.Equal(15, df["Column1"].Length); Assert.Equal(15, df["Column2"].Length); Assert.Equal(2, df.Columns[0].NullCount); Assert.Equal(4, df.Columns[1].NullCount); - df.Append(new List> { KeyValuePair.Create("Column2", (object)false) }); + df.Append(new Dictionary { { "Column2", false } }); Assert.Equal(16, df.Rows.Count); Assert.Equal(16, df["Column1"].Length); Assert.Equal(16, df["Column2"].Length); @@ -1970,7 +1971,7 @@ public void TestAppendRow() // DataFrame must remain usable even if Append throws Assert.Throws(() => df.Append(new List { 5, "str" })); - Assert.Throws(() => df.Append(new List> { KeyValuePair.Create("Column2", (object)"str") })); + Assert.Throws(() => df.Append(new Dictionary { { "Column2", "str" } })); Assert.Throws(() => df.Append(new List { 5, true, true })); df.Append(); diff --git a/tests/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj b/tests/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj index fb7ad0c725..f4bdb218b2 100644 --- a/tests/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj +++ b/tests/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj @@ -1,7 +1,8 @@  - netcoreapp3.0 + net461;netcoreapp3.0 + netcoreapp3.0