Skip to content
This repository has been archived by the owner on Aug 2, 2023. It is now read-only.

Commit

Permalink
Fix #2906 (#2907)
Browse files Browse the repository at this point in the history
* Fix #2906

* Improvements and unit tests

* sq

* Better fix

* sq
  • Loading branch information
Prashanth Govindarajan authored Apr 30, 2020
1 parent 32d36a1 commit 300885c
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 23 deletions.
54 changes: 31 additions & 23 deletions src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -198,55 +198,63 @@ public static DataFrame LoadCsv(Stream csvStream,
Encoding encoding = null)
{
if (!csvStream.CanSeek)
{
throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
}

if (dataTypes == null && guessRows <= 0)
{
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
}

var linesForGuessType = new List<string[]>();
long rowline = 0;
int numberOfColumns = dataTypes?.Length ?? 0;

if (header == true && numberOfRowsToRead != -1)
{
numberOfRowsToRead++;
}

List<DataFrameColumn> columns;
long streamStart = csvStream.Position;
// First pass: schema and number of rows.
using (var streamReader = new StreamReader(csvStream, encoding ?? Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true))
{
string line = null;
if (dataTypes == null)
line = streamReader.ReadLine();
while (line != null)
{
line = streamReader.ReadLine();
while (line != null)
if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
{
if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
if (linesForGuessType.Count < guessRows || (header && rowline == 0))
{
if (linesForGuessType.Count < guessRows)
var spl = line.Split(separator);
if (header && rowline == 0)
{
var spl = line.Split(separator);
if (header && rowline == 0)
{
if (columnNames == null)
columnNames = spl;
}
else
if (columnNames == null)
{
linesForGuessType.Add(spl);
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
columnNames = spl;
}
}
else
{
linesForGuessType.Add(spl);
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
}
}
++rowline;
if (rowline == guessRows)
{
break;
}
line = streamReader.ReadLine();
}

if (linesForGuessType.Count == 0)
++rowline;
if (rowline == guessRows || guessRows == 0)
{
throw new FormatException(Strings.EmptyFile);
break;
}
line = streamReader.ReadLine();
}

if (rowline == 0)
{
throw new FormatException(Strings.EmptyFile);
}

columns = new List<DataFrameColumn>(numberOfColumns);
Expand Down
9 changes: 9 additions & 0 deletions src/Microsoft.Data.Analysis/strings.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/Microsoft.Data.Analysis/strings.resx
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@
<data name="ExceedsNumberOfColumns" xml:space="preserve">
<value>Parameter.Count exceeds the number of columns({0}) in the DataFrame </value>
</data>
<data name="ExpectedEitherGuessRowsOrDataTypes" xml:space="preserve">
<value>Expected either {0} or {1} to be provided</value>
</data>
<data name="ImmutableColumn" xml:space="preserve">
<value>Column is immutable</value>
</data>
Expand Down
101 changes: 101 additions & 0 deletions tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,99 @@ Stream GetStream(string streamData)
VerifyColumnTypes(df);
}

// Shared verification for the 4-row, 7-column taxi-fare CSV used by the LoadCsv tests.
// Always checks the row count, the column count, and that no column contains nulls.
//   verifyColumnDataType: when true, also asserts the exact DataType of each column.
//   verifyNames:          when true, also asserts the name of each column.
void VerifyDataFrameWithNamedColumnsAndDataTypes(DataFrame df, bool verifyColumnDataType, bool verifyNames)
{
    Assert.Equal(4, df.Rows.Count);
    Assert.Equal(7, df.Columns.Count);

    if (verifyColumnDataType)
    {
        // Assert.Equal (rather than Assert.True on an == comparison) reports the
        // expected and actual types when the assertion fails.
        Assert.Equal(typeof(string), df.Columns[0].DataType);
        Assert.Equal(typeof(short), df.Columns[1].DataType);
        Assert.Equal(typeof(int), df.Columns[2].DataType);
        Assert.Equal(typeof(long), df.Columns[3].DataType);
        Assert.Equal(typeof(float), df.Columns[4].DataType);
        Assert.Equal(typeof(string), df.Columns[5].DataType);
        Assert.Equal(typeof(double), df.Columns[6].DataType);
    }

    if (verifyNames)
    {
        Assert.Equal("vendor_id", df.Columns[0].Name);
        Assert.Equal("rate_code", df.Columns[1].Name);
        Assert.Equal("passenger_count", df.Columns[2].Name);
        Assert.Equal("trip_time_in_secs", df.Columns[3].Name);
        Assert.Equal("trip_distance", df.Columns[4].Name);
        Assert.Equal("payment_type", df.Columns[5].Name);
        Assert.Equal("fare_amount", df.Columns[6].Name);
    }

    VerifyColumnTypes(df);

    // Every cell in the sample data is populated, so no column may report nulls.
    foreach (var column in df.Columns)
    {
        Assert.Equal(0, column.NullCount);
    }
}

// Exercises DataFrame.LoadCsv across combinations of header, guessRows and dataTypes.
// With explicit dataTypes the load must succeed for every guessRows value.
// Without dataTypes, a non-positive guessRows must be rejected with an ArgumentException.
[Theory]
[InlineData(true, 0)]
[InlineData(false, 0)]
[InlineData(true, 10)]
[InlineData(false, 10)]
public void TestReadCsvWithTypesAndGuessRows(bool header, int guessRows)
{
    /* Tests this matrix
     *
        header      GuessRows   DataTypes
        True        0           NotNull
        False       0           NotNull
        True        10          NotNull
        False       10          NotNull
        True        0           Null        -----> Throws an exception
        False       0           Null        -----> Throws an exception
        True        10          Null
        False       10          Null
     *
     */
    string headerLine = @"vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount
";
    string dataLines =
@"CMT,1,1,1271,3.8,CRD,17.5
CMT,1,1,474,1.5,CRD,8
CMT,1,1,637,1.4,CRD,8.5
CMT,1,1,181,0.6,CSH,4.5";

    // Encode as UTF-8 so the bytes match the encoding LoadCsv falls back to when
    // no encoding argument is passed; Encoding.Default varies by platform.
    Stream GetStream(string streamData)
    {
        return new MemoryStream(Encoding.UTF8.GetBytes(streamData));
    }

    string data = header ? headerLine + dataLines : dataLines;
    Type[] dataTypes = { typeof(string), typeof(short), typeof(int), typeof(long), typeof(float), typeof(string), typeof(double) };

    // Explicit dataTypes: loading must succeed whatever guessRows is.
    using (Stream typedStream = GetStream(data))
    {
        DataFrame df = DataFrame.LoadCsv(typedStream,
            header: header,
            guessRows: guessRows,
            dataTypes: dataTypes
        );
        VerifyDataFrameWithNamedColumnsAndDataTypes(df, verifyColumnDataType: true, verifyNames: header);
    }

    if (guessRows > 0)
    {
        // No dataTypes, but rows are available for type guessing: the load succeeds.
        using (Stream guessedStream = GetStream(data))
        {
            DataFrame df = DataFrame.LoadCsv(guessedStream,
                header: header,
                guessRows: guessRows
            );
            VerifyDataFrameWithNamedColumnsAndDataTypes(df, verifyColumnDataType: false, verifyNames: header);
        }
    }
    else
    {
        // Neither dataTypes nor a positive guessRows: there is nothing to infer a schema from.
        using (Stream invalidStream = GetStream(data))
        {
            Assert.ThrowsAny<ArgumentException>(() => DataFrame.LoadCsv(invalidStream,
                header: header,
                guessRows: guessRows
            ));
        }
    }
}

[Fact]
public void TestReadCsvWithTypes()
{
Expand All @@ -176,6 +269,14 @@ Stream GetStream(string streamData)
Assert.True(typeof(float) == df.Columns[4].DataType);
Assert.True(typeof(string) == df.Columns[5].DataType);
Assert.True(typeof(double) == df.Columns[6].DataType);

Assert.Equal("vendor_id", df.Columns[0].Name);
Assert.Equal("rate_code", df.Columns[1].Name);
Assert.Equal("passenger_count", df.Columns[2].Name);
Assert.Equal("trip_time_in_secs", df.Columns[3].Name);
Assert.Equal("trip_distance", df.Columns[4].Name);
Assert.Equal("payment_type", df.Columns[5].Name);
Assert.Equal("fare_amount", df.Columns[6].Name);
VerifyColumnTypes(df);

foreach (var column in df.Columns)
Expand Down

0 comments on commit 300885c

Please sign in to comment.