Skip to content

Commit

Permalink
Browse files: browse the repository at this point in the history
  • Loading branch information
Prashanth Govindarajan committed Apr 28, 2020
1 parent 4cf5e04 commit fe972e5
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 23 deletions.
49 changes: 26 additions & 23 deletions src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -198,55 +198,58 @@ public static DataFrame LoadCsv(Stream csvStream,
Encoding encoding = null)
{
if (!csvStream.CanSeek)
{
throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
}

var linesForGuessType = new List<string[]>();
long rowline = 0;
int numberOfColumns = dataTypes?.Length ?? 0;

if (header == true && numberOfRowsToRead != -1)
{
numberOfRowsToRead++;
}

List<DataFrameColumn> columns;
long streamStart = csvStream.Position;
// First pass: schema and number of rows.
using (var streamReader = new StreamReader(csvStream, encoding ?? Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true))
{
string line = null;
if (dataTypes == null)
line = streamReader.ReadLine();
while (line != null)
{
line = streamReader.ReadLine();
while (line != null)
if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
{
if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
if (linesForGuessType.Count < guessRows)
{
if (linesForGuessType.Count < guessRows)
var spl = line.Split(separator);
if (header && rowline == 0)
{
var spl = line.Split(separator);
if (header && rowline == 0)
{
if (columnNames == null)
columnNames = spl;
}
else
if (columnNames == null)
{
linesForGuessType.Add(spl);
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
columnNames = spl;
}
}
else
{
linesForGuessType.Add(spl);
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
}
}
++rowline;
if (rowline == guessRows)
{
break;
}
line = streamReader.ReadLine();
}

if (linesForGuessType.Count == 0)
++rowline;
if (rowline == guessRows)
{
throw new FormatException(Strings.EmptyFile);
break;
}
line = streamReader.ReadLine();
}

if (linesForGuessType.Count == 0)
{
throw new FormatException(Strings.EmptyFile);
}

columns = new List<DataFrameColumn>(numberOfColumns);
Expand Down
8 changes: 8 additions & 0 deletions tests/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -176,6 +176,14 @@ Stream GetStream(string streamData)
Assert.True(typeof(float) == df.Columns[4].DataType);
Assert.True(typeof(string) == df.Columns[5].DataType);
Assert.True(typeof(double) == df.Columns[6].DataType);

Assert.Equal("vendor_id", df.Columns[0].Name);
Assert.Equal("rate_code", df.Columns[1].Name);
Assert.Equal("passenger_count", df.Columns[2].Name);
Assert.Equal("trip_time_in_secs", df.Columns[3].Name);
Assert.Equal("trip_distance", df.Columns[4].Name);
Assert.Equal("payment_type", df.Columns[5].Name);
Assert.Equal("fare_amount", df.Columns[6].Name);
VerifyColumnTypes(df);

foreach (var column in df.Columns)
Expand Down

0 comments on commit fe972e5

Please sign in to comment.