Skip to content

Commit

Permalink
Append rows to a DataFrame (dotnet#2823)
Browse files Browse the repository at this point in the history
* Append rows to a DataFrame

* Unit test

* Update unit tests and doc

* Need to perform a type check every time

* sq

* Update unit test

* Address comments
  • Loading branch information
Prashanth Govindarajan authored and msftbot[bot] committed Jan 28, 2020
1 parent 70bb9e9 commit 82c315f
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 60 deletions.
2 changes: 1 addition & 1 deletion src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ public static DataFrame LoadCsv(Stream csvStream,
}
else
{
ret.Append(spl);
ret.Append(spl, inPlace: true);
}
++rowline;
line = streamReader.ReadLine();
Expand Down
112 changes: 68 additions & 44 deletions src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -424,56 +424,76 @@ private void ResizeByOneAndAppend(DataFrameColumn column, object value)
}

/// <summary>
/// Appends a row inplace to the DataFrame
/// Appends rows to the DataFrame
/// </summary>
/// <remarks>If an input column's value doesn't match a DataFrameColumn's data type, a conversion will be attempted</remarks>
/// <remarks>If a <see cref="DataFrameRow"/> in <paramref name="rows"/> is null, a null value is appended to each column</remarks>
/// <param name="rows">The rows to be appended to this DataFrame </param>
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
/// <returns>This DataFrame when <paramref name="inPlace"/> is set; otherwise a clone of it with <paramref name="rows"/> appended</returns>
/// <exception cref="ArgumentNullException"><paramref name="rows"/> is null</exception>
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
{
    // Validate before cloning: a null argument should neither cost a copy of the
    // DataFrame nor surface as a NullReferenceException from the foreach below.
    if (rows == null)
    {
        throw new ArgumentNullException(nameof(rows));
    }

    DataFrame ret = inPlace ? this : Clone();
    foreach (DataFrameRow row in rows)
    {
        // "ret" is either this DataFrame or the clone made above, so each row is
        // appended to it in place rather than cloning again per row.
        ret.Append(row, inPlace: true);
    }
    return ret;
}

/// <summary>
/// Appends a row to the DataFrame
/// </summary>
/// <remarks>If a column's value doesn't match its column's data type, a conversion will be attempted</remarks>
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
/// <param name="row"></param>
public void Append(IEnumerable<object> row = null)
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
{
IEnumerator<DataFrameColumn> columnEnumerator = Columns.GetEnumerator();
DataFrame ret = inPlace ? this : Clone();
IEnumerator<DataFrameColumn> columnEnumerator = ret.Columns.GetEnumerator();
bool columnMoveNext = columnEnumerator.MoveNext();
if (row != null)
{
// Go through row first to make sure there are no data type incompatibilities
IEnumerator<object> rowEnumerator = row.GetEnumerator();
bool rowMoveNext = rowEnumerator.MoveNext();
List<object> cachedObjectConversions = new List<object>();
while (columnMoveNext && rowMoveNext)
{
DataFrameColumn column = columnEnumerator.Current;
object value = rowEnumerator.Current;
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
if (value is string stringValue && string.IsNullOrEmpty(stringValue) && column.DataType != typeof(string))
{
value = null;
}
if (value != null)
// Go through row first to make sure there are no data type incompatibilities
IEnumerator<object> rowEnumerator = row.GetEnumerator();
bool rowMoveNext = rowEnumerator.MoveNext();
List<object> cachedObjectConversions = new List<object>();
while (columnMoveNext && rowMoveNext)
{
value = Convert.ChangeType(value, column.DataType);
if (value is null)
DataFrameColumn column = columnEnumerator.Current;
object value = rowEnumerator.Current;
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
if (value is string stringValue && string.IsNullOrEmpty(stringValue) && column.DataType != typeof(string))
{
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
value = null;
}
if (value != null)
{
value = Convert.ChangeType(value, column.DataType);
if (value is null)
{
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
}
}
cachedObjectConversions.Add(value);
columnMoveNext = columnEnumerator.MoveNext();
rowMoveNext = rowEnumerator.MoveNext();
}
cachedObjectConversions.Add(value);
columnMoveNext = columnEnumerator.MoveNext();
rowMoveNext = rowEnumerator.MoveNext();
}
if (rowMoveNext)
{
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
}
columnEnumerator.Reset();
if (rowMoveNext)
{
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
}
// Reset the enumerators
columnEnumerator = ret.Columns.GetEnumerator();
columnMoveNext = columnEnumerator.MoveNext();
rowEnumerator.Reset();
rowEnumerator = row.GetEnumerator();
rowMoveNext = rowEnumerator.MoveNext();
int cacheIndex = 0;
while (columnMoveNext && rowMoveNext)
{
DataFrameColumn column = columnEnumerator.Current;
object value = cachedObjectConversions[cacheIndex];
ResizeByOneAndAppend(column, value);
ret.ResizeByOneAndAppend(column, value);
columnMoveNext = columnEnumerator.MoveNext();
rowMoveNext = rowEnumerator.MoveNext();
cacheIndex++;
Expand All @@ -483,19 +503,22 @@ public void Append(IEnumerable<object> row = null)
{
// Fill the remaining columns with null
DataFrameColumn column = columnEnumerator.Current;
ResizeByOneAndAppend(column, null);
ret.ResizeByOneAndAppend(column, null);
columnMoveNext = columnEnumerator.MoveNext();
}
Columns.RowCount++;
ret.Columns.RowCount++;
return ret;
}

/// <summary>
/// Appends a row inplace by enumerating column names and values from <paramref name="row"/>
/// Appends a row by enumerating column names and values from <paramref name="row"/>
/// </summary>
/// <remarks>If a column's value doesn't match its column's data type, a conversion will be attempted</remarks>
/// <param name="row"></param>
public void Append(IEnumerable<KeyValuePair<string, object>> row)
/// <param name="row">An enumeration of column name and value to be appended</param>
/// <param name="inPlace">If set, appends <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
public DataFrame Append(IEnumerable<KeyValuePair<string, object>> row, bool inPlace = false)
{
DataFrame ret = inPlace ? this : Clone();
if (row == null)
{
throw new ArgumentNullException(nameof(row));
Expand All @@ -505,13 +528,13 @@ public void Append(IEnumerable<KeyValuePair<string, object>> row)
foreach (KeyValuePair<string, object> columnAndValue in row)
{
string columnName = columnAndValue.Key;
int index = Columns.IndexOf(columnName);
int index = ret.Columns.IndexOf(columnName);
if (index == -1)
{
throw new ArgumentException(Strings.InvalidColumnName, nameof(columnName));
}

DataFrameColumn column = Columns[index];
DataFrameColumn column = ret.Columns[index];
object value = columnAndValue.Value;
if (value != null)
{
Expand All @@ -528,22 +551,23 @@ public void Append(IEnumerable<KeyValuePair<string, object>> row)
foreach (KeyValuePair<string, object> columnAndValue in row)
{
string columnName = columnAndValue.Key;
int index = Columns.IndexOf(columnName);
int index = ret.Columns.IndexOf(columnName);

DataFrameColumn column = Columns[index];
DataFrameColumn column = ret.Columns[index];
object value = cachedObjectConversions[cacheIndex];
ResizeByOneAndAppend(column, value);
ret.ResizeByOneAndAppend(column, value);
cacheIndex++;
}

foreach (DataFrameColumn column in Columns)
foreach (DataFrameColumn column in ret.Columns)
{
if (column.Length == Rows.Count)
{
ResizeByOneAndAppend(column, null);
ret.ResizeByOneAndAppend(column, null);
}
}
Columns.RowCount++;
ret.Columns.RowCount++;
return ret;
}

/// <summary>
Expand Down
122 changes: 107 additions & 15 deletions tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1947,84 +1947,176 @@ public void TestMutationOnRows()
}
}

[Fact]
public void TestAppendRows()
{
    DataFrame df = MakeDataFrame<float, bool>(10);
    DataFrame df2 = MakeDataFrame<int, bool>(5);
    Assert.Equal(10, df.Rows.Count);
    Assert.Equal(1, df.Columns[0].NullCount);
    Assert.Equal(1, df.Columns[1].NullCount);

    // inPlace: false must hand back a new DataFrame and leave the source untouched.
    DataFrame ret = df.Append(df2.Rows, inPlace: false);
    Assert.Equal(10, df.Rows.Count);
    Assert.Equal(1, df.Columns[0].NullCount);
    Assert.Equal(1, df.Columns[1].NullCount);

    Verify(ret, df, df2);

    // inPlace: true mutates df, so compare against a snapshot taken before the append.
    DataFrame dfClone = df.Clone();
    df.Append(df2.Rows, inPlace: true);
    Verify(df, dfClone, df2);

    // Asserts that "appended" holds the rows of "first" followed by the rows of "second".
    void Verify(DataFrame appended, DataFrame first, DataFrame second)
    {
        Assert.Equal(15, appended.Rows.Count);
        Assert.Equal(2, appended.Columns[0].NullCount);
        Assert.Equal(2, appended.Columns[1].NullCount);
        for (long i = 0; i < appended.Rows.Count; i++)
        {
            DataFrameRow row = appended.Rows[i];
            for (int j = 0; j < first.Columns.Count; j++)
            {
                if (i < first.Rows.Count)
                {
                    // Expected value first, actual second, so failure messages read correctly.
                    Assert.Equal(first.Rows[i][j], row[j]);
                }
                else
                {
                    // Compared as strings; presumably because the appended values were
                    // converted from the second frame's column type to the first's.
                    Assert.Equal((second.Rows[i - first.Rows.Count][j])?.ToString(), row[j]?.ToString());
                }
            }
        }
    }
}

[Fact]
public void TestAppendRow()
{
DataFrame df = MakeDataFrame<int, bool>(10);
df.Append(new List<object> { 5, true });
df.Append(new List<object> { 5, true }, inPlace: true);
Assert.Equal(11, df.Rows.Count);
Assert.Equal(1, df.Columns[0].NullCount);
Assert.Equal(1, df.Columns[1].NullCount);

df.Append(new List<object> { 100 });
DataFrame ret = df.Append(new List<object> { 5, true });
Assert.Equal(12, ret.Rows.Count);
Assert.Equal(1, ret.Columns[0].NullCount);
Assert.Equal(1, ret.Columns[1].NullCount);

df.Append(new List<object> { 100 }, inPlace: true);
Assert.Equal(12, df.Rows.Count);
Assert.Equal(1, df.Columns[0].NullCount);
Assert.Equal(2, df.Columns[1].NullCount);

df.Append(new List<object> { null, null });
ret = df.Append(new List<object> { 100 }, inPlace: false);
Assert.Equal(13, ret.Rows.Count);
Assert.Equal(1, ret.Columns[0].NullCount);
Assert.Equal(3, ret.Columns[1].NullCount);

df.Append(new List<object> { null, null }, inPlace: true);
Assert.Equal(13, df.Rows.Count);
Assert.Equal(2, df.Columns[0].NullCount);
Assert.Equal(3, df.Columns[1].NullCount);
ret = df.Append(new List<object> { null, null }, inPlace: false);
Assert.Equal(14, ret.Rows.Count);
Assert.Equal(3, ret.Columns[0].NullCount);
Assert.Equal(4, ret.Columns[1].NullCount);

df.Append(new Dictionary<string, object> { { "Column1", (object)5 } , { "Column2", false } });
df.Append(new Dictionary<string, object> { { "Column1", (object)5 }, { "Column2", false } }, inPlace: true);
Assert.Equal(14, df.Rows.Count);
Assert.Equal(2, df.Columns[0].NullCount);
Assert.Equal(3, df.Columns[1].NullCount);
ret = df.Append(new Dictionary<string, object> { { "Column1", (object)5 }, { "Column2", false } }, inPlace: false);
Assert.Equal(15, ret.Rows.Count);
Assert.Equal(2, ret.Columns[0].NullCount);
Assert.Equal(3, ret.Columns[1].NullCount);

df.Append(new Dictionary<string, object> { { "Column1", 5 } });
df.Append(new Dictionary<string, object> { { "Column1", 5 } }, inPlace: true);
Assert.Equal(15, df.Rows.Count);

Assert.Equal(15, df["Column1"].Length);
Assert.Equal(15, df["Column2"].Length);
Assert.Equal(2, df.Columns[0].NullCount);
Assert.Equal(4, df.Columns[1].NullCount);
ret = df.Append(new Dictionary<string, object> { { "Column1", 5 } }, inPlace: false);
Assert.Equal(16, ret.Rows.Count);

df.Append(new Dictionary<string, object> { { "Column2", false } });
Assert.Equal(16, ret["Column1"].Length);
Assert.Equal(16, ret["Column2"].Length);
Assert.Equal(2, ret.Columns[0].NullCount);
Assert.Equal(5, ret.Columns[1].NullCount);

df.Append(new Dictionary<string, object> { { "Column2", false } }, inPlace: true);
Assert.Equal(16, df.Rows.Count);
Assert.Equal(16, df["Column1"].Length);
Assert.Equal(16, df["Column2"].Length);
Assert.Equal(3, df.Columns[0].NullCount);
Assert.Equal(4, df.Columns[1].NullCount);

df.Append((IEnumerable<object>)null);
ret = df.Append(new Dictionary<string, object> { { "Column2", false } }, inPlace: false);
Assert.Equal(17, ret.Rows.Count);
Assert.Equal(17, ret["Column1"].Length);
Assert.Equal(17, ret["Column2"].Length);
Assert.Equal(4, ret.Columns[0].NullCount);
Assert.Equal(4, ret.Columns[1].NullCount);

df.Append((IEnumerable<object>)null, inPlace: true);
Assert.Equal(17, df.Rows.Count);
Assert.Equal(17, df["Column1"].Length);
Assert.Equal(17, df["Column2"].Length);
Assert.Equal(4, df.Columns[0].NullCount);
Assert.Equal(5, df.Columns[1].NullCount);
ret = df.Append((IEnumerable<object>)null, inPlace: false);
Assert.Equal(18, ret.Rows.Count);
Assert.Equal(18, ret["Column1"].Length);
Assert.Equal(18, ret["Column2"].Length);
Assert.Equal(5, ret.Columns[0].NullCount);
Assert.Equal(6, ret.Columns[1].NullCount);

// DataFrame must remain usable even if Append throws
Assert.Throws<FormatException>(() => df.Append(new List<object> { 5, "str" }));
Assert.Throws<FormatException>(() => df.Append(new Dictionary<string, object> { { "Column2", "str" } }));
Assert.Throws<ArgumentException>(() => df.Append(new List<object> { 5, true, true }));
Assert.Throws<FormatException>(() => df.Append(new List<object> { 5, "str" }, inPlace: true));
Assert.Throws<FormatException>(() => df.Append(new Dictionary<string, object> { { "Column2", "str" } }, inPlace: true));
Assert.Throws<ArgumentException>(() => df.Append(new List<object> { 5, true, true }, inPlace: true));

df.Append(inPlace: true);
Assert.Equal(18, df.Rows.Count);
Assert.Equal(18, df["Column1"].Length);
Assert.Equal(18, df["Column2"].Length);
Assert.Equal(5, df.Columns[0].NullCount);
Assert.Equal(6, df.Columns[1].NullCount);

df.Append();
ret = df.Append(inPlace: false);
Assert.Equal(18, df.Rows.Count);
Assert.Equal(18, df["Column1"].Length);
Assert.Equal(18, df["Column2"].Length);
Assert.Equal(5, df.Columns[0].NullCount);
Assert.Equal(6, df.Columns[1].NullCount);
Assert.Equal(19, ret.Rows.Count);
Assert.Equal(19, ret["Column1"].Length);
Assert.Equal(19, ret["Column2"].Length);
Assert.Equal(6, ret.Columns[0].NullCount);
Assert.Equal(7, ret.Columns[1].NullCount);
}

[Fact]
public void TestAppendEmptyValue()
{
DataFrame df = MakeDataFrame<int, bool>(10);
df.Append(new List<object> { "", true });
df.Append(new List<object> { "", true }, inPlace: true);
Assert.Equal(11, df.Rows.Count);
Assert.Equal(2, df.Columns[0].NullCount);
Assert.Equal(1, df.Columns[1].NullCount);

StringDataFrameColumn column = new StringDataFrameColumn("Strings", Enumerable.Range(0, 11).Select(x => x.ToString()));
df.Columns.Add(column);

df.Append(new List<object> { 1, true, "" });
df.Append(new List<object> { 1, true, "" }, inPlace: true);
Assert.Equal(12, df.Rows.Count);
Assert.Equal(2, df.Columns[0].NullCount);
Assert.Equal(1, df.Columns[1].NullCount);
Assert.Equal(0, df.Columns[2].NullCount);

df.Append(new List<object> { 1, true, null });
df.Append(new List<object> { 1, true, null }, inPlace: true);
Assert.Equal(13, df.Rows.Count);
Assert.Equal(1, df.Columns[2].NullCount);
}
Expand Down

0 comments on commit 82c315f

Please sign in to comment.