Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

V2.8.1 Release #97

Merged
merged 5 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
<ManagePackageVersionsCentrally>true</ManagePackageVersionsCentrally>
</PropertyGroup>
<ItemGroup>
<PackageVersion Include="Apache.Arrow" Version="13.0.0" />
<PackageVersion Include="Parquet.Net" Version="4.16.4" />
<PackageVersion Include="Apache.Arrow" Version="14.0.1" />
<PackageVersion Include="Parquet.Net" Version="4.17.0" />
<PackageVersion Include="Microsoft.CSharp" Version="4.7.0" />
<PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.6.0" />
<PackageVersion Include="RichardSzalay.MockHttp" Version="6.0.0" />
Expand Down
30 changes: 16 additions & 14 deletions src/ParquetViewer.Engine/ParquetEngine.Processor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public partial class ParquetEngine
public async Task<DataTable> ReadRowsAsync(List<string> selectedFields, int offset, int recordCount, CancellationToken cancellationToken, IProgress<int>? progress = null)
{
long recordsLeftToRead = recordCount;
DataTable result = BuildDataTable(selectedFields);
DataTable result = BuildDataTable(null, selectedFields);
result.BeginLoadData(); //might speed things up

foreach (var reader in this.GetReaders(offset))
Expand Down Expand Up @@ -86,7 +86,7 @@ private async Task ProcessRowGroup(DataTable dataTable, ParquetRowGroupReader gr
{
cancellationToken.ThrowIfCancellationRequested();

var field = ParquetSchemaTree.GetChild(column.ColumnName);
var field = ParquetSchemaTree.GetChild(column.ExtendedProperties["Parent"] as string, column.ColumnName);
if (field.SchemaElement.LogicalType?.LIST is not null || field.SchemaElement.ConvertedType == Parquet.Meta.ConvertedType.LIST)
{
await ReadListField(dataTable, groupReader, rowBeginIndex, field, skipRecords,
Expand Down Expand Up @@ -120,7 +120,7 @@ private async Task ReadPrimitiveField(DataTable dataTable, ParquetRowGroupReader
int skippedRecords = 0;
var dataColumn = await groupReader.ReadColumnAsync(field.DataField ?? throw new Exception($"Pritimive field `{field.Path}` is missing its data field"), cancellationToken);

var fieldIndex = dataTable.Columns[field.DataField.Path.ToString()]?.Ordinal ?? throw new Exception($"Column `{field.Path}` is missing");
var fieldIndex = dataTable.Columns[field.Path]?.Ordinal ?? throw new Exception($"Column `{field.Path}` is missing");
foreach (var value in dataColumn.Data)
{
cancellationToken.ThrowIfCancellationRequested();
Expand Down Expand Up @@ -285,7 +285,7 @@ private async Task ReadStructField(DataTable dataTable, ParquetRowGroupReader gr
long skipRecords, long readRecords, bool isFirstColumn, CancellationToken cancellationToken, IProgress<int>? progress)
{
//Read struct data as a new datatable
DataTable structFieldDataTable = BuildDataTable(field.Children.Select(f => $"{field.Path}/{f.Path}").ToList());
DataTable structFieldDataTable = BuildDataTable(field.Path, field.Children.Select(f => f.Path).ToList());

//Need to calculate progress differently for structs
var structFieldReadProgress = new SimpleProgress();
Expand All @@ -309,16 +309,16 @@ private async Task ReadStructField(DataTable dataTable, ParquetRowGroupReader gr
//Read the struct data and populate the datatable
await ProcessRowGroup(structFieldDataTable, groupReader, skipRecords, readRecords, cancellationToken, structFieldReadProgress);

if (isFirstColumn)
{
var newRow = dataTable.NewRow();
dataTable.Rows.Add(newRow);
}

var rowIndex = rowBeginIndex;
var fieldIndex = dataTable.Columns[field.Path]?.Ordinal ?? throw new Exception($"Column `{field.Path}` is missing");
for (var i = 0; i < structFieldDataTable.Rows.Count; i++)
{
if (isFirstColumn)
{
var newRow = dataTable.NewRow();
dataTable.Rows.Add(newRow);
}

DataRow datarow = GetRow(dataTable, rowIndex);
datarow[fieldIndex] = new StructValue(field.Path, structFieldDataTable.Rows[i]);
rowIndex++;
Expand Down Expand Up @@ -350,12 +350,12 @@ private DataRow GetRow(DataTable dataTable, int rowIndex)
}
}

private DataTable BuildDataTable(List<string> fields)
private DataTable BuildDataTable(string? parent, List<string> fields)
{
DataTable dataTable = new();
foreach (var field in fields)
{
var schema = ParquetSchemaTree.GetChild(field);
var schema = ParquetSchemaTree.GetChild(parent, field);

DataColumn newColumn;
if (schema.SchemaElement.ConvertedType == ConvertedType.LIST)
Expand All @@ -379,14 +379,16 @@ private DataTable BuildDataTable(List<string> fields)
}
else
{
var clrType = schema.DataField?.ClrType ?? throw new Exception($"{field} has no data field");
var clrType = schema.DataField?.ClrType ?? throw new Exception($"{(parent is not null ? parent + "/" : string.Empty)}/{field} has no data field");
newColumn = new DataColumn(field, clrType);
}

newColumn.ExtendedProperties.Add("Parent", parent);

//We don't support case sensitive field names unfortunately
if (dataTable.Columns.Contains(newColumn.ColumnName))
{
throw new NotSupportedException($"Duplicate column '{field}' detected. Column names are case insensitive and must be unique.");
throw new NotSupportedException($"Duplicate column '{(parent is not null ? parent + "/" : string.Empty)}{field}' detected. Column names are case insensitive and must be unique.");
}

dataTable.Columns.Add(newColumn);
Expand Down
20 changes: 9 additions & 11 deletions src/ParquetViewer.Engine/ParquetSchemaElement.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,22 @@ public ParquetSchemaElement(SchemaElement schemaElement)
this.SchemaElement = schemaElement;
}

public ParquetSchemaElement GetChild(string name)
public ParquetSchemaElement GetChild(string name) => GetChildImpl(name);

public ParquetSchemaElement GetChild(string? parent, string name)
{
if (name?.Contains('/') == true)
{
string currentPath = name.Substring(0, name.IndexOf('/'));
string remainingPath = name.Substring(name.IndexOf('/') + 1);
var child = GetChildImpl(currentPath);
return child.GetChild(remainingPath);
}
else
if (parent is null)
{
return GetChildImpl(name);
}

ParquetSchemaElement GetChildImpl(string? name) => name is not null && _children.TryGetValue(name, out var result)
? result : throw new Exception($"Field schema path not found: `{Path}/{name}`");
var child = GetChildImpl(parent);
return child.GetChild(name);
}

private ParquetSchemaElement GetChildImpl(string? name) => name is not null && _children.TryGetValue(name, out var result)
? result : throw new Exception($"Field schema path not found: `{Path}/{name}`");

public ParquetSchemaElement GetImmediateChildOrSingle(string name)
{
if (_children.TryGetValue(name, out var result))
Expand Down
Binary file not shown.
Binary file not shown.
Binary file modified src/ParquetViewer.Tests/Data/STRUCT_TEST1.parquet
Binary file not shown.
6 changes: 6 additions & 0 deletions src/ParquetViewer.Tests/ParquetViewer.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@
<None Update="Data\COLUMN_ENDING_IN_PERIOD_TEST1.parquet">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="Data\COLUMN_NAME_WITH_FORWARD_SLASH1.parquet">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="Data\DATETIME_TEST1.parquet">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
Expand Down Expand Up @@ -77,6 +80,9 @@
<None Update="Data\NULLABLE_GUID_TEST1.parquet">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="Data\ORACLE_MALFORMED_INT64_TEST1.parquet">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
<None Update="Data\PARTITIONED_PARQUET_FILE_TEST1\bldgtype=B\bd8c129da60e412db4b21800b9e0b983.parquet">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</None>
Expand Down
53 changes: 41 additions & 12 deletions src/ParquetViewer.Tests/SanityTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -182,18 +182,21 @@ public async Task STRUCT_TYPE_TEST1()
{
using var parquetEngine = await ParquetEngine.OpenFileOrFolderAsync("Data/STRUCT_TEST1.parquet", default);

Assert.Equal(2, parquetEngine.RecordCount);
Assert.Equal(2, parquetEngine.Fields.Count);

var dataTable = await parquetEngine.ReadRowsAsync(parquetEngine.Fields, 0, int.MaxValue, default);
Assert.IsType<string>(dataTable.Rows[0][0]);
Assert.Equal("12345-6", (string)dataTable.Rows[0][0]);
Assert.Equal(10, parquetEngine.RecordCount);
Assert.Equal(6, parquetEngine.Fields.Count);

//We currently only support three of the fields in this test file
string[] supportedFields = { "txn", "remove", "protocol" };
var dataTable = await parquetEngine.ReadRowsAsync(
parquetEngine.Fields.Where(f => supportedFields.Contains(f)).ToList(), 0, int.MaxValue, default);
Assert.IsType<StructValue>(dataTable.Rows[0][0]);
Assert.Equal("{\"appId\":{},\"version\":0,\"lastUpdated\":{}}", ((StructValue)dataTable.Rows[0][0]).ToString());
Assert.IsType<StructValue>(dataTable.Rows[0][1]);
Assert.Equal("{\"firstName\":\"Ivan\",\"lastName\":\"Gavryliuk\"}", ((StructValue)dataTable.Rows[0][1]).ToString());
Assert.IsType<string>(dataTable.Rows[1][0]);
Assert.Equal("12345-7", (string)dataTable.Rows[1][0]);
Assert.IsType<StructValue>(dataTable.Rows[1][1]);
Assert.Equal("{\"firstName\":\"Richard\",\"lastName\":\"Conway\"}", ((StructValue)dataTable.Rows[1][1]).ToString());
Assert.Equal("{\"path\":{},\"deletionTimestamp\":{},\"dataChange\":false}", ((StructValue)dataTable.Rows[0][1]).ToString());
Assert.IsType<StructValue>(dataTable.Rows[0][2]);
Assert.Equal("{\"minReaderVersion\":1,\"minWriterVersion\":2}", ((StructValue)dataTable.Rows[0][2]).ToString());
Assert.IsType<DBNull>(dataTable.Rows[9][2]);
Assert.Equal(DBNull.Value, dataTable.Rows[9][2]);
}

[Fact]
Expand Down Expand Up @@ -309,6 +312,32 @@ public async Task MALFORMED_DATETIME_TEST1()
Assert.Fail("Looks like the Malformed DateTime Fix is no longer needed! Remove that part of the code.");
}
Assert.Equal(typeof(long), dataTable.Rows[0]["ds"]?.GetType()); //If it's not a datetime, then it should be a long.
}
}

[Fact]
public async Task COLUMN_NAME_WITH_FORWARD_SLASH_TEST1()
{
    //Verifies that a column whose name contains a forward slash ("FLC K/L") can be read by name.
    //TODO: need to make this file smaller
    const string testFilePath = "Data/COLUMN_NAME_WITH_FORWARD_SLASH1.parquet";
    using var engine = await ParquetEngine.OpenFileOrFolderAsync(testFilePath, default);

    //Check the file's metadata before reading any rows
    Assert.Equal(181966, engine.RecordCount);
    Assert.Equal(320, engine.Fields.Count);

    //Only a single record is requested (offset 0, record count 1)
    var table = await engine.ReadRowsAsync(engine.Fields, 0, 1, default);
    var firstRow = table.Rows[0];
    Assert.Equal((byte)0, firstRow["FLC K/L"]);
}

[Fact]
public async Task ORACLE_MALFORMED_INT64_TEST1()
{
    //Verifies that the first row of this test file reads back as ("DEPOSIT", 1L).
    const string testFilePath = "Data/ORACLE_MALFORMED_INT64_TEST1.parquet";
    using var engine = await ParquetEngine.OpenFileOrFolderAsync(testFilePath, default);

    //Check the file's metadata before reading any rows
    Assert.Equal(126, engine.RecordCount);
    Assert.Equal(2, engine.Fields.Count);

    //Read every record in the file and inspect the first row
    var table = await engine.ReadRowsAsync(engine.Fields, 0, int.MaxValue, default);
    var firstRow = table.Rows[0];
    Assert.Equal("DEPOSIT", firstRow[0]);
    Assert.Equal(1L, firstRow[1]);
}
}
}
10 changes: 10 additions & 0 deletions src/ParquetViewer/Controls/QuickPeekForm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ public QuickPeekForm(string titleSuffix, object data, Guid uniqueTag, int source

/// <summary>
/// Load handler: sizes the form from the grid's row header and column widths,
/// then positions it just to the right of the mouse cursor.
/// </summary>
private void QuickPeakForm_Load(object sender, EventArgs e)
{
    const int extraWidth = 26;    //added on top of the row header width; value carried over unchanged ("magic number?")
    const int minFormWidth = 280;
    const int maxFormWidth = 900; //900 pixel max seems reasonable, right?

    //Stays 0 when there is no grid, which then resolves to the minimum width below
    var desiredWidth = 0;
    if (this.mainGridView is not null)
    {
        desiredWidth = this.mainGridView.RowHeadersWidth + extraWidth;
        foreach (DataGridViewColumn gridColumn in this.mainGridView.Columns)
        {
            desiredWidth += gridColumn.Width;
        }
    }

    //Keep the final width within the [min, max] bounds
    if (desiredWidth < minFormWidth)
    {
        desiredWidth = minFormWidth;
    }
    else if (desiredWidth > maxFormWidth)
    {
        desiredWidth = maxFormWidth;
    }

    this.Width = desiredWidth;
    this.Location = new Point(Cursor.Position.X + 5, Cursor.Position.Y);
}

Expand Down
2 changes: 1 addition & 1 deletion src/ParquetViewer/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("2.8.0.3")]
[assembly: AssemblyVersion("2.8.1.2")]