forked from dotnet/machinelearning
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
get next pipeline API rev -- refactor API to consume column dimensions, purpose, type, and name instead of available trainers & transforms (dotnet#19)
- Loading branch information
Showing
14 changed files
with
205 additions
and
293 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
namespace Microsoft.ML.Auto
{
    /// <summary>
    /// Simple holder for per-column statistics. Both values are nullable so
    /// that "not computed" can be represented as <c>null</c>.
    /// </summary>
    internal class ColumnDimensions
    {
        /// <summary>
        /// Creates a holder from the supplied values; they are stored as given.
        /// </summary>
        /// <param name="cardinality">Value to store in <see cref="Cardinality"/>; may be null.</param>
        /// <param name="hasMissing">Value to store in <see cref="HasMissing"/>; may be null.</param>
        public ColumnDimensions(int? cardinality, bool? hasMissing)
        {
            this.Cardinality = cardinality;
            this.HasMissing = hasMissing;
        }

        /// <summary>
        /// Cardinality recorded for the column, or null when none was supplied.
        /// NOTE(review): presumably the number of distinct values - confirm against the producer.
        /// </summary>
        public int? Cardinality;

        /// <summary>
        /// Whether the column was found to contain missing values, or null when not supplied.
        /// </summary>
        public bool? HasMissing;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.Auto
{
    /// <summary>
    /// Computes per-column <see cref="ColumnDimensions"/> for a data view.
    /// </summary>
    internal class DatasetDimensionsApi
    {
        // Upper bound passed to Take() before any column is scanned.
        private const int MaxRowsToRead = 1000;

        /// <summary>
        /// Builds one <see cref="ColumnDimensions"/> entry per schema column.
        /// </summary>
        /// <param name="data">
        /// Data view to inspect. It is first restricted with <c>Take(MaxRowsToRead)</c>,
        /// so every result is based on that restricted view.
        /// </param>
        /// <param name="purposes">
        /// Inferred purposes, indexed by schema column position
        /// (entry <c>i</c> is read for column <c>i</c>).
        /// </param>
        /// <returns>
        /// An array with one element per schema column. Cardinality is filled in only for
        /// text columns whose purpose is <see cref="ColumnPurpose.CategoricalFeature"/>;
        /// HasMissing is filled in only for columns whose item type is <c>NumberType.R4</c>.
        /// All other values stay null.
        /// </returns>
        public static ColumnDimensions[] CalcColumnDimensions(IDataView data, PurposeInference.Column[] purposes)
        {
            data = data.Take(MaxRowsToRead);

            var schema = data.Schema;
            var result = new ColumnDimensions[schema.Count];

            for (var colIndex = 0; colIndex < schema.Count; colIndex++)
            {
                var columnType = schema[colIndex].Type;
                var purpose = purposes[colIndex];
                var itemType = columnType.ItemType();

                // Defaults: nothing is known about the column yet.
                int? cardinality = null;
                bool? hasMissing = null;

                // Cardinality is only computed for categorical text features.
                var isCategoricalText = itemType.IsText()
                    && purpose.Purpose == ColumnPurpose.CategoricalFeature;
                if (isCategoricalText)
                {
                    cardinality = DatasetDimensionsUtil.GetTextColumnCardinality(data, colIndex);
                }

                // Missing-value detection is only performed for R4 columns.
                // todo: upgrade logic to consider R8?
                if (itemType == NumberType.R4)
                {
                    if (columnType.IsVector())
                    {
                        hasMissing = DatasetDimensionsUtil.HasMissingNumericVector(data, colIndex);
                    }
                    else
                    {
                        hasMissing = DatasetDimensionsUtil.HasMissingNumericSingleValue(data, colIndex);
                    }
                }

                result[colIndex] = new ColumnDimensions(cardinality, hasMissing);
            }

            return result;
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using Microsoft.ML.Data; | ||
|
||
namespace Microsoft.ML.Auto
{
    /// <summary>
    /// Cursor-based helpers that scan a single column of a data view.
    /// </summary>
    internal static class DatasetDimensionsUtil
    {
        /// <summary>
        /// Counts the distinct string values found in a text column.
        /// </summary>
        /// <param name="data">Data view to scan; every row the cursor yields is visited.</param>
        /// <param name="colIndex">Index of the column to read; only this column is activated on the cursor.</param>
        /// <returns>The number of distinct values seen, compared as strings with the default string comparer.</returns>
        public static int GetTextColumnCardinality(IDataView data, int colIndex)
        {
            var distinctValues = new HashSet<string>();
            using (var cursor = data.GetRowCursor(x => x == colIndex))
            {
                var readText = cursor.GetGetter<ReadOnlyMemory<char>>(colIndex);
                while (cursor.MoveNext())
                {
                    var text = default(ReadOnlyMemory<char>);
                    readText(ref text);
                    distinctValues.Add(text.ToString());
                }
            }
            return distinctValues.Count;
        }

        /// <summary>
        /// Determines whether a scalar single-precision column contains a NaN.
        /// </summary>
        /// <param name="data">Data view to scan.</param>
        /// <param name="colIndex">Index of the column to read; only this column is activated on the cursor.</param>
        /// <returns>True as soon as a NaN is encountered; false if the cursor is exhausted without one.</returns>
        public static bool HasMissingNumericSingleValue(IDataView data, int colIndex)
        {
            var foundNaN = false;
            using (var cursor = data.GetRowCursor(x => x == colIndex))
            {
                var readValue = cursor.GetGetter<float>(colIndex);
                float current = 0f;

                // Stop advancing the cursor as soon as a NaN has been seen.
                while (!foundNaN && cursor.MoveNext())
                {
                    readValue(ref current);
                    foundNaN = float.IsNaN(current);
                }
            }
            return foundNaN;
        }

        /// <summary>
        /// Determines whether a single-precision vector column contains a NaN in any row.
        /// </summary>
        /// <param name="data">Data view to scan.</param>
        /// <param name="colIndex">Index of the column to read; only this column is activated on the cursor.</param>
        /// <returns>
        /// True as soon as <c>VBufferUtils.HasNaNs</c> reports a NaN for a row's vector;
        /// false if the cursor is exhausted without one.
        /// </returns>
        public static bool HasMissingNumericVector(IDataView data, int colIndex)
        {
            var foundNaN = false;
            using (var cursor = data.GetRowCursor(x => x == colIndex))
            {
                var readVector = cursor.GetGetter<VBuffer<float>>(colIndex);
                var current = default(VBuffer<float>);

                // Stop advancing the cursor as soon as a row with a NaN has been seen.
                while (!foundNaN && cursor.MoveNext())
                {
                    readVector(ref current);
                    foundNaN = VBufferUtils.HasNaNs(current);
                }
            }
            return foundNaN;
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.