Skip to content

Commit

Permalink
review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
artidoro committed Dec 5, 2018
1 parent fed01cd commit 490e03f
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 64 deletions.
81 changes: 33 additions & 48 deletions docs/code/MlNetCookBook.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,7 @@ If the schema of the data is not known at compile time, or too cumbersome, you c
var mlContext = new MLContext();

// Create the reader: define the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[] {
// A boolean column depicting the 'label'.
new TextLoader.Column("IsOver50K", DataKind.BL, 0),
// Three text columns.
Expand All @@ -126,8 +124,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
new TextLoader.Column("MaritalStatus", DataKind.TX, 3)
},
// First line of the file is a header, not a data row.
HasHeader = true
});
hasHeader: true
);

// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var data = reader.Read(dataPath);
Expand Down Expand Up @@ -175,19 +173,17 @@ The code is very similar using the dynamic API:
var mlContext = new MLContext();

// Create the reader: define the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[] {
// A boolean column depicting the 'label'.
new TextLoader.Column("IsOver50k", DataKind.BL, 0),
new TextLoader.Column("IsOver50K", DataKind.BL, 0),
// Three text columns.
new TextLoader.Column("Workclass", DataKind.TX, 1),
new TextLoader.Column("Education", DataKind.TX, 2),
new TextLoader.Column("MaritalStatus", DataKind.TX, 3)
},
// First line of the file is a header, not a data row.
HasHeader = true
});
hasHeader: true
);

var data = reader.Read(exampleFile1, exampleFile2);
```
Expand Down Expand Up @@ -365,19 +361,17 @@ You can also use the dynamic API to create the equivalent of the previous pipeli
var mlContext = new MLContext();

// Create the reader: define the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[] {
// A boolean column depicting the 'label'.
new TextLoader.Column("IsOver50k", DataKind.BL, 0),
new TextLoader.Column("IsOver50K", DataKind.BL, 0),
// Three text columns.
new TextLoader.Column("Workclass", DataKind.TX, 1),
new TextLoader.Column("Education", DataKind.TX, 2),
new TextLoader.Column("MaritalStatus", DataKind.TX, 3)
},
// First line of the file is a header, not a data row.
HasHeader = true
});
hasHeader: true
);

// Start creating our processing pipeline. For now, let's just concatenate all the text columns
// together into one.
Expand Down Expand Up @@ -468,20 +462,18 @@ var mlContext = new MLContext();

// Step one: read the data as an IDataView.
// First, we define the reader: specify the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[] {
// We read the first 11 values as a single float vector.
new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10),

// Separately, read the target variable.
new TextLoader.Column("Target", DataKind.R4, 11),
},
// First line of the file is a header, not a data row.
HasHeader = true,
hasHeader: true,
// Default separator is tab, but we need a semicolon.
Separator = ";"
});
separatorChar: ';'
);

// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var trainData = reader.Read(trainDataPath);
Expand Down Expand Up @@ -617,9 +609,7 @@ var mlContext = new MLContext();

// Step one: read the data as an IDataView.
// First, we define the reader: specify the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[] {
new TextLoader.Column("SepalLength", DataKind.R4, 0),
new TextLoader.Column("SepalWidth", DataKind.R4, 1),
new TextLoader.Column("PetalLength", DataKind.R4, 2),
Expand All @@ -628,8 +618,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
new TextLoader.Column("Label", DataKind.TX, 4),
},
// Default separator is tab, but the dataset uses commas.
Separator = ","
});
separatorChar: ','
);

// Retrieve the training data.
var trainData = reader.Read(irisDataPath);
Expand Down Expand Up @@ -910,17 +900,15 @@ You can achieve the same results using the dynamic API.
var mlContext = new MLContext();

// Define the reader: specify the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[] {
// The four features of the Iris dataset will be grouped together as one Features column.
new TextLoader.Column("Features", DataKind.R4, 0, 3),
// Label: kind of iris.
new TextLoader.Column("Label", DataKind.TX, 4),
},
// Default separator is tab, but the dataset uses commas.
Separator = ","
});
separatorChar: ','
);

// Read the training data.
var trainData = reader.Read(dataPath);
Expand Down Expand Up @@ -1027,9 +1015,8 @@ You can achieve the same results using the dynamic API.
var mlContext = new MLContext();

// Define the reader: specify the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[]
{
new TextLoader.Column("Label", DataKind.BL, 0),
// We will load all the categorical features into one vector column of size 8.
new TextLoader.Column("CategoricalFeatures", DataKind.TX, 1, 8),
Expand All @@ -1038,8 +1025,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
// Let's also separately load the 'Workclass' column.
new TextLoader.Column("Workclass", DataKind.TX, 1),
},
HasHeader = true
});
hasHeader: true
);

// Read the data.
var data = reader.Read(dataPath);
Expand Down Expand Up @@ -1154,14 +1141,13 @@ You can achieve the same results using the dynamic API.
var mlContext = new MLContext();

// Define the reader: specify the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[]
{
new TextLoader.Column("IsToxic", DataKind.BL, 0),
new TextLoader.Column("Message", DataKind.TX, 1),
},
HasHeader = true
});
hasHeader: true
);

// Read the data.
var data = reader.Read(dataPath);
Expand Down Expand Up @@ -1274,9 +1260,8 @@ var mlContext = new MLContext();

// Step one: read the data as an IDataView.
// First, we define the reader: specify the data columns and where to find them in the text file.
var reader = mlContext.Data.TextReader(new TextLoader.Arguments
{
Column = new[] {
var reader = mlContext.Data.TextReader(new[]
{
// Read each Iris measurement as its own float column.
new TextLoader.Column("SepalLength", DataKind.R4, 0),
new TextLoader.Column("SepalWidth", DataKind.R4, 1),
Expand All @@ -1286,8 +1271,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
new TextLoader.Column("Label", DataKind.TX, 4),
},
// Default separator is tab, but the dataset uses commas.
Separator = ","
});
separatorChar: ','
);

// Read the data.
var data = reader.Read(dataPath);
Expand Down
7 changes: 4 additions & 3 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1033,11 +1033,12 @@ private static Arguments MakeArgs(Column[] columns, bool hasHeader, char[] separ
/// <param name="env">The environment to use.</param>
/// <param name="args">Defines the settings of the load operation. If null, default settings are used.</param>
/// <param name="dataSample">An optional data sample that exposes items that can be used for reading.</param>
public TextLoader(IHostEnvironment env, Arguments args, IMultiStreamSource dataSample = null)
public TextLoader(IHostEnvironment env, Arguments args = null, IMultiStreamSource dataSample = null)
{
args = args ?? new Arguments();

Contracts.CheckValue(env, nameof(env));
_host = env.Register(RegistrationName);

_host.CheckValue(args, nameof(args));
_host.CheckValueOrNull(dataSample);

Expand Down Expand Up @@ -1332,7 +1333,7 @@ public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSo
/// <param name="env">The environment to use.</param>
/// <param name="fileSource">Specifies a file from which to read.</param>
/// <param name="args">Defines the settings of the load operation. If null, default settings are used.</param>
public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args)
public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args = null)
=> new TextLoader(env, args, fileSource).Read(fileSource);

public void Save(ModelSaveContext ctx)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public static class TextLoaderSaverCatalog
/// <param name="catalog">The catalog.</param>
/// <param name="columns">The columns of the schema.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="dataSample">The optional location of a data sample.</param>
public static TextLoader TextReader(this DataOperations catalog,
Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null)
Expand All @@ -40,7 +40,7 @@ public static TextLoader TextReader(this DataOperations catalog, Arguments args,
/// <param name="catalog">The catalog.</param>
/// <param name="columns">The columns of the schema.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="path">The path to the file.</param>
/// <returns>The data view.</returns>
public static IDataView ReadFromTextFile(this DataOperations catalog,
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ public static IEnumerable<KeyValuePair<ColumnRole, string>> LoadRoleMappingsOrNu
{
// REVIEW: Should really validate the schema here, and consider
// ignoring this stream if it isn't as expected.
var loader = TextLoader.ReadFile(env, new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile), new TextLoader.Arguments());
var loader = TextLoader.ReadFile(env, new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile));

using (var cursor = loader.GetRowCursor(c => true))
{
Expand Down
3 changes: 0 additions & 3 deletions src/Microsoft.ML.Transforms/TermLookupTransformer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,6 @@ private static IComponentFactory<IMultiStreamSource, IDataLoader> GetLoaderFacto
ulong max = ulong.MinValue;
try
{
var txtArgs = new TextLoader.Arguments();
bool parsed = CmdParser.ParseArguments(host, "col=Term:TX:0 col=Value:TX:1", txtArgs);
host.Assert(parsed);
var data = TextLoader.ReadFile(host, new MultiFileSource(filename), new[]
{
new TextLoader.Column("Term", DataKind.TX, 0),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public void TrainSentiment()
{
var env = new MLContext(seed: 1);
// Pipeline
var arguemnts = new TextLoader.Arguments()
var arguments = new TextLoader.Arguments()
{
Column = new TextLoader.Column[]
{
Expand All @@ -86,7 +86,7 @@ public void TrainSentiment()
AllowQuoting = false,
AllowSparse = false
};
var loader = TextLoader.ReadFile(env, new MultiFileSource(_sentimentDataPath), arguemnts);
var loader = TextLoader.ReadFile(env, new MultiFileSource(_sentimentDataPath), arguments);

var text = TextFeaturizingEstimator.Create(env,
new TextFeaturizingEstimator.Arguments()
Expand Down
8 changes: 4 additions & 4 deletions test/Microsoft.ML.Predictor.Tests/TestPredictors.cs
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ public void RankingLightGBMTest()
public void TestTreeEnsembleCombiner()
{
var dataPath = GetDataPath("breast-cancer.txt");
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));

var fastTrees = new IPredictorModel[3];
for (int i = 0; i < 3; i++)
Expand All @@ -628,7 +628,7 @@ public void TestTreeEnsembleCombiner()
public void TestTreeEnsembleCombinerWithCategoricalSplits()
{
var dataPath = GetDataPath("adult.tiny.with-schema.txt");
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));

var cat = new OneHotEncodingEstimator(Env, "Categories", "Features").Fit(dataView).Transform(dataView);
var fastTrees = new IPredictorModel[3];
Expand Down Expand Up @@ -729,7 +729,7 @@ private void CombineAndTestTreeEnsembles(IDataView idv, IPredictorModel[] fastTr
public void TestEnsembleCombiner()
{
var dataPath = GetDataPath("breast-cancer.txt");
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));

var predictors = new IPredictorModel[]
{
Expand Down Expand Up @@ -775,7 +775,7 @@ public void TestEnsembleCombiner()
public void TestMultiClassEnsembleCombiner()
{
var dataPath = GetDataPath("breast-cancer.txt");
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));

var predictors = new IPredictorModel[]
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ protected bool SaveLoadText(IDataView view, IHostEnvironment env,

// Note that we don't pass in "args"; the loader falls back to default args, so we test
// the auto-schema parsing.
var loadedData = TextLoader.ReadFile(env, new MultiFileSource(pathData), new TextLoader.Arguments());
var loadedData = TextLoader.ReadFile(env, new MultiFileSource(pathData));
if (!CheckMetadataTypes(loadedData.Schema))
Failed();

Expand Down

0 comments on commit 490e03f

Please sign in to comment.