review comments

dotnet · Dec 5, 2018 · 490e03f · 490e03f
1 parent fed01cd
commit 490e03f
Show file tree

Hide file tree

Showing 8 changed files with 47 additions and 64 deletions.
diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md
@@ -115,9 +115,7 @@ If the schema of the data is not known at compile time, or too cumbersome, you c
 var mlContext = new MLContext();
 
 // Create the reader: define the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] {
         // A boolean column depicting the 'label'.
         new TextLoader.Column("IsOver50K", DataKind.BL, 0),
         // Three text columns.
@@ -126,8 +124,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
         new TextLoader.Column("MaritalStatus", DataKind.TX, 3)
     },
     // First line of the file is a header, not a data row.
-    HasHeader = true
-});
+    hasHeader: true
+);
 
 // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
 var data = reader.Read(dataPath);
@@ -175,19 +173,17 @@ The code is very similar using the dynamic API:
 var mlContext = new MLContext();
 
 // Create the reader: define the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] {
         // A boolean column depicting the 'label'.
-        new TextLoader.Column("IsOver50k", DataKind.BL, 0),
+        new TextLoader.Column("IsOver50K", DataKind.BL, 0),
         // Three text columns.
         new TextLoader.Column("Workclass", DataKind.TX, 1),
         new TextLoader.Column("Education", DataKind.TX, 2),
         new TextLoader.Column("MaritalStatus", DataKind.TX, 3)
     },
     // First line of the file is a header, not a data row.
-    HasHeader = true
-});
+    hasHeader: true
+);
 
 var data = reader.Read(exampleFile1, exampleFile2);
 ```
@@ -365,19 +361,17 @@ You can also use the dynamic API to create the equivalent of the previous pipeli
 var mlContext = new MLContext();
 
 // Create the reader: define the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] {
         // A boolean column depicting the 'label'.
-        new TextLoader.Column("IsOver50k", DataKind.BL, 0),
+        new TextLoader.Column("IsOver50K", DataKind.BL, 0),
         // Three text columns.
         new TextLoader.Column("Workclass", DataKind.TX, 1),
         new TextLoader.Column("Education", DataKind.TX, 2),
         new TextLoader.Column("MaritalStatus", DataKind.TX, 3)
     },
     // First line of the file is a header, not a data row.
-    HasHeader = true
-});
+    hasHeader: true
+);
 
 // Start creating our processing pipeline. For now, let's just concatenate all the text columns
 // together into one.
@@ -468,20 +462,18 @@ var mlContext = new MLContext();
 
 // Step one: read the data as an IDataView.
 // First, we define the reader: specify the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] {
         // We read the first 11 values as a single float vector.
         new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10),
 
         // Separately, read the target variable.
         new TextLoader.Column("Target", DataKind.R4, 11),
     },
     // First line of the file is a header, not a data row.
-    HasHeader = true,
+    hasHeader: true,
     // Default separator is tab, but we need a semicolon.
-    Separator = ";"
-});
+    separatorChar: ';'
+);
 
 // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
 var trainData = reader.Read(trainDataPath);
@@ -617,9 +609,7 @@ var mlContext = new MLContext();
 
 // Step one: read the data as an IDataView.
 // First, we define the reader: specify the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] {
         new TextLoader.Column("SepalLength", DataKind.R4, 0),
         new TextLoader.Column("SepalWidth", DataKind.R4, 1),
         new TextLoader.Column("PetalLength", DataKind.R4, 2),
@@ -628,8 +618,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
         new TextLoader.Column("Label", DataKind.TX, 4),
     },
     // Default separator is tab, but the dataset has comma.
-    Separator = ","
-});
+    separatorChar: ','
+);
 
 // Retrieve the training data.
 var trainData = reader.Read(irisDataPath);
@@ -910,17 +900,15 @@ You can achieve the same results using the dynamic API.
 var mlContext = new MLContext();
 
 // Define the reader: specify the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] {
         // The four features of the Iris dataset will be grouped together as one Features column.
         new TextLoader.Column("Features", DataKind.R4, 0, 3),
         // Label: kind of iris.
         new TextLoader.Column("Label", DataKind.TX, 4),
     },
     // Default separator is tab, but the dataset has comma.
-    Separator = ","
-});
+    separatorChar: ','
+);
 
 // Read the training data.
 var trainData = reader.Read(dataPath);
@@ -1027,9 +1015,8 @@ You can achieve the same results using the dynamic API.
 var mlContext = new MLContext();
 
 // Define the reader: specify the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] 
+    {
         new TextLoader.Column("Label", DataKind.BL, 0),
         // We will load all the categorical features into one vector column of size 8.
         new TextLoader.Column("CategoricalFeatures", DataKind.TX, 1, 8),
@@ -1038,8 +1025,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
         // Let's also separately load the 'Workclass' column.
         new TextLoader.Column("Workclass", DataKind.TX, 1),
     },
-    HasHeader = true
-});
+    hasHeader: true
+);
 
 // Read the data.
 var data = reader.Read(dataPath);
@@ -1154,14 +1141,13 @@ You can achieve the same results using the dynamic API.
 var mlContext = new MLContext();
 
 // Define the reader: specify the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] 
+    {
         new TextLoader.Column("IsToxic", DataKind.BL, 0),
         new TextLoader.Column("Message", DataKind.TX, 1),
     },
-    HasHeader = true
-});
+    hasHeader: true
+);
 
 // Read the data.
 var data = reader.Read(dataPath);
@@ -1274,9 +1260,8 @@ var mlContext = new MLContext();
 
 // Step one: read the data as an IDataView.
 // First, we define the reader: specify the data columns and where to find them in the text file.
-var reader = mlContext.Data.TextReader(new TextLoader.Arguments
-{
-    Column = new[] {
+var reader = mlContext.Data.TextReader(new[] 
+    {
         // Read each of the iris measurements as a separate float column.
         new TextLoader.Column("SepalLength", DataKind.R4, 0),
         new TextLoader.Column("SepalWidth", DataKind.R4, 1),
@@ -1286,8 +1271,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments
         new TextLoader.Column("Label", DataKind.TX, 4),
     },
     // Default separator is tab, but the dataset has comma.
-    Separator = ","
-});
+    separatorChar: ','
+);
 
 // Read the data.
 var data = reader.Read(dataPath);

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -1033,11 +1033,12 @@ private static Arguments MakeArgs(Column[] columns, bool hasHeader, char[] separ
         /// <param name="env">The environment to use.</param>
         /// <param name="args">Defines the settings of the load operation.</param>
         /// <param name="dataSample">Allows exposing items that can be used for reading.</param>
-        public TextLoader(IHostEnvironment env, Arguments args, IMultiStreamSource dataSample = null)
+        public TextLoader(IHostEnvironment env, Arguments args = null, IMultiStreamSource dataSample = null)
         {
+            args = args ?? new Arguments();
+
             Contracts.CheckValue(env, nameof(env));
             _host = env.Register(RegistrationName);
-
             _host.CheckValue(args, nameof(args));
             _host.CheckValueOrNull(dataSample);
 
@@ -1332,7 +1333,7 @@ public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSo
         /// <param name="env">The environment to use.</param>
         /// <param name="fileSource">Specifies a file from which to read.</param>
         /// <param name="args">Defines the settings of the load operation.</param>
-        public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args)
+        public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args = null)
             => new TextLoader(env, args, fileSource).Read(fileSource);
 
         public void Save(ModelSaveContext ctx)

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
@@ -19,7 +19,7 @@ public static class TextLoaderSaverCatalog
         /// <param name="catalog">The catalog.</param>
         /// <param name="columns">The columns of the schema.</param>
         /// <param name="hasHeader">Whether the file has a header.</param>
-        /// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
+        /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
         /// <param name="dataSample">The optional location of a data sample.</param>
         public static TextLoader TextReader(this DataOperations catalog,
             Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null)
@@ -40,7 +40,7 @@ public static TextLoader TextReader(this DataOperations catalog, Arguments args,
         /// <param name="catalog">The catalog.</param>
         /// <param name="columns">The columns of the schema.</param>
         /// <param name="hasHeader">Whether the file has a header.</param>
-        /// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
+        /// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
         /// <param name="path">The path to the file.</param>
         /// <returns>The data view.</returns>
         public static IDataView ReadFromTextFile(this DataOperations catalog,

diff --git a/src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs b/src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs
@@ -283,7 +283,7 @@ public static IEnumerable<KeyValuePair<ColumnRole, string>> LoadRoleMappingsOrNu
             {
                 // REVIEW: Should really validate the schema here, and consider
                 // ignoring this stream if it isn't as expected.
-                var loader = TextLoader.ReadFile(env, new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile), new TextLoader.Arguments());
+                var loader = TextLoader.ReadFile(env, new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile));
 
                 using (var cursor = loader.GetRowCursor(c => true))
                 {

diff --git a/src/Microsoft.ML.Transforms/TermLookupTransformer.cs b/src/Microsoft.ML.Transforms/TermLookupTransformer.cs
@@ -361,9 +361,6 @@ private static IComponentFactory<IMultiStreamSource, IDataLoader> GetLoaderFacto
             ulong max = ulong.MinValue;
             try
             {
-                var txtArgs = new TextLoader.Arguments();
-                bool parsed = CmdParser.ParseArguments(host, "col=Term:TX:0 col=Value:TX:1", txtArgs);
-                host.Assert(parsed);
                 var data = TextLoader.ReadFile(host, new MultiFileSource(filename), new[]
                     {
                         new TextLoader.Column("Term", DataKind.TX, 0),

diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs
@@ -64,7 +64,7 @@ public void TrainSentiment()
         {
             var env = new MLContext(seed: 1);
             // Pipeline
-            var arguemnts = new TextLoader.Arguments()
+            var arguments = new TextLoader.Arguments()
             {
                 Column = new TextLoader.Column[]
                 {
@@ -86,7 +86,7 @@ public void TrainSentiment()
                 AllowQuoting = false,
                 AllowSparse = false
             };
-            var loader = TextLoader.ReadFile(env, new MultiFileSource(_sentimentDataPath), arguemnts);
+            var loader = TextLoader.ReadFile(env, new MultiFileSource(_sentimentDataPath), arguments);
 
             var text = TextFeaturizingEstimator.Create(env,
                 new TextFeaturizingEstimator.Arguments()

diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs
@@ -606,7 +606,7 @@ public void RankingLightGBMTest()
         public void TestTreeEnsembleCombiner()
         {
             var dataPath = GetDataPath("breast-cancer.txt");
-            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
+            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));
 
             var fastTrees = new IPredictorModel[3];
             for (int i = 0; i < 3; i++)
@@ -628,7 +628,7 @@ public void TestTreeEnsembleCombiner()
         public void TestTreeEnsembleCombinerWithCategoricalSplits()
         {
             var dataPath = GetDataPath("adult.tiny.with-schema.txt");
-            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
+            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));
 
             var cat = new OneHotEncodingEstimator(Env, "Categories", "Features").Fit(dataView).Transform(dataView);
             var fastTrees = new IPredictorModel[3];
@@ -729,7 +729,7 @@ private void CombineAndTestTreeEnsembles(IDataView idv, IPredictorModel[] fastTr
         public void TestEnsembleCombiner()
         {
             var dataPath = GetDataPath("breast-cancer.txt");
-            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
+            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));
 
             var predictors = new IPredictorModel[]
             {
@@ -775,7 +775,7 @@ public void TestEnsembleCombiner()
         public void TestMultiClassEnsembleCombiner()
         {
             var dataPath = GetDataPath("breast-cancer.txt");
-            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
+            var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath));
 
             var predictors = new IPredictorModel[]
             {

diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs
@@ -439,7 +439,7 @@ protected bool SaveLoadText(IDataView view, IHostEnvironment env,
 
             // Note that we don't pass in "args", so the loader falls back to default arguments
             // and we test the auto-schema parsing.
-            var loadedData = TextLoader.ReadFile(env, new MultiFileSource(pathData), new TextLoader.Arguments());
+            var loadedData = TextLoader.ReadFile(env, new MultiFileSource(pathData));
             if (!CheckMetadataTypes(loadedData.Schema))
                 Failed();