From c97fd19d1c1220517f25f6bb581086449237d98a Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Fri, 17 May 2019 14:23:11 -0700 Subject: [PATCH 1/3] Adding samples for data save and load from text and binary files --- .../DataOperations/SaveAndLoadFromBinary.cs | 97 ++++++++++++++++++ .../DataOperations/SaveAndLoadFromText.cs | 98 +++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs new file mode 100644 index 0000000000..55011177be --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs @@ -0,0 +1,97 @@ +using System; +using System.Collections.Generic; +using System.IO; +using Microsoft.ML; + +namespace Samples.Dynamic +{ + public static class SaveAndLoadFromBinary + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + IEnumerable<DataPoint> dataPoints = GenerateRandomDataPoints(10); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + IDataView data = mlContext.Data.LoadFromEnumerable(dataPoints); + + // Inspect the data before saving to a binary file. + PrintPreviewRows(dataPoints); + + // The rows in the data. 
+ // 0, 0.7262433 + // 1, 0.8173254 + // 0, 0.7680227 + // 1, 0.5581612 + // 0, 0.2060332 + // 1, 0.5588848 + // 0, 0.9060271 + // 1, 0.4421779 + // 0, 0.9775497 + // 1, 0.2737045 + + // Create a FileStream object and write the IDataView to it as a binary IDV file. + using (FileStream stream = new FileStream("data.idv", FileMode.Create)) + { + mlContext.Data.SaveAsBinary(data, stream); + } + + // Create an IDataView object by loading the binary IDV file. + IDataView loadedData = mlContext.Data.LoadFromBinary("data.idv"); + + // Inspect the data that is loaded from the previously saved binary file. + var loadedDataEnumerable = mlContext.Data.CreateEnumerable<DataPoint>(loadedData, reuseRowObject: false); + PrintPreviewRows(loadedDataEnumerable); + + // The rows in the data. + // 0, 0.7262433 + // 1, 0.8173254 + // 0, 0.7680227 + // 1, 0.5581612 + // 0, 0.2060332 + // 1, 0.5588848 + // 0, 0.9060271 + // 1, 0.4421779 + // 0, 0.9775497 + // 1, 0.2737045 + + File.Delete("data.idv"); + } + + private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + yield return new DataPoint + { + Label = i % 2, + + // Create random features that are correlated with label. + Features = (float)random.NextDouble() + }; + } + } + + // Example with label and feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + + public float Features { get; set; } + } + + // Print helper. 
+ private static void PrintPreviewRows(IEnumerable<DataPoint> data) + { + Console.WriteLine($"The rows in the data."); + foreach (var row in data) + Console.WriteLine($"{row.Label}, {row.Features}"); + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs new file mode 100644 index 0000000000..8ba46e0497 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs @@ -0,0 +1,98 @@ +using System; +using System.Collections.Generic; +using System.IO; +using Microsoft.ML; + +namespace Samples.Dynamic +{ + public static class SaveAndLoadFromText + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + IEnumerable<DataPoint> dataPoints = GenerateRandomDataPoints(10); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + IDataView data = mlContext.Data.LoadFromEnumerable(dataPoints); + + // Inspect the data before saving to a binary file. + PrintPreviewRows(dataPoints); + + // The rows in the data. + // 0, 0.7262433 + // 1, 0.8173254 + // 0, 0.7680227 + // 1, 0.5581612 + // 0, 0.2060332 + // 1, 0.5588848 + // 0, 0.9060271 + // 1, 0.4421779 + // 0, 0.9775497 + // 1, 0.2737045 + + // Create a FileStream object and write the IDataView to it as a binary IDV file. + using (FileStream stream = new FileStream("data.tsv", FileMode.Create)) + { + mlContext.Data.SaveAsText(data, stream); + } + + // Create an IDataView object by loading the binary IDV file. 
+ IDataView loadedData = mlContext.Data.LoadFromTextFile("data.tsv"); + + // Inspect the data that is loaded from the previously saved binary file. + var loadedDataEnumerable = mlContext.Data.CreateEnumerable<DataPoint>(loadedData, reuseRowObject: false); + PrintPreviewRows(loadedDataEnumerable); + + // The rows in the data. + // 0, 0.7262433 + // 1, 0.8173254 + // 0, 0.7680227 + // 1, 0.5581612 + // 0, 0.2060332 + // 1, 0.5588848 + // 0, 0.9060271 + // 1, 0.4421779 + // 0, 0.9775497 + // 1, 0.2737045 + + File.Delete("data.tsv"); + } + + private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + yield return new DataPoint + { + Label = i % 2, + + // Create random features that are correlated with label. + Features = (float)random.NextDouble() + }; + } + } + + // Example with label and feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + + public float Features { get; set; } + } + + // Print helper. 
+ private static void PrintPreviewRows(IEnumerable<DataPoint> data) + { + Console.WriteLine($"The rows in the data."); + foreach (var row in data) + Console.WriteLine($"{row.Label}, {row.Features}"); + } + } +} + From d26a846b93a7fec1a5d67b9651c5a522c6e24dfa Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Fri, 17 May 2019 17:36:10 -0700 Subject: [PATCH 2/3] PR comments --- .../DataOperations/SaveAndLoadFromBinary.cs | 73 +++++-------------- .../DataOperations/SaveAndLoadFromText.cs | 71 ++++-------------- .../Binary/BinaryLoaderSaverCatalog.cs | 14 ++++ .../Text/TextLoaderSaverCatalog.cs | 14 ++++ 4 files changed, 61 insertions(+), 111 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs index 55011177be..9606d9eec6 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs @@ -3,7 +3,7 @@ using System.IO; using Microsoft.ML; -namespace Samples.Dynamic +namespace Microsoft.ML.Samples.Dynamic { public static class SaveAndLoadFromBinary { @@ -15,67 +15,36 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training data points. - IEnumerable<DataPoint> dataPoints = GenerateRandomDataPoints(10); + var dataPoints = new List<DataPoint>() + { + new DataPoint(){ Label = 0, Features = 4}, + new DataPoint(){ Label = 0, Features = 5}, + new DataPoint(){ Label = 0, Features = 6}, + new DataPoint(){ Label = 1, Features = 8}, + new DataPoint(){ Label = 1, Features = 9}, + }; // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. IDataView data = mlContext.Data.LoadFromEnumerable(dataPoints); - // Inspect the data before saving to a binary file. - PrintPreviewRows(dataPoints); - - // The rows in the data. 
- // 0, 0.7262433 - // 1, 0.8173254 - // 0, 0.7680227 - // 1, 0.5581612 - // 0, 0.2060332 - // 1, 0.5588848 - // 0, 0.9060271 - // 1, 0.4421779 - // 0, 0.9775497 - // 1, 0.2737045 - // Create a FileStream object and write the IDataView to it as a binary IDV file. using (FileStream stream = new FileStream("data.idv", FileMode.Create)) - { mlContext.Data.SaveAsBinary(data, stream); - } // Create an IDataView object by loading the binary IDV file. IDataView loadedData = mlContext.Data.LoadFromBinary("data.idv"); // Inspect the data that is loaded from the previously saved binary file. var loadedDataEnumerable = mlContext.Data.CreateEnumerable<DataPoint>(loadedData, reuseRowObject: false); - PrintPreviewRows(loadedDataEnumerable); - - // The rows in the data. - // 0, 0.7262433 - // 1, 0.8173254 - // 0, 0.7680227 - // 1, 0.5581612 - // 0, 0.2060332 - // 1, 0.5588848 - // 0, 0.9060271 - // 1, 0.4421779 - // 0, 0.9775497 - // 1, 0.2737045 - - File.Delete("data.idv"); - } - - private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0) - { - var random = new Random(seed); - for (int i = 0; i < count; i++) - { - yield return new DataPoint - { - Label = i % 2, + foreach (DataPoint row in loadedDataEnumerable) + Console.WriteLine($"{row.Label}, {row.Features}"); - // Create random features that are correlated with label. - Features = (float)random.NextDouble() - }; - } + // Preview of the loaded data. + // 0, 4 + // 0, 5 + // 0, 6 + // 1, 8 + // 1, 9 } // Example with label and feature values. A data set is a collection of such examples. @@ -85,13 +54,5 @@ private class DataPoint public float Features { get; set; } } - - // Print helper. 
- private static void PrintPreviewRows(IEnumerable<DataPoint> data) - { - Console.WriteLine($"The rows in the data."); - foreach (var row in data) - Console.WriteLine($"{row.Label}, {row.Features}"); - } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs index 8ba46e0497..032e904777 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs @@ -15,67 +15,36 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training data points. - IEnumerable<DataPoint> dataPoints = GenerateRandomDataPoints(10); + var dataPoints = new List<DataPoint>() + { + new DataPoint(){ Label = 0, Features = 4}, + new DataPoint(){ Label = 0, Features = 5}, + new DataPoint(){ Label = 0, Features = 6}, + new DataPoint(){ Label = 1, Features = 8}, + new DataPoint(){ Label = 1, Features = 9}, + }; // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. IDataView data = mlContext.Data.LoadFromEnumerable(dataPoints); - // Inspect the data before saving to a binary file. - PrintPreviewRows(dataPoints); - - // The rows in the data. - // 0, 0.7262433 - // 1, 0.8173254 - // 0, 0.7680227 - // 1, 0.5581612 - // 0, 0.2060332 - // 1, 0.5588848 - // 0, 0.9060271 - // 1, 0.4421779 - // 0, 0.9775497 - // 1, 0.2737045 - // Create a FileStream object and write the IDataView to it as a binary IDV file. using (FileStream stream = new FileStream("data.tsv", FileMode.Create)) - { mlContext.Data.SaveAsText(data, stream); - } // Create an IDataView object by loading the binary IDV file. IDataView loadedData = mlContext.Data.LoadFromTextFile("data.tsv"); // Inspect the data that is loaded from the previously saved binary file. 
var loadedDataEnumerable = mlContext.Data.CreateEnumerable<DataPoint>(loadedData, reuseRowObject: false); - PrintPreviewRows(loadedDataEnumerable); - - // The rows in the data. - // 0, 0.7262433 - // 1, 0.8173254 - // 0, 0.7680227 - // 1, 0.5581612 - // 0, 0.2060332 - // 1, 0.5588848 - // 0, 0.9060271 - // 1, 0.4421779 - // 0, 0.9775497 - // 1, 0.2737045 - - File.Delete("data.tsv"); - } - - private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0) - { - var random = new Random(seed); - for (int i = 0; i < count; i++) - { - yield return new DataPoint - { - Label = i % 2, + foreach (DataPoint row in loadedDataEnumerable) + Console.WriteLine($"{row.Label}, {row.Features}"); - // Create random features that are correlated with label. - Features = (float)random.NextDouble() - }; - } + // Preview of the loaded data. + // 0, 4 + // 0, 5 + // 0, 6 + // 1, 8 + // 1, 9 } // Example with label and feature values. A data set is a collection of such examples. @@ -85,14 +54,6 @@ private class DataPoint public float Features { get; set; } } - - // Print helper. - private static void PrintPreviewRows(IEnumerable<DataPoint> data) - { - Console.WriteLine($"The rows in the data."); - foreach (var row in data) - Console.WriteLine($"{row.Label}, {row.Features}"); - } } } diff --git a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoaderSaverCatalog.cs index 91045b23e5..b4a48bd45a 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoaderSaverCatalog.cs @@ -37,6 +37,13 @@ public static IDataView LoadFromBinary(this DataOperationsCatalog catalog, IMult /// /// The catalog. /// The path to the file to load from. 
+ /// + /// + /// + /// + /// public static IDataView LoadFromBinary(this DataOperationsCatalog catalog, string path) { Contracts.CheckNonEmpty(path, nameof(path)); @@ -54,6 +61,13 @@ public static IDataView LoadFromBinary(this DataOperationsCatalog catalog, strin /// The data view to save. /// The stream to write to. /// Whether to keep hidden columns in the dataset. + /// + /// + /// + /// + /// public static void SaveAsBinary(this DataOperationsCatalog catalog, IDataView data, Stream stream, bool keepHidden = false) { diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 6297529c6f..e510c6df29 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -164,6 +164,13 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog cata /// The catalog. /// Specifies a file from which to load. /// Defines the settings of the load operation. + /// + /// + /// + /// + /// public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Options options = null) { @@ -186,6 +193,13 @@ public static IDataView LoadFromTextFile(this DataOperationsCatalog catalog, str /// Whether to write the header comment with the schema. /// Whether to keep hidden columns in the dataset. /// Whether to save columns in dense format even if they are sparse vectors. 
+ /// + /// + /// + /// + /// public static void SaveAsText(this DataOperationsCatalog catalog, IDataView data, Stream stream, From 979f09c181bb3981f2ad95e819adba7056338264 Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Mon, 20 May 2019 16:06:39 -0700 Subject: [PATCH 3/3] nits --- .../Dynamic/DataOperations/SaveAndLoadFromBinary.cs | 2 +- .../Dynamic/DataOperations/SaveAndLoadFromText.cs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs index 9606d9eec6..b6448ec857 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromBinary.cs @@ -3,7 +3,7 @@ using System.IO; using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic +namespace Samples.Dynamic { public static class SaveAndLoadFromBinary { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs index 032e904777..9918b736ce 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/SaveAndLoadFromText.cs @@ -27,14 +27,14 @@ public static void Example() // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. IDataView data = mlContext.Data.LoadFromEnumerable(dataPoints); - // Create a FileStream object and write the IDataView to it as a binary IDV file. + // Create a FileStream object and write the IDataView to it as a text file. using (FileStream stream = new FileStream("data.tsv", FileMode.Create)) mlContext.Data.SaveAsText(data, stream); - // Create an IDataView object by loading the binary IDV file. 
+ // Create an IDataView object by loading the text file. IDataView loadedData = mlContext.Data.LoadFromTextFile("data.tsv"); - // Inspect the data that is loaded from the previously saved binary file. + // Inspect the data that is loaded from the previously saved text file. var loadedDataEnumerable = mlContext.Data.CreateEnumerable<DataPoint>(loadedData, reuseRowObject: false); foreach (DataPoint row in loadedDataEnumerable) Console.WriteLine($"{row.Label}, {row.Features}");