Adding documentation about the rest of the classes involved on genera…

…ting the CSharpAPI (#529) * Moving from xml strings to having the documentation details in xml files. For the summary text that is common between several learners, the examples will be added on a separate node. An example of how that will look like is in the LogisticRegressionBinaryClassifier and LogisticRegressionClassifier. * fixing the aftermath of renaming the XML file. * removing the Desc from the EntryPoint attribute is a bad idea. * removing the XML docs from the doc folder, and added them under the respective projects. * Some OS get picky about casing. * file name should be vanilla * Adding documentation for the first group of transforms * adding more documentation. changing the root of the XML documents from docs -> doc, since its only one. Switching all <see href /> to the valid <see cref /> * formatting tweaks, and adressing most of the code comments. * Extracted the examples outside of the member nodes in the xml, so that they only appear in the CSharpApi classes, and not on the runtime classes. * small fixes * addressing code comments * addressing Pete's comments. * Fixing language around the CharTokenizer description. Closes #389
dotnet · Jul 18, 2018 · 839bd6d · 839bd6d
1 parent 0e37508
commit 839bd6d
Show file tree

Hide file tree

Showing 69 changed files with 1,442 additions and 399 deletions.
diff --git a/src/Microsoft.ML.Core/Utilities/ReservoirSampler.cs b/src/Microsoft.ML.Core/Utilities/ReservoirSampler.cs
@@ -47,7 +47,7 @@ public interface IReservoirSampler<T>
     /// This class produces a sample without replacement from a stream of data of type <typeparamref name="T"/>. 
     /// It is instantiated with a delegate that gets the next data point, and builds a reservoir in one pass by calling <see cref="Sample"/> 
     /// for every data point in the stream. In case the next data point does not get 'picked' into the reservoir, the delegate is not invoked.
-    /// Sampling is done according to the algorithm in this paper: <see href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53"/>.
+    /// Sampling is done according to the algorithm in this paper: <a href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53">http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53</a>.
     /// </summary>
     public sealed class ReservoirSamplerWithoutReplacement<T> : IReservoirSampler<T>
     {
@@ -120,7 +120,7 @@ public IEnumerable<T> GetSample()
     /// This class produces a sample with replacement from a stream of data of type <typeparamref name="T"/>. 
     /// It is instantiated with a delegate that gets the next data point, and builds a reservoir in one pass by calling <see cref="Sample"/> 
     /// for every data point in the stream. In case the next data point does not get 'picked' into the reservoir, the delegate is not invoked.
-    /// Sampling is done according to the algorithm in this paper: <see href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53"/>.
+    /// Sampling is done according to the algorithm in this paper: <a href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53">http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53</a>.
     /// </summary>
     public sealed class ReservoirSamplerWithReplacement<T> : IReservoirSampler<T>
     {

diff --git a/src/Microsoft.ML.Data/Evaluators/AucAggregator.cs b/src/Microsoft.ML.Data/Evaluators/AucAggregator.cs
@@ -408,7 +408,7 @@ public UnweightedAuPrcAggregator(IRandom rand, int reservoirSize)
 
             /// <summary>
             /// Compute the AUPRC using the "lower trapesoid" estimator, as described in the paper
-            /// <see href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf"/>.
+            /// <a href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf">http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf</a>.
             /// </summary>
             protected override Double ComputeWeightedAuPrcCore(out Double unweighted)
             {
@@ -482,7 +482,7 @@ public WeightedAuPrcAggregator(IRandom rand, int reservoirSize)
 
             /// <summary>
             /// Compute the AUPRC using the "lower trapesoid" estimator, as described in the paper
-            /// <see href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf"/>.
+            /// <a href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf">http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf</a>.
             /// </summary>
             protected override Double ComputeWeightedAuPrcCore(out Double unweighted)
             {

diff --git a/src/Microsoft.ML.Data/Transforms/NAFilter.cs b/src/Microsoft.ML.Data/Transforms/NAFilter.cs
@@ -26,6 +26,7 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
+    /// <include file='doc.xml' path='doc/members/member[@name="NAFilter"]'/>
     public sealed class NAFilter : FilterBase
     {
         private static class Defaults

diff --git a/src/Microsoft.ML.Data/Transforms/TermTransform.cs b/src/Microsoft.ML.Data/Transforms/TermTransform.cs
@@ -29,14 +29,14 @@
 
 namespace Microsoft.ML.Runtime.Data
 {
-    /// <summary>
-    /// TermTransform builds up term vocabularies (dictionaries).
-    /// Notes:
-    /// * Each column builds/uses exactly one "vocabulary" (dictionary).
-    /// * Output columns are KeyType-valued.
-    /// * The Key value is the one-based index of the item in the dictionary.
-    /// * Not found is assigned the value zero.
-    /// </summary>
+
+    // TermTransform builds up term vocabularies (dictionaries).
+    // Notes:
+    // * Each column builds/uses exactly one "vocabulary" (dictionary).
+    // * Output columns are KeyType-valued.
+    // * The Key value is the one-based index of the item in the dictionary.
+    // * Not found is assigned the value zero.
+    /// <include file='doc.xml' path='doc/members/member[@name="TextToKey"]/*' />
     public sealed partial class TermTransform : OneToOneTransformBase, ITransformTemplate
     {
         public abstract class ColumnBase : OneToOneColumn

diff --git a/src/Microsoft.ML.Data/Transforms/doc.xml b/src/Microsoft.ML.Data/Transforms/doc.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<doc>
+  <members>
+    <member name="NAFilter">
+      <summary>
+        Removes missing values from vector type columns.
+      </summary>
+      <remarks>
+        This transform removes the entire row if any of the input columns have a missing value in that row.
+        This preprocessing is required for many ML algorithms that cannot work with missing values.
+        Useful if any missing entry invalidates the entire row.
+        If the <see cref="Microsoft.ML.Runtime.Data.NAFilter.Defaults.Complement"/> is set to true, this transform would do the exact opposite,
+        it will keep only the rows that have missing values.
+      </remarks>
+      <seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
+    </member>
+    <example name="NAFilter">
+      <example>
+        <code language="csharp">
+          pipeline.Add(new MissingValuesRowDropper(&quot;Column1&quot;));
+        </code>
+      </example>
+    </example>
+
+    <member name="TextToKey">
+      <summary>
+        Converts input values (words, numbers, etc.) to index in a dictionary.
+      </summary>
+      <remarks>
+        The TextToKeyConverter transform builds up term vocabularies (dictionaries).
+        The TextToKey Converter and the <see cref="T:Microsoft.ML.Transforms.HashConverter"/> are the two one primary mechanisms by which raw input is transformed into keys.
+        If multiple columns are used, each column builds/uses exactly one vocabulary.
+        The output columns are KeyType-valued.
+        The Key value is the one-based index of the item in the dictionary.
+        If the key is not found in the dictionary, it is assigned the missing value indicator.
+        This dictionary mapping values to keys is most commonly learnt from the unique values in input data,
+        but can be defined through other means: either with the mapping defined directly on the command line, or as loaded from an external file.
+      </remarks>
+      <seealso cref="T:Microsoft.ML.Transforms.HashConverter"/>
+      <seealso cref="T:Microsoft.ML.Transforms.KeyToTextConverter"/>
+    </member>
+    <example name="TextToKey">
+      <example>
+        <code language="csharp">
+          pipeline.Add(new TextToKeyConverter((&quot;Column&quot;, &quot;OutColumn&quot;))
+          { 
+            Sort = TermTransformSortOrder.Occurrence 
+          });
+        </code>
+      </example>
+    </example>
+
+  </members>
+</doc>
diff --git a/src/Microsoft.ML.FastTree/FastTreeArguments.cs b/src/Microsoft.ML.FastTree/FastTreeArguments.cs
@@ -20,7 +20,7 @@ public interface IFastTreeTrainerFactory : IComponentFactory<ITrainer>
     {
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer
     {
         [TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)]

diff --git a/src/Microsoft.ML.FastTree/FastTreeClassification.cs b/src/Microsoft.ML.FastTree/FastTreeClassification.cs
@@ -100,7 +100,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         public override PredictionKind PredictionKind => PredictionKind.BinaryClassification;
     }
 
-    /// <include file = './doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file = 'doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeBinaryClassificationTrainer :
         BoostingFastTreeTrainerBase<FastTreeBinaryClassificationTrainer.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -342,7 +342,8 @@ public static partial class FastTree
             Desc = FastTreeBinaryClassificationTrainer.Summary,
             UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
             ShortName = FastTreeBinaryClassificationTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeBinaryClassifier""]/*' />" })]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

diff --git a/src/Microsoft.ML.FastTree/FastTreeRanking.cs b/src/Microsoft.ML.FastTree/FastTreeRanking.cs
@@ -38,7 +38,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRankingTrainer : BoostingFastTreeTrainerBase<FastTreeRankingTrainer.Arguments, FastTreeRankingPredictor>,
         IHasLabelGains
     {
@@ -1096,7 +1096,8 @@ public static partial class FastTree
             Desc = FastTreeRankingTrainer.Summary,
             UserName = FastTreeRankingTrainer.UserNameValue,
             ShortName = FastTreeRankingTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeRanker""]/*' />"})]
         public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

diff --git a/src/Microsoft.ML.FastTree/FastTreeRegression.cs b/src/Microsoft.ML.FastTree/FastTreeRegression.cs
@@ -31,7 +31,7 @@
 
 namespace Microsoft.ML.Runtime.FastTree
 {
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
     public sealed partial class FastTreeRegressionTrainer : BoostingFastTreeTrainerBase<FastTreeRegressionTrainer.Arguments, FastTreeRegressionPredictor>
     {
         public const string LoadNameValue = "FastTreeRegression";
@@ -445,7 +445,8 @@ public static partial class FastTree
             Desc = FastTreeRegressionTrainer.Summary,
             UserName = FastTreeRegressionTrainer.UserNameValue,
             ShortName = FastTreeRegressionTrainer.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeRegressor""]/*' />"})]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

diff --git a/src/Microsoft.ML.FastTree/FastTreeTweedie.cs b/src/Microsoft.ML.FastTree/FastTreeTweedie.cs
@@ -30,7 +30,7 @@ namespace Microsoft.ML.Runtime.FastTree
     // The Tweedie boosting model follows the mathematics established in:
     // Yang, Quan, and Zou. "Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models."
     // https://arxiv.org/pdf/1508.06378.pdf
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastTreeTweedieRegression"]/*' />
     public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase<FastTreeTweedieTrainer.Arguments, FastTreeTweediePredictor>
     {
         public const string LoadNameValue = "FastTreeTweedieRegression";
@@ -454,7 +454,7 @@ public static partial class FastTree
             Desc = FastTreeTweedieTrainer.Summary,
             UserName = FastTreeTweedieTrainer.UserNameValue,
             ShortName = FastTreeTweedieTrainer.ShortName,
-            XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
+            XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
         public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

diff --git a/src/Microsoft.ML.FastTree/RandomForestClassification.cs b/src/Microsoft.ML.FastTree/RandomForestClassification.cs
@@ -106,7 +106,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
         }
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestClassification :
         RandomForestTrainerBase<FastForestClassification.Arguments, IPredictorWithFeatureWeights<Float>>
     {
@@ -206,7 +206,8 @@ public static partial class FastForest
             Desc = FastForestClassification.Summary,
             UserName = FastForestClassification.UserNameValue,
             ShortName = FastForestClassification.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastForestBinaryClassifier""]/*' />"})]
         public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

diff --git a/src/Microsoft.ML.FastTree/RandomForestRegression.cs b/src/Microsoft.ML.FastTree/RandomForestRegression.cs
@@ -137,7 +137,7 @@ public ISchemaBindableMapper CreateMapper(Double[] quantiles)
         }
     }
 
-    /// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
+    /// <include file='doc.xml' path='doc/members/member[@name="FastForest"]/*' />
     public sealed partial class FastForestRegression : RandomForestTrainerBase<FastForestRegression.Arguments, FastForestRegressionPredictor>
     {
         public sealed class Arguments : FastForestArgumentsBase
@@ -277,7 +277,8 @@ public static partial class FastForest
             Desc = FastForestRegression.Summary,
             UserName = FastForestRegression.LoadNameValue,
             ShortName = FastForestRegression.ShortName,
-            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />",
+                                 @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastForestRegressor""]/*' />"})]
         public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
         {
             Contracts.CheckValue(env, nameof(env));

diff --git a/src/Microsoft.ML.FastTree/Training/Parallel/IParallelTraining.cs b/src/Microsoft.ML.FastTree/Training/Parallel/IParallelTraining.cs
@@ -33,20 +33,20 @@ public delegate void FindBestThresholdFromRawArrayFun(LeafSplitCandidates leafSp
     /// <summary>
     /// Interface used for parallel training.
     /// Mainly contains three parts:
-    /// 1. interactive with IO: <see href="GetLocalBinConstructionFeatures" />, <see href="SyncGlobalBoundary" />.
+    /// 1. interactive with IO: <see cref="GetLocalBinConstructionFeatures" />, <see cref="SyncGlobalBoundary" />.
     ///    Data will be partitioned by rows in Data parallel and Voting Parallel.
     ///    To speed up the find bin process, it let different workers to find bins for different features.
     ///    Then perform global sync up.
     ///    In Feature parallel, every machines holds all data, so this is unneeded.
-    /// 2. interactive with TreeLearner: <see href="InitIteration" />, <see href="CacheHistogram" />, <see href="IsNeedFindLocalBestSplit" />, 
-    ///        <see href="IsSkipNonSplittableHistogram" />, <see href="FindGlobalBestSplit" />, <see href="GetGlobalDataCountInLeaf" />, <see href="PerformGlobalSplit" />.
+    /// 2. interactive with TreeLearner: <see cref="InitIteration" />, <see cref="CacheHistogram" />, <see cref="IsNeedFindLocalBestSplit" />, 
+    ///        <see cref="IsSkipNonSplittableHistogram" />, <see cref="FindGlobalBestSplit" />, <see cref="GetGlobalDataCountInLeaf" />, <see cref="PerformGlobalSplit" />.
     ///    A full process is:
-    ///        Use <see href="InitIteration" /> to alter local active features.
-    ///        Use <see href="GetGlobalDataCountInLeaf" /> to check smaller leaf and larger leaf.
-    ///        Use <see href="CacheHistogram" />, <see href="IsNeedFindLocalBestSplit" /> and <see href="IsSkipNonSplittableHistogram" /> to interactive with Feature histograms.
-    ///        Use <see href="FindGlobalBestSplit" /> to sync up global best split
-    ///        Use <see href="PerformGlobalSplit" /> to record global num_data in leaves.
-    /// 3. interactive with Application : <see href="GlobalMean" />.
+    ///        Use <see cref="InitIteration" /> to alter local active features.
+    ///        Use <see cref="GetGlobalDataCountInLeaf" /> to check smaller leaf and larger leaf.
+    ///        Use <see cref="CacheHistogram" />, <see cref="IsNeedFindLocalBestSplit" /> and <see cref="IsSkipNonSplittableHistogram" /> to interactive with Feature histograms.
+    ///        Use <see cref="FindGlobalBestSplit" /> to sync up global best split
+    ///        Use <see cref="PerformGlobalSplit" /> to record global num_data in leaves.
+    /// 3. interactive with Application : <see cref="GlobalMean" />.
     ///    Output of leaves is calculated by newton step ( - sum(first_order_gradients) / sum(second_order_gradients)).
     ///    If data is partitioned by row, it needs to a sync up for these sum result.
     ///    So It needs to call this to get the real output of leaves.

diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs
@@ -544,6 +544,7 @@ public ISchemaBoundMapper Bind(IHostEnvironment env, RoleMappedSchema schema)
         }
     }
 
+    /// <include file='doc.xml' path='doc/members/member[@name="TreeEnsembleFeaturizerTransform"]'/>
     public static class TreeEnsembleFeaturizerTransform
     {
         public sealed class Arguments : TrainAndScoreTransform.ArgumentsBase<SignatureTreeEnsembleTrainer>
@@ -802,7 +803,11 @@ private static IDataView AppendLabelTransform(IHostEnvironment env, IChannel ch,
 
     public static partial class TreeFeaturize
     {
-        [TlcModule.EntryPoint(Name = "Transforms.TreeLeafFeaturizer", Desc = TreeEnsembleFeaturizerTransform.TreeEnsembleSummary, UserName = TreeEnsembleFeaturizerTransform.UserName, ShortName = TreeEnsembleFeaturizerBindableMapper.LoadNameShort)]
+        [TlcModule.EntryPoint(Name = "Transforms.TreeLeafFeaturizer", 
+            Desc = TreeEnsembleFeaturizerTransform.TreeEnsembleSummary, 
+            UserName = TreeEnsembleFeaturizerTransform.UserName, 
+            ShortName = TreeEnsembleFeaturizerBindableMapper.LoadNameShort,
+            XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""TreeEnsembleFeaturizerTransform""]'/>" })]
         public static CommonOutputs.TransformOutput Featurizer(IHostEnvironment env, TreeEnsembleFeaturizerTransform.ArgumentsForEntryPoint input)
         {
             Contracts.CheckValue(env, nameof(env));