Skip to content

Commit

Permalink
Adding documentation about the rest of the classes involved on genera…
Browse files Browse the repository at this point in the history
…ting the CSharpAPI (#529)

* Moving from xml strings to having the documentation details in xml files.
For the summary text that is common between several learners, the examples will be added on a separate node.
An example of how that will look is in the LogisticRegressionBinaryClassifier and LogisticRegressionClassifier.

* fixing the aftermath of renaming the XML file.

* removing the Desc from the EntryPoint attribute is a bad idea.

* removing the XML docs from the doc folder, and added them under the respective projects.

* Some OS get picky about casing.

* file name should be vanilla

* Adding documentation for the first group of transforms

* adding more documentation.
changing the root of the XML documents from docs -> doc, since it's only one.
Switching all <see href /> to the valid <see cref />

* formatting tweaks, and addressing most of the code comments.

* Extracted the examples outside of the member nodes in the xml, so that they only appear in the CSharpApi classes, and not on the runtime classes.

* small fixes

* addressing code comments

* addressing Pete's comments.

* Fixing language around the CharTokenizer description.

Closes #389
  • Loading branch information
sfilipi authored Jul 18, 2018
1 parent 0e37508 commit 839bd6d
Show file tree
Hide file tree
Showing 69 changed files with 1,442 additions and 399 deletions.
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Core/Utilities/ReservoirSampler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public interface IReservoirSampler<T>
/// This class produces a sample without replacement from a stream of data of type <typeparamref name="T"/>.
/// It is instantiated with a delegate that gets the next data point, and builds a reservoir in one pass by calling <see cref="Sample"/>
/// for every data point in the stream. In case the next data point does not get 'picked' into the reservoir, the delegate is not invoked.
/// Sampling is done according to the algorithm in this paper: <see href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53"/>.
/// Sampling is done according to the algorithm in this paper: <a href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53">http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53</a>.
/// </summary>
public sealed class ReservoirSamplerWithoutReplacement<T> : IReservoirSampler<T>
{
Expand Down Expand Up @@ -120,7 +120,7 @@ public IEnumerable<T> GetSample()
/// This class produces a sample with replacement from a stream of data of type <typeparamref name="T"/>.
/// It is instantiated with a delegate that gets the next data point, and builds a reservoir in one pass by calling <see cref="Sample"/>
/// for every data point in the stream. In case the next data point does not get 'picked' into the reservoir, the delegate is not invoked.
/// Sampling is done according to the algorithm in this paper: <see href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53"/>.
/// Sampling is done according to the algorithm in this paper: <a href="http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53">http://epubs.siam.org/doi/pdf/10.1137/1.9781611972740.53</a>.
/// </summary>
public sealed class ReservoirSamplerWithReplacement<T> : IReservoirSampler<T>
{
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Evaluators/AucAggregator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ public UnweightedAuPrcAggregator(IRandom rand, int reservoirSize)

/// <summary>
/// Compute the AUPRC using the "lower trapezoid" estimator, as described in the paper
/// <see href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf"/>.
/// <a href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf">http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf</a>.
/// </summary>
protected override Double ComputeWeightedAuPrcCore(out Double unweighted)
{
Expand Down Expand Up @@ -482,7 +482,7 @@ public WeightedAuPrcAggregator(IRandom rand, int reservoirSize)

/// <summary>
/// Compute the AUPRC using the "lower trapezoid" estimator, as described in the paper
/// <see href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf"/>.
/// <a href="http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf">http://www.ecmlpkdd2013.org/wp-content/uploads/2013/07/aucpr_2013ecml_corrected.pdf</a>.
/// </summary>
protected override Double ComputeWeightedAuPrcCore(out Double unweighted)
{
Expand Down
1 change: 1 addition & 0 deletions src/Microsoft.ML.Data/Transforms/NAFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

namespace Microsoft.ML.Runtime.Data
{
/// <include file='doc.xml' path='doc/members/member[@name="NAFilter"]'/>
public sealed class NAFilter : FilterBase
{
private static class Defaults
Expand Down
16 changes: 8 additions & 8 deletions src/Microsoft.ML.Data/Transforms/TermTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@

namespace Microsoft.ML.Runtime.Data
{
/// <summary>
/// TermTransform builds up term vocabularies (dictionaries).
/// Notes:
/// * Each column builds/uses exactly one "vocabulary" (dictionary).
/// * Output columns are KeyType-valued.
/// * The Key value is the one-based index of the item in the dictionary.
/// * Not found is assigned the value zero.
/// </summary>

// TermTransform builds up term vocabularies (dictionaries).
// Notes:
// * Each column builds/uses exactly one "vocabulary" (dictionary).
// * Output columns are KeyType-valued.
// * The Key value is the one-based index of the item in the dictionary.
// * Not found is assigned the value zero.
/// <include file='doc.xml' path='doc/members/member[@name="TextToKey"]/*' />
public sealed partial class TermTransform : OneToOneTransformBase, ITransformTemplate
{
public abstract class ColumnBase : OneToOneColumn
Expand Down
54 changes: 54 additions & 0 deletions src/Microsoft.ML.Data/Transforms/doc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<?xml version="1.0" encoding="utf-8" ?>
<doc>
<members>
<member name="NAFilter">
<summary>
Removes missing values from vector type columns.
</summary>
<remarks>
This transform removes the entire row if any of the input columns have a missing value in that row.
This preprocessing is required for many ML algorithms that cannot work with missing values.
Useful if any missing entry invalidates the entire row.
If the <see cref="Microsoft.ML.Runtime.Data.NAFilter.Defaults.Complement"/> is set to true, this transform does the exact opposite:
it keeps only the rows that have missing values.
</remarks>
<seealso cref="Microsoft.ML.Runtime.Data.MetadataUtils.Kinds.HasMissingValues"></seealso>
</member>
<example name="NAFilter">
<example>
<code language="csharp">
pipeline.Add(new MissingValuesRowDropper(&quot;Column1&quot;));
</code>
</example>
</example>

<member name="TextToKey">
<summary>
Converts input values (words, numbers, etc.) to an index in a dictionary.
</summary>
<remarks>
The TextToKeyConverter transform builds up term vocabularies (dictionaries).
The TextToKeyConverter and the <see cref="T:Microsoft.ML.Transforms.HashConverter"/> are the two primary mechanisms by which raw input is transformed into keys.
If multiple columns are used, each column builds/uses exactly one vocabulary.
The output columns are KeyType-valued.
The Key value is the one-based index of the item in the dictionary.
If the key is not found in the dictionary, it is assigned the missing value indicator.
This dictionary mapping values to keys is most commonly learnt from the unique values in input data,
but can be defined through other means: either with the mapping defined directly on the command line, or as loaded from an external file.
</remarks>
<seealso cref="T:Microsoft.ML.Transforms.HashConverter"/>
<seealso cref="T:Microsoft.ML.Transforms.KeyToTextConverter"/>
</member>
<example name="TextToKey">
<example>
<code language="csharp">
pipeline.Add(new TextToKeyConverter((&quot;Column&quot;, &quot;OutColumn&quot;))
{
Sort = TermTransformSortOrder.Occurrence
});
</code>
</example>
</example>

</members>
</doc>
2 changes: 1 addition & 1 deletion src/Microsoft.ML.FastTree/FastTreeArguments.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public interface IFastTreeTrainerFactory : IComponentFactory<ITrainer>
{
}

/// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
/// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
public sealed partial class FastTreeBinaryClassificationTrainer
{
[TlcModule.Component(Name = LoadNameValue, FriendlyName = UserNameValue, Desc = Summary)]
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.FastTree/FastTreeClassification.cs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
public override PredictionKind PredictionKind => PredictionKind.BinaryClassification;
}

/// <include file = './doc.xml' path='docs/members/member[@name="FastTree"]/*' />
/// <include file = 'doc.xml' path='doc/members/member[@name="FastTree"]/*' />
public sealed partial class FastTreeBinaryClassificationTrainer :
BoostingFastTreeTrainerBase<FastTreeBinaryClassificationTrainer.Arguments, IPredictorWithFeatureWeights<Float>>
{
Expand Down Expand Up @@ -342,7 +342,8 @@ public static partial class FastTree
Desc = FastTreeBinaryClassificationTrainer.Summary,
UserName = FastTreeBinaryClassificationTrainer.UserNameValue,
ShortName = FastTreeBinaryClassificationTrainer.ShortName,
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
@"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeBinaryClassifier""]/*' />" })]
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastTreeBinaryClassificationTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.FastTree/FastTreeRanking.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

namespace Microsoft.ML.Runtime.FastTree
{
/// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
/// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
public sealed partial class FastTreeRankingTrainer : BoostingFastTreeTrainerBase<FastTreeRankingTrainer.Arguments, FastTreeRankingPredictor>,
IHasLabelGains
{
Expand Down Expand Up @@ -1096,7 +1096,8 @@ public static partial class FastTree
Desc = FastTreeRankingTrainer.Summary,
UserName = FastTreeRankingTrainer.UserNameValue,
ShortName = FastTreeRankingTrainer.ShortName,
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
@"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeRanker""]/*' />"})]
public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, FastTreeRankingTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.FastTree/FastTreeRegression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

namespace Microsoft.ML.Runtime.FastTree
{
/// <include file='./doc.xml' path='docs/members/member[@name="FastTree"]/*' />
/// <include file='doc.xml' path='doc/members/member[@name="FastTree"]/*' />
public sealed partial class FastTreeRegressionTrainer : BoostingFastTreeTrainerBase<FastTreeRegressionTrainer.Arguments, FastTreeRegressionPredictor>
{
public const string LoadNameValue = "FastTreeRegression";
Expand Down Expand Up @@ -445,7 +445,8 @@ public static partial class FastTree
Desc = FastTreeRegressionTrainer.Summary,
UserName = FastTreeRegressionTrainer.UserNameValue,
ShortName = FastTreeRegressionTrainer.ShortName,
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTree""]/*' />" })]
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTree""]/*' />",
@"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastTreeRegressor""]/*' />"})]
public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastTreeRegressionTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.FastTree/FastTreeTweedie.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace Microsoft.ML.Runtime.FastTree
// The Tweedie boosting model follows the mathematics established in:
// Yang, Quan, and Zou. "Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models."
// https://arxiv.org/pdf/1508.06378.pdf
/// <include file='./doc.xml' path='docs/members/member[@name="FastTreeTweedieRegression"]/*' />
/// <include file='doc.xml' path='doc/members/member[@name="FastTreeTweedieRegression"]/*' />
public sealed partial class FastTreeTweedieTrainer : BoostingFastTreeTrainerBase<FastTreeTweedieTrainer.Arguments, FastTreeTweediePredictor>
{
public const string LoadNameValue = "FastTreeTweedieRegression";
Expand Down Expand Up @@ -454,7 +454,7 @@ public static partial class FastTree
Desc = FastTreeTweedieTrainer.Summary,
UserName = FastTreeTweedieTrainer.UserNameValue,
ShortName = FastTreeTweedieTrainer.ShortName,
XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
XmlInclude = new [] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastTreeTweedieRegression""]/*' />" })]
public static CommonOutputs.RegressionOutput TrainTweedieRegression(IHostEnvironment env, FastTreeTweedieTrainer.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.FastTree/RandomForestClassification.cs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ public static IPredictorProducing<Float> Create(IHostEnvironment env, ModelLoadC
}
}

/// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
/// <include file='doc.xml' path='doc/members/member[@name="FastForest"]/*' />
public sealed partial class FastForestClassification :
RandomForestTrainerBase<FastForestClassification.Arguments, IPredictorWithFeatureWeights<Float>>
{
Expand Down Expand Up @@ -206,7 +206,8 @@ public static partial class FastForest
Desc = FastForestClassification.Summary,
UserName = FastForestClassification.UserNameValue,
ShortName = FastForestClassification.ShortName,
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />",
@"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastForestBinaryClassifier""]/*' />"})]
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, FastForestClassification.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.FastTree/RandomForestRegression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ public ISchemaBindableMapper CreateMapper(Double[] quantiles)
}
}

/// <include file='./doc.xml' path='docs/members/member[@name="FastForest"]/*' />
/// <include file='doc.xml' path='doc/members/member[@name="FastForest"]/*' />
public sealed partial class FastForestRegression : RandomForestTrainerBase<FastForestRegression.Arguments, FastForestRegressionPredictor>
{
public sealed class Arguments : FastForestArgumentsBase
Expand Down Expand Up @@ -277,7 +277,8 @@ public static partial class FastForest
Desc = FastForestRegression.Summary,
UserName = FastForestRegression.LoadNameValue,
ShortName = FastForestRegression.ShortName,
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='docs/members/member[@name=""FastForest""]/*' />" })]
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""FastForest""]/*' />",
@"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/example[@name=""FastForestRegressor""]/*' />"})]
public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, FastForestRegression.Arguments input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
18 changes: 9 additions & 9 deletions src/Microsoft.ML.FastTree/Training/Parallel/IParallelTraining.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,20 @@ public delegate void FindBestThresholdFromRawArrayFun(LeafSplitCandidates leafSp
/// <summary>
/// Interface used for parallel training.
/// Mainly contains three parts:
/// 1. interactive with IO: <see href="GetLocalBinConstructionFeatures" />, <see href="SyncGlobalBoundary" />.
/// 1. interactive with IO: <see cref="GetLocalBinConstructionFeatures" />, <see cref="SyncGlobalBoundary" />.
/// Data will be partitioned by rows in Data parallel and Voting Parallel.
/// To speed up the find bin process, it lets different workers find bins for different features.
/// Then perform global sync up.
/// In Feature parallel, every machine holds all data, so this is unneeded.
/// 2. interactive with TreeLearner: <see href="InitIteration" />, <see href="CacheHistogram" />, <see href="IsNeedFindLocalBestSplit" />,
/// <see href="IsSkipNonSplittableHistogram" />, <see href="FindGlobalBestSplit" />, <see href="GetGlobalDataCountInLeaf" />, <see href="PerformGlobalSplit" />.
/// 2. interactive with TreeLearner: <see cref="InitIteration" />, <see cref="CacheHistogram" />, <see cref="IsNeedFindLocalBestSplit" />,
/// <see cref="IsSkipNonSplittableHistogram" />, <see cref="FindGlobalBestSplit" />, <see cref="GetGlobalDataCountInLeaf" />, <see cref="PerformGlobalSplit" />.
/// A full process is:
/// Use <see href="InitIteration" /> to alter local active features.
/// Use <see href="GetGlobalDataCountInLeaf" /> to check smaller leaf and larger leaf.
/// Use <see href="CacheHistogram" />, <see href="IsNeedFindLocalBestSplit" /> and <see href="IsSkipNonSplittableHistogram" /> to interactive with Feature histograms.
/// Use <see href="FindGlobalBestSplit" /> to sync up global best split
/// Use <see href="PerformGlobalSplit" /> to record global num_data in leaves.
/// 3. interactive with Application : <see href="GlobalMean" />.
/// Use <see cref="InitIteration" /> to alter local active features.
/// Use <see cref="GetGlobalDataCountInLeaf" /> to check smaller leaf and larger leaf.
/// Use <see cref="CacheHistogram" />, <see cref="IsNeedFindLocalBestSplit" /> and <see cref="IsSkipNonSplittableHistogram" /> to interactive with Feature histograms.
/// Use <see cref="FindGlobalBestSplit" /> to sync up global best split
/// Use <see cref="PerformGlobalSplit" /> to record global num_data in leaves.
/// 3. interactive with Application : <see cref="GlobalMean" />.
/// Output of leaves is calculated by newton step ( - sum(first_order_gradients) / sum(second_order_gradients)).
/// If data is partitioned by row, it needs a sync up for these sum results.
/// So it needs to call this to get the real output of leaves.
Expand Down
7 changes: 6 additions & 1 deletion src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,7 @@ public ISchemaBoundMapper Bind(IHostEnvironment env, RoleMappedSchema schema)
}
}

/// <include file='doc.xml' path='doc/members/member[@name="TreeEnsembleFeaturizerTransform"]'/>
public static class TreeEnsembleFeaturizerTransform
{
public sealed class Arguments : TrainAndScoreTransform.ArgumentsBase<SignatureTreeEnsembleTrainer>
Expand Down Expand Up @@ -802,7 +803,11 @@ private static IDataView AppendLabelTransform(IHostEnvironment env, IChannel ch,

public static partial class TreeFeaturize
{
[TlcModule.EntryPoint(Name = "Transforms.TreeLeafFeaturizer", Desc = TreeEnsembleFeaturizerTransform.TreeEnsembleSummary, UserName = TreeEnsembleFeaturizerTransform.UserName, ShortName = TreeEnsembleFeaturizerBindableMapper.LoadNameShort)]
[TlcModule.EntryPoint(Name = "Transforms.TreeLeafFeaturizer",
Desc = TreeEnsembleFeaturizerTransform.TreeEnsembleSummary,
UserName = TreeEnsembleFeaturizerTransform.UserName,
ShortName = TreeEnsembleFeaturizerBindableMapper.LoadNameShort,
XmlInclude = new[] { @"<include file='../Microsoft.ML.FastTree/doc.xml' path='doc/members/member[@name=""TreeEnsembleFeaturizerTransform""]'/>" })]
public static CommonOutputs.TransformOutput Featurizer(IHostEnvironment env, TreeEnsembleFeaturizerTransform.ArgumentsForEntryPoint input)
{
Contracts.CheckValue(env, nameof(env));
Expand Down
Loading

0 comments on commit 839bd6d

Please sign in to comment.