Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrub text normalizer #2918

Merged
1 commit merged on Mar 12, 2019
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ private sealed class OutPipelineColumn : Scalar<string>
{
public readonly Scalar<string> Input;

public OutPipelineColumn(Scalar<string> input, TextNormalizingEstimator.CaseNormalizationMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
public OutPipelineColumn(Scalar<string> input, TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
: base(new Reconciler(textCase, keepDiacritics, keepPunctuations, keepNumbers), input)
{
Input = input;
Expand All @@ -185,12 +185,12 @@ public OutPipelineColumn(Scalar<string> input, TextNormalizingEstimator.CaseNorm

private sealed class Reconciler : EstimatorReconciler, IEquatable<Reconciler>
{
private readonly TextNormalizingEstimator.CaseNormalizationMode _textCase;
private readonly TextNormalizingEstimator.CaseMode _textCase;
private readonly bool _keepDiacritics;
private readonly bool _keepPunctuations;
private readonly bool _keepNumbers;

public Reconciler(TextNormalizingEstimator.CaseNormalizationMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
public Reconciler(TextNormalizingEstimator.CaseMode textCase, bool keepDiacritics, bool keepPunctuations, bool keepNumbers)
{
_textCase = textCase;
_keepDiacritics = keepDiacritics;
Expand Down Expand Up @@ -227,15 +227,15 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
/// Normalizes input text by changing case, removing diacritical marks, punctuation marks and/or numbers.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="textCase">Casing text using the rules of the invariant culture.</param>
/// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
/// <param name="keepNumbers">Whether to keep numbers or remove them.</param>
public static Scalar<string> NormalizeText(this Scalar<string> input,
TextNormalizingEstimator.CaseNormalizationMode textCase = TextNormalizingEstimator.CaseNormalizationMode.Lower,
TextNormalizingEstimator.CaseMode caseMode = TextNormalizingEstimator.CaseMode.Lower,
bool keepDiacritics = false,
bool keepPunctuations = true,
bool keepNumbers = true) => new OutPipelineColumn(input, textCase, keepDiacritics, keepPunctuations, keepNumbers);
bool keepNumbers = true) => new OutPipelineColumn(input, caseMode, keepDiacritics, keepPunctuations, keepNumbers);
}

/// <summary>
Expand Down
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,19 +83,19 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="textCase">Casing text using the rules of the invariant culture.</param>
/// <param name="caseMode">Casing text using the rules of the invariant culture.</param>
/// <param name="keepDiacritics">Whether to keep diacritical marks or remove them.</param>
/// <param name="keepPunctuations">Whether to keep punctuation marks or remove them.</param>
/// <param name="keepNumbers">Whether to keep numbers or remove them.</param>
public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
TextNormalizingEstimator.CaseNormalizationMode textCase = TextNormalizeDefaults.TextCase,
TextNormalizingEstimator.CaseMode caseMode = TextNormalizeDefaults.Mode,
bool keepDiacritics = TextNormalizeDefaults.KeepDiacritics,
bool keepPunctuations = TextNormalizeDefaults.KeepPunctuations,
bool keepNumbers = TextNormalizeDefaults.KeepNumbers)
=> new TextNormalizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
outputColumnName, inputColumnName, textCase, keepDiacritics, keepPunctuations, keepNumbers);
outputColumnName, inputColumnName, caseMode, keepDiacritics, keepPunctuations, keepNumbers);

/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
/// <param name="catalog">The text-related transform's catalog.</param>
Expand Down
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

namespace Microsoft.ML.Transforms.Text
{
using CaseNormalizationMode = TextNormalizingEstimator.CaseNormalizationMode;
using CaseMode = TextNormalizingEstimator.CaseMode;
// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are counts
// of (word or character) ngrams in a given text. Optionally, it supports ngram hashing, i.e. mapping each ngram token
// string to its integer feature index by hashing.
Expand Down Expand Up @@ -100,7 +100,7 @@ internal sealed class Arguments : TransformInputBase
public bool UsePredefinedStopWordRemover = false;

[Argument(ArgumentType.AtMostOnce, HelpText = "Casing text using the rules of the invariant culture.", ShortName = "case", SortOrder = 5)]
public CaseNormalizationMode TextCase = TextNormalizingEstimator.Defaults.TextCase;
public CaseMode TextCase = TextNormalizingEstimator.Defaults.Mode;

[Argument(ArgumentType.AtMostOnce, HelpText = "Whether to keep diacritical marks or remove them.", ShortName = "diac", SortOrder = 6)]
public bool KeepDiacritics = TextNormalizingEstimator.Defaults.KeepDiacritics;
Expand Down Expand Up @@ -142,7 +142,7 @@ public sealed class Options
/// <summary>
/// Casing used for the text.
/// </summary>
public CaseNormalizationMode TextCase { get; set; } = CaseNormalizationMode.Lower;
public CaseMode TextCase { get; set; } = CaseMode.Lower;
/// <summary>
/// Whether to keep diacritical marks or remove them.
/// </summary>
Expand Down Expand Up @@ -203,7 +203,7 @@ private sealed class TransformApplierParams
public readonly NormFunction VectorNormalizer;
public readonly Language Language;
public readonly bool UsePredefinedStopWordRemover;
public readonly CaseNormalizationMode TextCase;
public readonly CaseMode TextCase;
public readonly bool KeepDiacritics;
public readonly bool KeepPunctuations;
public readonly bool KeepNumbers;
Expand Down Expand Up @@ -241,7 +241,7 @@ public bool NeedsNormalizeTransform
get
{
return
TextCase != CaseNormalizationMode.None ||
TextCase != CaseMode.None ||
!KeepDiacritics ||
!KeepPunctuations ||
!KeepNumbers;
Expand Down Expand Up @@ -275,7 +275,7 @@ public TransformApplierParams(TextFeaturizingEstimator parent)
{
var host = parent._host;
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage));
host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), parent.OptionalSettings.TextCase));
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase));
WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
VectorNormalizer = parent.OptionalSettings.VectorNormalizer;
Expand Down
Loading