Fixed the TextTransform bug where chargrams were being computed differently when used with/without the word tokenizer. #548

Merged: 8 commits, Jul 26, 2018
78 changes: 74 additions & 4 deletions src/Microsoft.ML.Transforms/Text/CharTokenizeTransform.cs
@@ -64,12 +64,16 @@ public sealed class Arguments : TransformInputBase
public const string LoaderSignature = "CharToken";
public const string UserName = "Character Tokenizer Transform";

// Keep track of the model that was saved with ver:0x00010001
private readonly bool _isSeparatorStartEnd;

private static VersionInfo GetVersionInfo()
{
return new VersionInfo(
modelSignature: "CHARTOKN",
verWrittenCur: 0x00010001, // Initial
@TomFinley (Contributor) commented on Jul 21, 2018, on this line:

> verWrittenCur: 0x00010001, // Initial

See other examples. You will see that we keep the version strings from old versions commented out, to track them and so we know why we bumped the version each time. This is important: we have live code in our deserializers that is meant to handle those version bumps, and for that reason we like to have documentation on exactly what changed, so that when we have tests on this or that version in our deserializers, we know why they changed. #Resolved

verReadableCur: 0x00010001,
//verWrittenCur: 0x00010001, // Initial
verWrittenCur: 0x00010002, // Updated to use UnitSeparator <US> character instead of using <ETX><STX> for vector inputs.
verReadableCur: 0x00010002,
verWeCanReadBack: 0x00010001,
loaderSignature: LoaderSignature);
}
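
For illustration only, here is a minimal sketch of how the convention TomFinley describes tends to look once several bumps have accumulated. The 0x00010003 entry is purely hypothetical and is shown only to make the pattern of commented-out history explicit:

```csharp
// Sketch, not shipped code: each superseded version stays behind as a comment
// with a one-line reason, so the deserializer's version checks stay explainable.
private static VersionInfo GetVersionInfo()
{
    return new VersionInfo(
        modelSignature: "CHARTOKN",
        //verWrittenCur: 0x00010001, // Initial
        //verWrittenCur: 0x00010002, // Use UnitSeparator <US> instead of <ETX><STX> for vector inputs
        verWrittenCur: 0x00010003,   // (hypothetical) next change, with its own one-line reason
        verReadableCur: 0x00010003,
        verWeCanReadBack: 0x00010001,
        loaderSignature: LoaderSignature);
}
```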
@@ -84,6 +88,7 @@ private static VersionInfo GetVersionInfo()
private volatile string _keyValuesStr;
private volatile int[] _keyValuesBoundaries;

private const ushort UnitSeparator = 0x1f;
private const ushort TextStartMarker = 0x02;
private const ushort TextEndMarker = 0x03;
private const int TextMarkersCount = 2;
@@ -102,6 +107,7 @@ public CharTokenizeTransform(IHostEnvironment env, Arguments args, IDataView inp

_type = GetOutputColumnType();
SetMetadata();
_isSeparatorStartEnd = false;
@TomFinley (Contributor) commented on Jul 25, 2018:
FYI there is no need for this. The default value of bool is false. #Resolved

}

private static ColumnType GetOutputColumnType()
@@ -120,6 +126,8 @@ private CharTokenizeTransform(IHost host, ModelLoadContext ctx, IDataView input)
// byte: _useMarkerChars value.
_useMarkerChars = ctx.Reader.ReadBoolByte();

_isSeparatorStartEnd = ctx.Header.ModelVerReadable < 0x00010002 || ctx.Reader.ReadBoolByte();

_type = GetOutputColumnType();
SetMetadata();
}
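
Spelled out, the short-circuit on the `_isSeparatorStartEnd` line above is equivalent to the following sketch (same `ctx` members as in this constructor; illustrative, not the shipped code). Models written before version 0x00010002 never serialized the extra byte, so they must keep the old per-slot start/end-marker layout; newer models carry the flag explicitly:

```csharp
// Sketch only: explicit form of the backward-compatible load.
bool isSeparatorStartEnd;
if (ctx.Header.ModelVerReadable < 0x00010002)
{
    // Old model (ver 0x00010001): the byte was never written, and old models
    // always used the <STX>/<ETX>-per-slot layout, so force the flag to true.
    isSeparatorStartEnd = true;
}
else
{
    // New model (ver 0x00010002 and later): read the byte written by Save().
    isSeparatorStartEnd = ctx.Reader.ReadBoolByte();
}
```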
@@ -145,6 +153,7 @@ public override void Save(ModelSaveContext ctx)
// byte: _useMarkerChars value.
SaveBase(ctx);
ctx.Writer.WriteBoolByte(_useMarkerChars);
ctx.Writer.WriteBoolByte(_isSeparatorStartEnd);
}

protected override ColumnType GetColumnTypeCore(int iinfo)
@@ -399,8 +408,8 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo)

var getSrc = GetSrcGetter<VBuffer<DvText>>(input, iinfo);
var src = default(VBuffer<DvText>);
return
(ref VBuffer<ushort> dst) =>

ValueGetter<VBuffer<ushort>> getterWithStartEndSep = (ref VBuffer<ushort> dst) =>
{
getSrc(ref src);

Expand Down Expand Up @@ -438,6 +447,67 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo)

dst = new VBuffer<ushort>(len, values, dst.Indices);
};

ValueGetter<VBuffer<ushort>> getterWithUnitSep = (ref VBuffer<ushort> dst) =>
{
getSrc(ref src);

int len = 0;

for (int i = 0; i < src.Count; i++)
{
if (src.Values[i].HasChars)
{
len += src.Values[i].Length;

if (i > 0)
len += 1; // account for the UnitSeparator character that will be inserted between slots
}
}

if (_useMarkerChars)
len += TextMarkersCount;

var values = dst.Values;
if (len > 0)
{
if (Utils.Size(values) < len)
values = new ushort[len];

int index = 0;

// VBuffer<DvText> can be a result of either concatenating text columns together
// or application of word tokenizer before char tokenizer in TextTransform.
//
// Considering VBuffer<DvText> as a single text stream.
// Therefore, prepend and append start and end markers only once i.e. at the start and at end of vector.
// Insert UnitSeparator after every piece of text in the vector.
if (_useMarkerChars)
values[index++] = TextStartMarker;

for (int i = 0; i < src.Count; i++)
{
if (!src.Values[i].HasChars)
continue;

if (i > 0)
values[index++] = UnitSeparator;

for (int ich = 0; ich < src.Values[i].Length; ich++)
{
values[index++] = src.Values[i][ich];
}
}

if (_useMarkerChars)
values[index++] = TextEndMarker;

Contracts.Assert(index == len);
}

dst = new VBuffer<ushort>(len, values, dst.Indices);
};
return _isSeparatorStartEnd ? getterWithStartEndSep : getterWithUnitSep;
}
}
}
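
To make the behavioral difference concrete, here is a small sketch (not ML.NET code; the old layout is inferred from the version note above) of the ushort streams the two getters produce for a two-slot input { "ab", "cd" } with marker characters enabled:

```csharp
// Illustrative layouts only; values are the code points the getters emit.
const ushort Stx = 0x02, Etx = 0x03, Us = 0x1f;

// Old layout (ver 0x00010001, getterWithStartEndSep): each slot wrapped in its
// own <STX>...<ETX> pair, so the word-tokenized path yields extra marker pairs.
ushort[] oldLayout = { Stx, (ushort)'a', (ushort)'b', Etx, Stx, (ushort)'c', (ushort)'d', Etx };

// New layout (ver 0x00010002, getterWithUnitSep): one <STX>...<ETX> around the
// whole vector and a single <US> between slots.
ushort[] newLayout = { Stx, (ushort)'a', (ushort)'b', Us, (ushort)'c', (ushort)'d', Etx };
```

Because the new getter treats the whole vector as one text stream, chargrams computed after a word tokenizer now line up with chargrams computed directly on the text column.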
54 changes: 25 additions & 29 deletions src/Microsoft.ML.Transforms/Text/TextTransform.cs
@@ -262,6 +262,30 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
view = new ConcatTransform(h, new ConcatTransform.Arguments() { Column = xfCols }, view);
}

if (tparams.NeedsNormalizeTransform)
@zeahmed (Contributor, author) commented on Jul 19, 2018:

@justinormont, I moved TextNormalizer above WordTokenizer. Let me know if there are any adverse effects. #Resolved

@justinormont (Contributor) commented on Jul 20, 2018:

I don't know if there are adverse effects of running TextNormalizer before WordTokenizer. I expect the output is different, but assume not adversely so. For instance (depending on the specifics of the tokenizer), "I don't" can be split into {I, do, n't}, but if punctuation is removed first, it may be split into {I, dont}.

If you make me a build, I'll run a manual regression test. Unfortunately we don't currently have nightly regression tests running that would tell us if the text datasets decreased in their core metrics. #Resolved

@TomFinley (Contributor) replied:

I don't like this change. It's far from clear to me that someone who asks to remove stopwords means for the content of those stopwords to be retained in the chargrams. At the very least this ought to be a configurable option.

@zeahmed (Contributor, author) commented on Jul 21, 2018:

@TomFinley, can you elaborate on this? Your point is not clear to me. Don't you think it is good to have TextNormalizer applied before WordTokenizer?

@TomFinley (Contributor) commented on Jul 25, 2018:

Sorry, I thought you were saying something else and got confused for some reason. #Resolved

{
var xfCols = new TextNormalizerCol[textCols.Length];
string[] dstCols = new string[textCols.Length];
for (int i = 0; i < textCols.Length; i++)
{
dstCols[i] = GenerateColumnName(view.Schema, textCols[i], "TextNormalizer");
tempCols.Add(dstCols[i]);
xfCols[i] = new TextNormalizerCol() { Source = textCols[i], Name = dstCols[i] };
}

view = new TextNormalizerTransform(h,
new TextNormalizerArgs()
{
Column = xfCols,
KeepDiacritics = tparams.KeepDiacritics,
KeepNumbers = tparams.KeepNumbers,
KeepPunctuations = tparams.KeepPunctuations,
TextCase = tparams.TextCase
}, view);

textCols = dstCols;
}
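
As a toy illustration of the ordering question discussed in the thread above (hypothetical code, not ML.NET; `char.IsPunctuation` and `Split` stand in for the real normalizer and tokenizer):

```csharp
using System;
using System.Linq;

class TokenizeOrderDemo
{
    static void Main()
    {
        string text = "I don't";

        // Tokenize first, splitting on whitespace and apostrophes as a crude
        // stand-in for a punctuation-aware word tokenizer:
        string[] tokenizeFirst = text.Split(' ', '\'');

        // Normalize first (strip punctuation), then tokenize on whitespace:
        string normalized = new string(text.Where(c => !char.IsPunctuation(c)).ToArray());
        string[] normalizeFirst = normalized.Split(' ');

        Console.WriteLine(string.Join(" | ", tokenizeFirst));   // I | don | t
        Console.WriteLine(string.Join(" | ", normalizeFirst));  // I | dont
    }
}
```

Whether that difference hurts the downstream metrics is exactly what the manual regression run mentioned above would check.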

if (tparams.NeedsWordTokenizationTransform)
{
var xfCols = new DelimitedTokenizeTransform.Column[textCols.Length];
@@ -281,34 +305,6 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
view = new DelimitedTokenizeTransform(h, new DelimitedTokenizeTransform.Arguments() { Column = xfCols }, view);
}

if (tparams.NeedsNormalizeTransform)
{
string[] srcCols = wordTokCols == null ? textCols : wordTokCols;
var xfCols = new TextNormalizerCol[srcCols.Length];
string[] dstCols = new string[srcCols.Length];
for (int i = 0; i < srcCols.Length; i++)
{
dstCols[i] = GenerateColumnName(view.Schema, srcCols[i], "TextNormalizer");
tempCols.Add(dstCols[i]);
xfCols[i] = new TextNormalizerCol() { Source = srcCols[i], Name = dstCols[i] };
}

view = new TextNormalizerTransform(h,
new TextNormalizerArgs()
{
Column = xfCols,
KeepDiacritics = tparams.KeepDiacritics,
KeepNumbers = tparams.KeepNumbers,
KeepPunctuations = tparams.KeepPunctuations,
TextCase = tparams.TextCase
}, view);

if (wordTokCols != null)
wordTokCols = dstCols;
else
textCols = dstCols;
}

if (tparams.NeedsRemoveStopwordsTransform)
{
Contracts.Assert(wordTokCols != null, "StopWords transform requires that word tokenization has been applied to the input text.");
@@ -360,7 +356,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV
if (tparams.CharExtractorFactory != null)
{
{
var srcCols = wordTokCols ?? textCols;
var srcCols = tparams.NeedsRemoveStopwordsTransform ? wordTokCols : textCols;
charTokCols = new string[srcCols.Length];
var xfCols = new CharTokenizeTransform.Column[srcCols.Length];
for (int i = 0; i < srcCols.Length; i++)
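
Putting the TextTransform.cs changes together, the pipeline now runs the text normalizer first, then the word tokenizer, then stopword removal, and finally the char tokenizer. The routing fix for the char-level extractor amounts to the following sketch (helper name is hypothetical):

```csharp
// Only route chargram extraction through the word-tokenized columns when stopword
// removal actually needs them; otherwise read the normalized text columns directly,
// which keeps chargrams identical with and without the word tokenizer.
static string[] PickCharTokenizerSources(bool needsRemoveStopwords, string[] wordTokCols, string[] textCols)
{
    return needsRemoveStopwords ? wordTokCols : textCols;
}
```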
2 changes: 1 addition & 1 deletion test/Microsoft.ML.Predictor.Tests/TestPipelineSweeper.cs
@@ -308,7 +308,7 @@ public void PipelineSweeperRoles()
var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue;
var testAuc = bestPipeline.PerformanceSummary.MetricValue;
Assert.True((0.94 < trainAuc) && (trainAuc < 0.95));
Assert.True((0.83 < testAuc) && (testAuc < 0.84));
Assert.True((0.815 < testAuc) && (testAuc < 0.825));

var results = runner.GetOutput<IDataView>("ResultsOut");
Assert.NotNull(results);