Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FeatureHasher #652

Merged
merged 29 commits into from
Oct 2, 2020
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c933914
FeatureHasher
Sep 3, 2020
0801108
tidy
Sep 4, 2020
ac26205
tidying comments
Sep 4, 2020
7e9e8cc
Merge branch 'master' into ml/FeatureHasher
imback82 Sep 8, 2020
3652b55
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
Sep 8, 2020
d6094e3
adding extra test for FeatureBase
Sep 8, 2020
19626f7
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
Sep 8, 2020
ffd0cfc
Trigger Build
Sep 8, 2020
a0356cc
missing file header
Sep 8, 2020
7921e04
comments
Sep 8, 2020
b5fcee2
naming better
Sep 8, 2020
01d40f8
indentation
Sep 8, 2020
89694dc
changes after feedback
Sep 9, 2020
2d8eaa1
test summary
Sep 9, 2020
bad829a
trigger build
Sep 9, 2020
012bd6b
Merge branch 'master' into ml/FeatureHasher
GoEddie Sep 11, 2020
cae76fc
Merge branch 'master' into ml/FeatureHasher
imback82 Sep 12, 2020
105d690
Merge branch 'master' into ml/FeatureHasher
GoEddie Sep 14, 2020
dd75d78
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
Sep 29, 2020
c2926b3
Merge branch 'master' into ml/FeatureHasher
suhsteve Sep 29, 2020
02f06a9
changes after review
Sep 29, 2020
72741fe
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
Sep 29, 2020
ecb9e5f
formatting
GoEddie Sep 29, 2020
b28c1a7
formatting
GoEddie Sep 29, 2020
e94a601
Reverting change
GoEddie Sep 29, 2020
88eec00
Merge branch 'master' into ml/FeatureHasher
GoEddie Oct 1, 2020
752e48d
Merge branch 'master' into ml/FeatureHasher
GoEddie Oct 2, 2020
54d55de
retrigger build
Oct 2, 2020
e2aeab7
Merge branch 'master' into ml/FeatureHasher
imback82 Oct 2, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,24 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class BucketizerTests
public class BucketizerTests : FeatureBaseTests<Bucketizer>
{
private readonly SparkSession _spark;

public BucketizerTests(SparkFixture fixture)
public BucketizerTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}

/// <summary>
/// Create a <see cref="DataFrame"/>, create a <see cref="Bucketizer"/> and test the
/// available methods. Test the FeatureBase methods using <see cref="FeatureBaseTests{T}"/>.
/// </summary>
[Fact]
public void TestBucketizer()
GoEddie marked this conversation as resolved.
Show resolved Hide resolved
{
var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };
var expectedSplits =
new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

string expectedHandle = "skip";
string expectedUid = "uid";
GoEddie marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -60,18 +65,7 @@ public void TestBucketizer()
Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
}

Assert.NotEmpty(bucketizer.ExplainParams());

Param handleInvalidParam = bucketizer.GetParam("handleInvalid");
Assert.NotEmpty(handleInvalidParam.Doc);
Assert.NotEmpty(handleInvalidParam.Name);
Assert.Equal(handleInvalidParam.Parent, bucketizer.Uid());

Assert.NotEmpty(bucketizer.ExplainParam(handleInvalidParam));
bucketizer.Set(handleInvalidParam, "keep");
Assert.Equal("keep", bucketizer.GetHandleInvalid());

Assert.Equal("error", bucketizer.Clear(handleInvalidParam).GetHandleInvalid());
TestFeatureBase(bucketizer, "handleInvalid", "keep");
}

[Fact]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class CountVectorizerModelTests
public class CountVectorizerModelTests : FeatureBaseTests<CountVectorizerModel>
{
private readonly SparkSession _spark;

public CountVectorizerModelTests(SparkFixture fixture)
public CountVectorizerModelTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}

/// <summary>
/// Test that we can create a CountVectorizerModel, pass in a specifc vocabulary to use
/// Test that we can create a CountVectorizerModel, pass in a specific vocabulary to use
/// when creating the model. Verify the standard features methods as well as load/save.
/// </summary>
[Fact]
Expand Down Expand Up @@ -68,6 +68,8 @@ public void TestCountVectorizerModel()
Assert.IsType<int>(countVectorizerModel.GetVocabSize());
Assert.NotEmpty(countVectorizerModel.ExplainParams());
Assert.NotEmpty(countVectorizerModel.ToString());

TestFeatureBase(countVectorizerModel, "minDF", 100);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class CountVectorizerTests
public class CountVectorizerTests : FeatureBaseTests<CountVectorizer>
{
private readonly SparkSession _spark;

public CountVectorizerTests(SparkFixture fixture)
public CountVectorizerTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}
Expand Down Expand Up @@ -67,6 +67,8 @@ public void TestCountVectorizer()

Assert.NotEmpty(countVectorizer.ExplainParams());
Assert.NotEmpty(countVectorizer.ToString());

TestFeatureBase(countVectorizer, "minDF", 0.4);
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.ML.Feature.Param;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
/// <summary>
/// Base class for the ML.Feature E2E tests. Derived test classes call
/// <see cref="TestFeatureBase"/> to exercise the members exposed through
/// <see cref="FeatureBase{T}"/>.
/// </summary>
/// <typeparam name="T">The feature type used to close <see cref="FeatureBase{T}"/>.</typeparam>
public class FeatureBaseTests<T>
{
    private readonly SparkSession _spark;

    /// <summary>
    /// Stores the Spark session supplied by the shared test fixture.
    /// </summary>
    /// <param name="fixture">The fixture that provides the Spark session.</param>
    protected FeatureBaseTests(SparkFixture fixture)
    {
        _spark = fixture.Spark;
    }

    /// <summary>
    /// Tests the common functionality across all ML.Feature classes.
    /// </summary>
    /// <param name="testObject">The object that implemented FeatureBase</param>
    /// <param name="paramName">The name of a parameter that can be set on this object</param>
    /// <param name="paramValue">A parameter value that can be set on this object</param>
    public void TestFeatureBase(
        FeatureBase<T> testObject,
        string paramName,
        object paramValue)
    {
        Assert.NotEmpty(testObject.ExplainParams());

        Param param = testObject.GetParam(paramName);
        Assert.NotEmpty(param.Doc);
        Assert.NotEmpty(param.Name);

        // xUnit expects (expected, actual): the param's parent must be the object's uid.
        Assert.Equal(testObject.Uid(), param.Parent);

        Assert.NotEmpty(testObject.ExplainParam(param));

        // Set is only checked for completing without throwing; the value is not read back here.
        testObject.Set(param, paramValue);
        Assert.IsAssignableFrom<Identifiable>(testObject.Clear(param));

        Assert.IsType<string>(testObject.Uid());
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class FeatureHasherTests : FeatureBaseTests<FeatureHasher>
{
    private readonly SparkSession _spark;

    /// <summary>
    /// Stores the Spark session supplied by the shared test fixture.
    /// </summary>
    /// <param name="fixture">The fixture that provides the Spark session.</param>
    public FeatureHasherTests(SparkFixture fixture) : base(fixture)
    {
        _spark = fixture.Spark;
    }

    /// <summary>
    /// Create a <see cref="DataFrame"/>, create a <see cref="FeatureHasher"/> and test the
    /// available methods. Test the FeatureBase methods using
    /// <see cref="FeatureBaseTests{T}"/>.
    /// </summary>
    [Fact]
    public void TestFeatureHasher()
    {
        // Two rows covering a double, a bool and two string columns.
        DataFrame dataFrame = _spark.CreateDataFrame(
            new List<GenericRow>
            {
                new GenericRow(new object[] { 2.0D, true, "1", "foo" }),
                new GenericRow(new object[] { 3.0D, false, "2", "bar" })
            },
            new StructType(new List<StructField>
            {
                new StructField("real", new DoubleType()),
                new StructField("bool", new BooleanType()),
                new StructField("stringNum", new StringType()),
                new StructField("string", new StringType())
            }));

        FeatureHasher hasher = new FeatureHasher()
            .SetInputCols(new List<string>() { "real", "bool", "stringNum", "string" })
            .SetOutputCol("features")
            .SetCategoricalCols(new List<string>() { "real", "string" })
            .SetNumFeatures(10);

        // Only the runtime types of the results are checked, not their values.
        Assert.IsType<string>(hasher.GetOutputCol());
        Assert.IsType<string[]>(hasher.GetInputCols());
        Assert.IsType<string[]>(hasher.GetCategoricalCols());
        Assert.IsType<int>(hasher.GetNumFeatures());
        Assert.IsType<StructType>(hasher.TransformSchema(dataFrame.Schema()));
        Assert.IsType<DataFrame>(hasher.Transform(dataFrame));

        TestFeatureBase(hasher, "numFeatures", 1000);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class HashingTFTests
public class HashingTFTests : FeatureBaseTests<HashingTF>
{
private readonly SparkSession _spark;

public HashingTFTests(SparkFixture fixture)
public HashingTFTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}
Expand Down Expand Up @@ -57,6 +57,8 @@ public void TestHashingTF()

hashingTf.SetBinary(true);
Assert.True(hashingTf.GetBinary());

TestFeatureBase(hashingTf, "numFeatures", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class IDFModelTests
public class IDFModelTests : FeatureBaseTests<IDFModel>
{
private readonly SparkSession _spark;

public IDFModelTests(SparkFixture fixture)
public IDFModelTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}
Expand Down Expand Up @@ -65,6 +65,8 @@ public void TestIDFModel()
IDFModel loadedModel = IDFModel.Load(modelPath);
Assert.Equal(idfModel.Uid(), loadedModel.Uid());
}

TestFeatureBase(idfModel, "minDocFreq", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class IDFTests
public class IDFTests : FeatureBaseTests<IDF>
{
private readonly SparkSession _spark;

public IDFTests(SparkFixture fixture)
public IDFTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}
Expand Down Expand Up @@ -44,6 +44,8 @@ public void TestIDFModel()
IDF loadedIdf = IDF.Load(savePath);
Assert.Equal(idf.Uid(), loadedIdf.Uid());
}

TestFeatureBase(idf, "minDocFreq", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class TokenizerTests
public class TokenizerTests : FeatureBaseTests<Tokenizer>
{
private readonly SparkSession _spark;

public TokenizerTests(SparkFixture fixture)
public TokenizerTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}
Expand Down Expand Up @@ -50,6 +50,8 @@ public void TestTokenizer()
}

Assert.Equal(expectedUid, tokenizer.Uid());

TestFeatureBase(tokenizer, "inputCol", "input_col");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class Word2VecModelTests
public class Word2VecModelTests : FeatureBaseTests<Word2VecModel>
{
private readonly SparkSession _spark;

public Word2VecModelTests(SparkFixture fixture)
public Word2VecModelTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}
Expand Down Expand Up @@ -47,6 +47,8 @@ public void TestWord2VecModel()
Word2VecModel loadedModel = Word2VecModel.Load(savePath);
Assert.Equal(model.Uid(), loadedModel.Uid());
}

TestFeatureBase(model, "maxIter", 2);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class Word2VecTests
public class Word2VecTests : FeatureBaseTests<Word2Vec>
{
private readonly SparkSession _spark;

public Word2VecTests(SparkFixture fixture)
public Word2VecTests(SparkFixture fixture) : base(fixture)
{
_spark = fixture.Spark;
}
Expand Down Expand Up @@ -67,6 +67,8 @@ public void TestWord2Vec()
Word2Vec loadedWord2Vec = Word2Vec.Load(savePath);
Assert.Equal(word2vec.Uid(), loadedWord2Vec.Uid());
}

TestFeatureBase(word2vec, "maxIter", 2);
}
}
}
Loading