From ba609f50751b1ce2d1e5b618d9e43d5d132e844e Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Thu, 21 May 2020 12:47:45 -0700 Subject: [PATCH 01/27] Update M.D.A to 0.4.0 (#528) --- src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 643e1130c..f3d3f1ffd 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -27,7 +27,7 @@ - + From 08d203a3179919be314a0df1bdd4fad5d1f0b37d Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 26 May 2020 19:45:47 -0700 Subject: [PATCH 02/27] [DOC] Adding guides to explain UDF serialization and Broadcast variable usage (#464) --- docs/broadcast-guide.md | 92 +++++++++++++++++++++ docs/udf-guide.md | 171 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 docs/broadcast-guide.md create mode 100644 docs/udf-guide.md diff --git a/docs/broadcast-guide.md b/docs/broadcast-guide.md new file mode 100644 index 000000000..c3026516b --- /dev/null +++ b/docs/broadcast-guide.md @@ -0,0 +1,92 @@ +# Guide to using Broadcast Variables + +This is a guide to show how to use broadcast variables in .NET for Apache Spark. + +## What are Broadcast Variables + +[Broadcast variables in Apache Spark](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) are a mechanism for sharing variables across executors that are meant to be read-only. They allow the programmer to keep a read-only variable cached on each machine rather than shipping a copy of it with tasks. They can be used, for example, to give every node a copy of a large input dataset in an efficient manner. + +### How to use broadcast variables in .NET for Apache Spark + +Broadcast variables are created from a variable `v` by calling `SparkContext.Broadcast(v)`. The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the `Value()` method. + +Example: + +```csharp +string v = "Variable to be broadcasted"; +Broadcast bv = SparkContext.Broadcast(v); + +// Using the broadcast variable in a UDF: +Func udf = Udf( + str => $"{str}: {bv.Value()}"); +``` + +The type parameter for `Broadcast` should be the type of the variable being broadcasted. + +### Deleting broadcast variables + +The broadcast variable can be deleted from all executors by calling the `Destroy()` method on it. + +```csharp +// Destroying the broadcast variable bv: +bv.Destroy(); +``` + +> Note: `Destroy()` deletes all data and metadata related to the broadcast variable. Use this with caution - once a broadcast variable has been destroyed, it cannot be used again. + +#### Caveat of using Destroy + +One important thing to keep in mind while using broadcast variables in UDFs is to limit the scope of the variable to only the UDF that is referencing it. The [guide to using UDFs](udf-guide.md) describes this phenomenon in detail. This is especially crucial when calling `Destroy` on the broadcast variable. If the broadcast variable that has been destroyed is visible to or accessible from other UDFs, it gets picked up for serialization by all those UDFs, even if it is not being referenced by them. This will throw an error as .NET for Apache Spark is not able to serialize the destroyed broadcast variable. 
+ +Example to demonstrate: + +```csharp +string v = "Variable to be broadcasted"; +Broadcast bv = SparkContext.Broadcast(v); + +// Using the broadcast variable in a UDF: +Func udf1 = Udf( + str => $"{str}: {bv.Value()}"); + +// Destroying bv +bv.Destroy(); + +// Calling udf1 after destroying bv throws the following expected exception: +// org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed +df.Select(udf1(df["_1"])).Show(); + +// Different UDF udf2 that is not referencing bv +Func udf2 = Udf( + str => $"{str}: not referencing broadcast variable"); + +// Calling udf2 throws the following (unexpected) exception: +// [Error] [JvmBridge] org.apache.spark.SparkException: Task not serializable +df.Select(udf2(df["_1"])).Show(); +``` + +The recommended way of implementing above desired behavior: + +```csharp +string v = "Variable to be broadcasted"; +// Restricting the visibility of bv to only the UDF referencing it +{ + Broadcast bv = SparkContext.Broadcast(v); + + // Using the broadcast variable in a UDF: + Func udf1 = Udf( + str => $"{str}: {bv.Value()}"); + + // Destroying bv + bv.Destroy(); +} + +// Different UDF udf2 that is not referencing bv +Func udf2 = Udf( + str => $"{str}: not referencing broadcast variable"); + +// Calling udf2 works fine as expected +df.Select(udf2(df["_1"])).Show(); +``` + This ensures that destroying `bv` doesn't affect calling `udf2` because of unexpected serialization behavior. + + Broadcast variables are useful for transmitting read-only data to all executors, as the data is sent only once and this can give performance benefits when compared with using local variables that get shipped to the executors with each task. Please refer to the [official documentation](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) to get a deeper understanding of broadcast variables and why they are used. \ No newline at end of file diff --git a/docs/udf-guide.md b/docs/udf-guide.md new file mode 100644 index 000000000..6a2905bf4 --- /dev/null +++ b/docs/udf-guide.md @@ -0,0 +1,171 @@ +# Guide to User-Defined Functions (UDFs) + +This is a guide to show how to use UDFs in .NET for Apache Spark. + +## What are UDFs + +[User-Defined Functions (UDFs)](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/expressions/UserDefinedFunction.html) are a feature of Spark that allow developers to use custom functions to extend the system's built-in functionality. They transform values from a single row within a table to produce a single corresponding output value per row based on the logic defined in the UDF. + +Let's take the following as an example for a UDF definition: + +```csharp +string s1 = "hello"; +Func udf = Udf( + str => $"{s1} {str}"); + +``` +The above defined UDF takes a `string` as an input (in the form of a [Column](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Column.cs#L14) of a [Dataframe](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/DataFrame.cs#L24)), and returns a `string` with `hello` appended in front of the input. 
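To try the UDF out, we also need a `DataFrame` to apply it to. One possible way to build the sample `df` used in the rest of this guide is to load Spark's bundled `people.json` example file and keep only the `name` column. This is just a sketch: the file path and the `SparkSession` variable `spark` are assumptions, not part of the original example.

```csharp
// Sketch: assumes an existing SparkSession named `spark` and Spark's
// bundled example data file at the path below.
DataFrame df = spark
    .Read()
    .Json("examples/src/main/resources/people.json")
    .Select("name");
```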
+ +For a sample Dataframe, let's take the following Dataframe `df`: + +```text ++-------+ +| name| ++-------+ +|Michael| +| Andy| +| Justin| ++-------+ +``` + +Now let's apply the above defined `udf` to the dataframe `df`: + +```csharp +DataFrame udfResult = df.Select(udf(df["name"])); +``` + +This would return the below as the Dataframe `udfResult`: + +```text ++-------------+ +| name| ++-------------+ +|hello Michael| +| hello Andy| +| hello Justin| ++-------------+ +``` +To get a better understanding of how to implement UDFs, please take a look at the [UDF helper functions](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Functions.cs#L3616) and some [test examples](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs#L49). + +## UDF serialization + +Since UDFs are functions that need to be executed on the workers, they have to be serialized and sent to the workers as part of the payload from the driver. This involves serializing the [delegate](https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/delegates/) which is a reference to the method, along with its [target](https://docs.microsoft.com/en-us/dotnet/api/system.delegate.target?view=netframework-4.8) which is the class instance on which the current delegate invokes the instance method. Please take a look at this [code](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Utils/CommandSerDe.cs#L149) to get a better understanding of how UDF serialization is being done. + +## Good to know while implementing UDFs + +One behavior to be aware of while implementing UDFs in .NET for Apache Spark is how the target of the UDF gets serialized. .NET for Apache Spark uses .NET Core, which does not support serializing delegates, so it is instead done by using reflection to serialize the target where the delegate is defined. When multiple delegates are defined in a common scope, they have a shared closure that becomes the target of reflection for serialization. Let's take an example to illustrate what that means. + +The following code snippet defines two string variables that are being referenced in two function delegates that return the respective strings as result: + +```csharp +using System; + +public class C { + public void M() { + string s1 = "s1"; + string s2 = "s2"; + Func a = str => s1; + Func b = str => s2; + } +} +``` + +The above C# code generates the following C# disassembly (credit source: [sharplab.io](https://sharplab.io)) code from the compiler: + +```csharp +public class C +{ + [CompilerGenerated] + private sealed class <>c__DisplayClass0_0 + { + public string s1; + + public string s2; + + internal string b__0(string str) + { + return s1; + } + + internal string b__1(string str) + { + return s2; + } + } + + public void M() + { + <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0(); + <>c__DisplayClass0_.s1 = "s1"; + <>c__DisplayClass0_.s2 = "s2"; + Func func = new Func(<>c__DisplayClass0_.b__0); + Func func2 = new Func(<>c__DisplayClass0_.b__1); + } +} +``` +As can be seen in the above decompiled code, both `func` and `func2` share the same closure `<>c__DisplayClass0_0`, which is the target that is serialized when serializing the delegates `func` and `func2`. Hence, even though `Func a` is only referencing `s1`, `s2` also gets serialized when sending over the bytes to the workers. 
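This shared closure is easy to observe outside of Spark. The following standalone sketch (an illustration added here, not part of the compiler output above) checks that both delegates share the same compiler-generated target and lists the captured fields that target carries:

```csharp
using System;
using System.Reflection;

public class ClosureInspection
{
    public static void Main()
    {
        string s1 = "s1";
        string s2 = "s2";
        Func<string, string> a = str => s1;
        Func<string, string> b = str => s2;

        // Both delegates point at the same compiler-generated closure instance.
        Console.WriteLine(ReferenceEquals(a.Target, b.Target)); // True

        // That single closure instance carries every captured variable,
        // s1 and s2 alike, so serializing a.Target also serializes s2.
        foreach (FieldInfo field in a.Target.GetType().GetFields())
        {
            Console.WriteLine($"{field.Name} = {field.GetValue(a.Target)}");
        }
    }
}
```

Running it prints `True` followed by both `s1` and `s2`, which is exactly why serializing the target of either delegate drags both captured variables along.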
+ +This can lead to some unexpected behaviors at runtime (like in the case of using [broadcast variables](broadcast-guide.md)), which is why we recommend restricting the visibility of the variables used in a function to that function's scope. + +Going back to the above example, the following is the recommended way to implement the desired behavior of previous code snippet: + +```csharp +using System; + +public class C { + public void M() { + { + string s1 = "s1"; + Func a = str => s1; + } + { + string s2 = "s2"; + Func b = str => s2; + } + } +} +``` + +The above C# code generates the following C# disassembly (credit source: [sharplab.io](https://sharplab.io)) code from the compiler: + +```csharp +public class C +{ + [CompilerGenerated] + private sealed class <>c__DisplayClass0_0 + { + public string s1; + + internal string b__0(string str) + { + return s1; + } + } + + [CompilerGenerated] + private sealed class <>c__DisplayClass0_1 + { + public string s2; + + internal string b__1(string str) + { + return s2; + } + } + + public void M() + { + <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0(); + <>c__DisplayClass0_.s1 = "s1"; + Func func = new Func(<>c__DisplayClass0_.b__0); + <>c__DisplayClass0_1 <>c__DisplayClass0_2 = new <>c__DisplayClass0_1(); + <>c__DisplayClass0_2.s2 = "s2"; + Func func2 = new Func(<>c__DisplayClass0_2.b__1); + } +} +``` + +Here we see that `func` and `func2` no longer share a closure and have their own separate closures `<>c__DisplayClass0_0` and `<>c__DisplayClass0_1` respectively. When used as the target for serialization, nothing other than the referenced variables will get serialized for the delegate. + +This behavior is important to keep in mind while implementing multiple UDFs in a common scope. +To learn more about UDFs in general, please review the following articles that explain UDFs and how to use them: [UDFs in databricks(scala)](https://docs.databricks.com/spark/latest/spark-sql/udf-scala.html), [Spark UDFs and some gotchas](https://medium.com/@achilleus/spark-udfs-we-can-use-them-but-should-we-use-them-2c5a561fde6d). 
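As a closing illustration, here is one way the scoping advice above could look in a complete program. This is a minimal sketch rather than part of the original guide; the `SparkSession` setup and the input file path are assumptions.

```csharp
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

class ScopedUdfExample
{
    static void Main()
    {
        SparkSession spark = SparkSession
            .Builder()
            .AppName("scoped-udf-example")
            .GetOrCreate();

        // Hypothetical input; any DataFrame with a string "name" column works here.
        DataFrame df = spark.Read().Json("examples/src/main/resources/people.json");

        Func<Column, Column> udf1;
        {
            // Keep the captured variable in the same narrow scope as the UDF that
            // references it, so nothing else is pulled into its serialized closure.
            string greeting = "hello";
            udf1 = Udf<string, string>(name => $"{greeting} {name}");
        }

        // A UDF defined outside that block does not share a closure with udf1.
        Func<Column, Column> udf2 = Udf<string, string>(name => name.ToUpper());

        df.Select(udf1(df["name"]), udf2(df["name"])).Show();

        spark.Stop();
    }
}
```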
\ No newline at end of file From ce2317774c25550ea3784a1af4f02befd2eea70b Mon Sep 17 00:00:00 2001 From: Steve Suh Date: Thu, 28 May 2020 19:29:27 -0700 Subject: [PATCH 03/27] Resolve nuget dependencies for UDFs defined in dotnet-interactive (#515) --- .../DeltaTableTests.cs | 1 + .../IpcTests/BroadcastTests.cs | 4 +- .../IpcTests/ML/Feature/BucketizerTests.cs | 2 +- .../IpcTests/ML/Feature/HashingTFTests.cs | 5 +- .../IpcTests/ML/Feature/IDFModelTests.cs | 2 +- .../IpcTests/ML/Feature/IDFTests.cs | 2 +- .../IpcTests/ML/Feature/TokenizerTests.cs | 2 +- .../IpcTests/ML/Feature/Word2VecModelTests.cs | 3 +- .../IpcTests/ML/Feature/Word2VecTests.cs | 2 +- .../IpcTests/SparkContextTests.cs | 2 +- .../IpcTests/Sql/DataFrameTests.cs | 2 +- .../IpcTests/Sql/DataFrameWriterTests.cs | 2 +- .../Sql/Streaming/DataStreamWriterTests.cs | 1 + .../Microsoft.Spark.E2ETest.csproj | 1 + .../Microsoft.Spark.E2ETest/SparkFixture.cs | 2 +- .../AssemblyLoaderTests.cs | 5 +- .../CollectionUtilsTests.cs | 26 +++ .../CommandSerDeTests.cs | 1 + .../DependencyProviderUtilsTests.cs | 154 ++++++++++++++++++ .../Microsoft.Spark.UnitTest/SparkFixture.cs | 109 +++++++++++++ .../Sql/ColumnTests.cs | 65 +------- .../TestUtils}/TemporaryDirectory.cs | 126 +++++++------- .../Microsoft.Spark.UnitTest/UdfSerDeTests.cs | 1 + .../DaemonWorkerTests.cs | 1 + .../DependencyProviderTests.cs | 64 ++++++++ .../Microsoft.Spark.Worker.UnitTest.csproj | 6 + .../PayloadProcessorTests.cs | 1 + .../Microsoft.Spark.Worker.csproj | 4 + .../Processor/PayloadProcessor.cs | 30 +--- .../Utils/AssemblyLoaderHelper.cs | 93 +++++++++++ .../Utils/DependencyProvider.cs | 87 ++++++++++ .../Interop/Ipc/IJvmBridgeFactory.cs | 11 ++ .../Interop/Ipc/JvmBridgeFactory.cs | 14 ++ .../Interop/SparkEnvironment.cs | 25 ++- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- src/csharp/Microsoft.Spark/SparkFiles.cs | 39 ++++- .../Microsoft.Spark/Utils/AssemblyLoader.cs | 50 ++++-- .../Microsoft.Spark/Utils/CollectionUtils.cs | 18 ++ .../Utils/DependencyProviderUtils.cs | 99 +++++++++++ src/csharp/Microsoft.Spark/Utils/UdfSerDe.cs | 19 ++- src/csharp/Microsoft.Spark/Utils/UdfUtils.cs | 23 +-- 41 files changed, 898 insertions(+), 208 deletions(-) create mode 100644 src/csharp/Microsoft.Spark.UnitTest/CollectionUtilsTests.cs create mode 100644 src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs create mode 100644 src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs rename src/csharp/{Microsoft.Spark.E2ETest/Utils => Microsoft.Spark.UnitTest/TestUtils}/TemporaryDirectory.cs (87%) create mode 100644 src/csharp/Microsoft.Spark.Worker.UnitTest/DependencyProviderTests.cs create mode 100644 src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs create mode 100644 src/csharp/Microsoft.Spark.Worker/Utils/DependencyProvider.cs create mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/IJvmBridgeFactory.cs create mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridgeFactory.cs create mode 100644 src/csharp/Microsoft.Spark/Utils/CollectionUtils.cs create mode 100644 src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaTableTests.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaTableTests.cs index 69249d8c5..fab7c74dc 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaTableTests.cs +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaTableTests.cs @@ -11,6 +11,7 @@ using 
Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Streaming; using Microsoft.Spark.Sql.Types; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.Extensions.Delta.E2ETest diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 000c8f27e..511f5a122 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -1,10 +1,8 @@ using System; -using System.Collections.Generic; using System.Linq; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; -using static Microsoft.Spark.Sql.Functions; using Xunit; +using static Microsoft.Spark.Sql.Functions; namespace Microsoft.Spark.E2ETest.IpcTests { diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 11037bc6d..a075334de 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -4,9 +4,9 @@ using System.Collections.Generic; using System.IO; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 7b6882bea..df459ed7a 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -2,13 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; -using System.Collections.Generic; using System.IO; -using System.Linq; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 623b7322c..202187809 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -3,9 +3,9 @@ // See the LICENSE file in the project root for more information. using System.IO; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs index 3dea63de7..72da97887 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs @@ -3,9 +3,9 @@ // See the LICENSE file in the project root for more information. 
using System.IO; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 8cdb4e03a..4b1998f50 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -3,9 +3,9 @@ // See the LICENSE file in the project root for more information. using System.IO; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs index 4845e011a..a5227149b 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs @@ -2,11 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; using System.IO; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs index 30e14ed28..1d5da5335 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs @@ -3,9 +3,9 @@ // See the LICENSE file in the project root for more information. using System.IO; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/SparkContextTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/SparkContextTests.cs index 07fbf2372..ca752570a 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/SparkContextTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/SparkContextTests.cs @@ -3,7 +3,7 @@ // See the LICENSE file in the project root for more information. using System; -using Microsoft.Spark.E2ETest.Utils; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs index 7359bdb6b..46e899a87 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs @@ -3,13 +3,13 @@ // See the LICENSE file in the project root for more information. 
using System; -using System.Collections.Generic; using System.Linq; using Apache.Arrow; using Microsoft.Data.Analysis; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Types; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; using static Microsoft.Spark.Sql.Functions; using static Microsoft.Spark.UnitTest.TestUtils.ArrowTestUtils; diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterTests.cs index a7e214160..4f0d06742 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameWriterTests.cs @@ -3,8 +3,8 @@ // See the LICENSE file in the project root for more information. using System.Collections.Generic; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs index 4e87dc6c6..15c2a22a7 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs @@ -10,6 +10,7 @@ using Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Streaming; using Microsoft.Spark.Sql.Types; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; using static Microsoft.Spark.Sql.Functions; diff --git a/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj b/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj index abe436ec9..e03519853 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj +++ b/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj @@ -23,6 +23,7 @@ + diff --git a/src/csharp/Microsoft.Spark.E2ETest/SparkFixture.cs b/src/csharp/Microsoft.Spark.E2ETest/SparkFixture.cs index fc8272c5b..6d8dadbac 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/SparkFixture.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/SparkFixture.cs @@ -7,9 +7,9 @@ using System.IO; using System.Reflection; using System.Runtime.InteropServices; -using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; using Xunit; namespace Microsoft.Spark.E2ETest diff --git a/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs b/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs index da7d05197..f2f0dd30e 100644 --- a/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs @@ -9,17 +9,19 @@ namespace Microsoft.Spark.UnitTest { + [Collection("Spark Unit Tests")] public class AssemblyLoaderTests { [Fact] public void TestAssemblySearchPathResolver() { + string sparkFilesDir = SparkFiles.GetRootDirectory(); string curDir = Directory.GetCurrentDirectory(); string appDir = AppDomain.CurrentDomain.BaseDirectory; // Test the default scenario. string[] searchPaths = AssemblySearchPathResolver.GetAssemblySearchPaths(); - Assert.Equal(new[] { curDir, appDir }, searchPaths); + Assert.Equal(new[] { sparkFilesDir, curDir, appDir }, searchPaths); // Test the case where DOTNET_ASSEMBLY_SEARCH_PATHS is defined. 
char sep = Path.PathSeparator; @@ -34,6 +36,7 @@ public void TestAssemblySearchPathResolver() "mydir2", Path.Combine(curDir, $".{sep}mydir3"), Path.Combine(curDir, $".{sep}mydir4"), + sparkFilesDir, curDir, appDir }, searchPaths); diff --git a/src/csharp/Microsoft.Spark.UnitTest/CollectionUtilsTests.cs b/src/csharp/Microsoft.Spark.UnitTest/CollectionUtilsTests.cs new file mode 100644 index 000000000..9a723b2b5 --- /dev/null +++ b/src/csharp/Microsoft.Spark.UnitTest/CollectionUtilsTests.cs @@ -0,0 +1,26 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Spark.Utils; +using Xunit; + +namespace Microsoft.Spark.UnitTest +{ + public class CollectionUtilsTests + { + [Fact] + public void TestArrayEquals() + { + Assert.False(CollectionUtils.ArrayEquals(new int[] { 1 }, null)); + Assert.False(CollectionUtils.ArrayEquals(null, new int[] { 1 })); + Assert.False(CollectionUtils.ArrayEquals(new int[] { }, new int[] { 1 })); + Assert.False(CollectionUtils.ArrayEquals(new int[] { 1 }, new int[] { })); + Assert.False(CollectionUtils.ArrayEquals(new int[] { 1 }, new int[] { 1, 2 })); + Assert.False(CollectionUtils.ArrayEquals(new int[] { 1 }, new int[] { 2 })); + + Assert.True(CollectionUtils.ArrayEquals(null, null)); + Assert.True(CollectionUtils.ArrayEquals(new int[] { 1 }, new int[] { 1 })); + } + } +} diff --git a/src/csharp/Microsoft.Spark.UnitTest/CommandSerDeTests.cs b/src/csharp/Microsoft.Spark.UnitTest/CommandSerDeTests.cs index 557bdcc72..820d7dea0 100644 --- a/src/csharp/Microsoft.Spark.UnitTest/CommandSerDeTests.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/CommandSerDeTests.cs @@ -14,6 +14,7 @@ namespace Microsoft.Spark.UnitTest { + [Collection("Spark Unit Tests")] public class CommandSerDeTests { [Fact] diff --git a/src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs b/src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs new file mode 100644 index 000000000..ccfc4890b --- /dev/null +++ b/src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs @@ -0,0 +1,154 @@ +using System.IO; +using System.Linq; +using Microsoft.Spark.UnitTest.TestUtils; +using Microsoft.Spark.Utils; +using Xunit; + +namespace Microsoft.Spark.UnitTest +{ + public class DependencyProviderUtilsTests + { + [Fact] + public void TestNuGetMetadataEquals() + { + string expectedFileName = "package.name.1.0.0.nupkg"; + string expectedPackageName = "package.name"; + string expectedPackageVersion = "1.0.0"; + + var nugetMetadata = new DependencyProviderUtils.NuGetMetadata + { + FileName = expectedFileName, + PackageName = expectedPackageName, + PackageVersion = expectedPackageVersion + }; + + Assert.False(nugetMetadata.Equals(null)); + Assert.False(nugetMetadata.Equals(new DependencyProviderUtils.NuGetMetadata())); + Assert.False(nugetMetadata.Equals(new DependencyProviderUtils.NuGetMetadata + { + FileName = "", + PackageName = expectedPackageName, + PackageVersion = expectedPackageVersion + })); + Assert.False(nugetMetadata.Equals(new DependencyProviderUtils.NuGetMetadata + { + FileName = expectedFileName, + PackageName = "", + PackageVersion = expectedPackageVersion + })); + Assert.False(nugetMetadata.Equals(new DependencyProviderUtils.NuGetMetadata + { + FileName = expectedFileName, + PackageName = expectedPackageName, + PackageVersion = "" + })); + + Assert.True(nugetMetadata.Equals(new 
DependencyProviderUtils.NuGetMetadata + { + FileName = expectedFileName, + PackageName = expectedPackageName, + PackageVersion = expectedPackageVersion + })); + } + + [Fact] + public void TestMetadataEquals() + { + string expectedAssemblyProbingPath = "/assembly/probe/path"; + string expectedNativeProbingPath = "/native/probe/path"; + var expectedNugetMetadata = new DependencyProviderUtils.NuGetMetadata + { + FileName = "package.name.1.0.0.nupkg", + PackageName = "package.name", + PackageVersion = "1.0.0" + }; + + var metadata = new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] { expectedAssemblyProbingPath }, + NativeProbingPaths = new string[] { expectedNativeProbingPath }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] { expectedNugetMetadata } + }; + + Assert.False(metadata.Equals(null)); + Assert.False(metadata.Equals(new DependencyProviderUtils.Metadata())); + Assert.False(metadata.Equals(new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] { expectedAssemblyProbingPath }, + NativeProbingPaths = new string[] { expectedNativeProbingPath, "" }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] { expectedNugetMetadata } + })); + Assert.False(metadata.Equals(new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] { expectedAssemblyProbingPath }, + NativeProbingPaths = new string[] { expectedNativeProbingPath }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] { expectedNugetMetadata, null } + })); + Assert.False(metadata.Equals(new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] { expectedAssemblyProbingPath, "" }, + NativeProbingPaths = new string[] { expectedNativeProbingPath }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] { expectedNugetMetadata } + })); + + Assert.True(metadata.Equals(new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] { expectedAssemblyProbingPath }, + NativeProbingPaths = new string[] { expectedNativeProbingPath }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] { expectedNugetMetadata } + })); + } + + [Fact] + public void TestMetadataSerDe() + { + using var tempDir = new TemporaryDirectory(); + var metadata = new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] { "/assembly/probe/path" }, + NativeProbingPaths = new string[] { "/native/probe/path" }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] + { + new DependencyProviderUtils.NuGetMetadata + { + FileName = "package.name.1.0.0.nupkg", + PackageName = "package.name", + PackageVersion = "1.0.0" + } + } + }; + + string serializedFilePath = Path.Combine(tempDir.Path, "serializedMetadata"); + metadata.Serialize(serializedFilePath); + + DependencyProviderUtils.Metadata deserializedMetadata = + DependencyProviderUtils.Metadata.Deserialize(serializedFilePath); + + Assert.True(metadata.Equals(deserializedMetadata)); + } + + [Fact] + public void TestFileNames() + { + using var tempDir = new TemporaryDirectory(); + foreach (ulong num in Enumerable.Range(0, 3).Select(x => System.Math.Pow(10, x))) + { + string filePath = + Path.Combine(tempDir.Path, DependencyProviderUtils.CreateFileName(num)); + File.Create(filePath).Dispose(); + } + + var expectedFiles = new string[] + { + "dependencyProviderMetadata_00000000000000000001", + "dependencyProviderMetadata_00000000000000000010", + "dependencyProviderMetadata_00000000000000000100", + }; + IOrderedEnumerable actualFiles = DependencyProviderUtils + .GetMetadataFiles(tempDir.Path) + 
.Select(f => Path.GetFileName(f)) + .OrderBy(s => s); + Assert.True(expectedFiles.SequenceEqual(actualFiles)); + } + } +} diff --git a/src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs b/src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs new file mode 100644 index 000000000..02f2c8b3b --- /dev/null +++ b/src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs @@ -0,0 +1,109 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Moq; +using Xunit; + +namespace Microsoft.Spark.UnitTest +{ + public sealed class SparkFixture : IDisposable + { + internal Mock MockJvm { get; private set; } + + public SparkFixture() + { + SetupBasicMockJvm(); + + // Unit tests may contain calls that hit the AssemblyLoader. + // One of the AssemblyLoader assembly search paths is populated + // using SparkFiles. Unless we are running in an E2E scenario and + // on the Worker, SparkFiles will attempt to call the JVM. Because + // this is a (non E2E) Unit test, it is necessary to mock this call. + SetupSparkFiles(); + + var mockJvmBridgeFactory = new Mock(); + mockJvmBridgeFactory + .Setup(m => m.Create(It.IsAny())) + .Returns(MockJvm.Object); + + SparkEnvironment.JvmBridgeFactory = mockJvmBridgeFactory.Object; + } + + public void Dispose() + { + } + + private void SetupBasicMockJvm() + { + MockJvm = new Mock(); + + MockJvm + .Setup(m => m.CallStaticJavaMethod( + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns( + new JvmObjectReference("result", MockJvm.Object)); + MockJvm + .Setup(m => m.CallStaticJavaMethod( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns( + new JvmObjectReference("result", MockJvm.Object)); + MockJvm + .Setup(m => m.CallStaticJavaMethod( + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns( + new JvmObjectReference("result", MockJvm.Object)); + + MockJvm + .Setup(m => m.CallNonStaticJavaMethod( + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns( + new JvmObjectReference("result", MockJvm.Object)); + MockJvm + .Setup(m => m.CallNonStaticJavaMethod( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns( + new JvmObjectReference("result", MockJvm.Object)); + MockJvm + .Setup(m => m.CallNonStaticJavaMethod( + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns( + new JvmObjectReference("result", MockJvm.Object)); + } + + private void SetupSparkFiles() + { + MockJvm + .Setup(m => m.CallStaticJavaMethod( + "org.apache.spark.SparkFiles", + "getRootDirectory")) + .Returns("SparkFilesRootDirectory"); + } + } + + [CollectionDefinition("Spark Unit Tests")] + public class SparkCollection : ICollectionFixture + { + // This class has no code, and is never created. Its purpose is simply + // to be the place to apply [CollectionDefinition] and all the + // ICollectionFixture<> interfaces. 
+ } +} diff --git a/src/csharp/Microsoft.Spark.UnitTest/Sql/ColumnTests.cs b/src/csharp/Microsoft.Spark.UnitTest/Sql/ColumnTests.cs index f88d53800..adffd9312 100644 --- a/src/csharp/Microsoft.Spark.UnitTest/Sql/ColumnTests.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/Sql/ColumnTests.cs @@ -12,71 +12,12 @@ namespace Microsoft.Spark.UnitTest { - public class ColumnTestsFixture : IDisposable - { - internal Mock MockJvm { get; } - - public ColumnTestsFixture() - { - MockJvm = new Mock(); - - MockJvm - .Setup(m => m.CallStaticJavaMethod( - It.IsAny(), - It.IsAny(), - It.IsAny())) - .Returns( - new JvmObjectReference("result", MockJvm.Object)); - MockJvm - .Setup(m => m.CallStaticJavaMethod( - It.IsAny(), - It.IsAny(), - It.IsAny(), - It.IsAny())) - .Returns( - new JvmObjectReference("result", MockJvm.Object)); - MockJvm - .Setup(m => m.CallStaticJavaMethod( - It.IsAny(), - It.IsAny(), - It.IsAny())) - .Returns( - new JvmObjectReference("result", MockJvm.Object)); - - MockJvm - .Setup(m => m.CallNonStaticJavaMethod( - It.IsAny(), - It.IsAny(), - It.IsAny())) - .Returns( - new JvmObjectReference("result", MockJvm.Object)); - MockJvm - .Setup(m => m.CallNonStaticJavaMethod( - It.IsAny(), - It.IsAny(), - It.IsAny(), - It.IsAny())) - .Returns( - new JvmObjectReference("result", MockJvm.Object)); - MockJvm - .Setup(m => m.CallNonStaticJavaMethod( - It.IsAny(), - It.IsAny(), - It.IsAny())) - .Returns( - new JvmObjectReference("result", MockJvm.Object)); - } - - public void Dispose() - { - } - } - - public class ColumnTests : IClassFixture + [Collection("Spark Unit Tests")] + public class ColumnTests { private readonly Mock _mockJvm; - public ColumnTests(ColumnTestsFixture fixture) + public ColumnTests(SparkFixture fixture) { _mockJvm = fixture.MockJvm; } diff --git a/src/csharp/Microsoft.Spark.E2ETest/Utils/TemporaryDirectory.cs b/src/csharp/Microsoft.Spark.UnitTest/TestUtils/TemporaryDirectory.cs similarity index 87% rename from src/csharp/Microsoft.Spark.E2ETest/Utils/TemporaryDirectory.cs rename to src/csharp/Microsoft.Spark.UnitTest/TestUtils/TemporaryDirectory.cs index 556b78f99..98d3c18f3 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/Utils/TemporaryDirectory.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/TestUtils/TemporaryDirectory.cs @@ -1,63 +1,63 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.IO; - -namespace Microsoft.Spark.E2ETest.Utils -{ - /// - /// Creates a temporary folder that is automatically cleaned up when disposed. - /// - internal sealed class TemporaryDirectory : IDisposable - { - private bool disposed = false; - - /// - /// Path to temporary folder. - /// - public string Path { get; } - - public TemporaryDirectory() - { - Path = System.IO.Path.Combine(System.IO.Path.GetTempPath(), Guid.NewGuid().ToString()); - Cleanup(); - Directory.CreateDirectory(Path); - Path = $"{Path}{System.IO.Path.DirectorySeparatorChar}"; - } - - public void Dispose() - { - Dispose(true); - GC.SuppressFinalize(this); - } - - private void Cleanup() - { - if (File.Exists(Path)) - { - File.Delete(Path); - } - else if (Directory.Exists(Path)) - { - Directory.Delete(Path, true); - } - } - - private void Dispose(bool disposing) - { - if (disposed) - { - return; - } - - if (disposing) - { - Cleanup(); - } - - disposed = true; - } - } -} +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; + +namespace Microsoft.Spark.UnitTest.TestUtils +{ + /// + /// Creates a temporary folder that is automatically cleaned up when disposed. + /// + internal sealed class TemporaryDirectory : IDisposable + { + private bool _disposed = false; + + /// + /// Path to temporary folder. + /// + public string Path { get; } + + public TemporaryDirectory() + { + Path = System.IO.Path.Combine(System.IO.Path.GetTempPath(), Guid.NewGuid().ToString()); + Cleanup(); + Directory.CreateDirectory(Path); + Path = $"{Path}{System.IO.Path.DirectorySeparatorChar}"; + } + + public void Dispose() + { + Dispose(true); + GC.SuppressFinalize(this); + } + + private void Cleanup() + { + if (File.Exists(Path)) + { + File.Delete(Path); + } + else if (Directory.Exists(Path)) + { + Directory.Delete(Path, true); + } + } + + private void Dispose(bool disposing) + { + if (_disposed) + { + return; + } + + if (disposing) + { + Cleanup(); + } + + _disposed = true; + } + } +} diff --git a/src/csharp/Microsoft.Spark.UnitTest/UdfSerDeTests.cs b/src/csharp/Microsoft.Spark.UnitTest/UdfSerDeTests.cs index 6928150d0..bf4ef29f4 100644 --- a/src/csharp/Microsoft.Spark.UnitTest/UdfSerDeTests.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/UdfSerDeTests.cs @@ -11,6 +11,7 @@ namespace Microsoft.Spark.UnitTest { + [Collection("Spark Unit Tests")] public class UdfSerDeTests { [Serializable] diff --git a/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs b/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs index 0490660e3..5fac38035 100644 --- a/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs +++ b/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs @@ -15,6 +15,7 @@ namespace Microsoft.Spark.Worker.UnitTest { + [Collection("Spark Unit Tests")] public class DaemonWorkerTests { [Fact] diff --git a/src/csharp/Microsoft.Spark.Worker.UnitTest/DependencyProviderTests.cs b/src/csharp/Microsoft.Spark.Worker.UnitTest/DependencyProviderTests.cs new file mode 100644 index 000000000..6643ba2ab --- /dev/null +++ b/src/csharp/Microsoft.Spark.Worker.UnitTest/DependencyProviderTests.cs @@ -0,0 +1,64 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.IO; +using System.IO.Compression; +using Microsoft.Spark.UnitTest.TestUtils; +using Microsoft.Spark.Utils; +using Microsoft.Spark.Worker.Utils; +using Xunit; + +namespace Microsoft.Spark.Worker.UnitTest +{ + [Collection("Spark Unit Tests")] + public class DependencyProviderTests + { + [Fact] + public void TestsUnpackPackages() + { + string packageFileName = "package.name.1.0.0.nupkg"; + string packageName = "package.name"; + string packageVersion = "1.0.0"; + + using var emptyFileDir = new TemporaryDirectory(); + string emptyFileName = "emptyfile"; + File.Create(Path.Combine(emptyFileDir.Path, emptyFileName)).Dispose(); + + using var nupkgDir = new TemporaryDirectory(); + ZipFile.CreateFromDirectory( + emptyFileDir.Path, + Path.Combine(nupkgDir.Path, packageFileName)); + + var metadata = new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] { "/assembly/probe/path" }, + NativeProbingPaths = new string[] { "/native/probe/path" }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] + { + new DependencyProviderUtils.NuGetMetadata + { + FileName = packageFileName, + PackageName = packageName, + PackageVersion = packageVersion + } + } + }; + + using var unpackDir = new TemporaryDirectory(); + string metadataFilePath = + Path.Combine(nupkgDir.Path, DependencyProviderUtils.CreateFileName(1)); + metadata.Serialize(metadataFilePath); + + // Files located in nupkgDir + // nuget: package.name.1.0.0.nupkg + // metadata file: dependencyProviderMetadata_00000000000000000001 + var dependencyProvider = + new DependencyProvider(metadataFilePath, nupkgDir.Path, unpackDir.Path); + string expectedPackagePath = + Path.Combine(unpackDir.Path, ".nuget", "packages", packageName, packageVersion); + string expectedFilePath = Path.Combine(expectedPackagePath, emptyFileName); + Assert.True(File.Exists(expectedFilePath)); + } + } +} diff --git a/src/csharp/Microsoft.Spark.Worker.UnitTest/Microsoft.Spark.Worker.UnitTest.csproj b/src/csharp/Microsoft.Spark.Worker.UnitTest/Microsoft.Spark.Worker.UnitTest.csproj index 1b68d2e45..1371d5d1b 100644 --- a/src/csharp/Microsoft.Spark.Worker.UnitTest/Microsoft.Spark.Worker.UnitTest.csproj +++ b/src/csharp/Microsoft.Spark.Worker.UnitTest/Microsoft.Spark.Worker.UnitTest.csproj @@ -4,13 +4,19 @@ netcoreapp3.1 + + + + + + diff --git a/src/csharp/Microsoft.Spark.Worker.UnitTest/PayloadProcessorTests.cs b/src/csharp/Microsoft.Spark.Worker.UnitTest/PayloadProcessorTests.cs index c586e9dc2..24370abcb 100644 --- a/src/csharp/Microsoft.Spark.Worker.UnitTest/PayloadProcessorTests.cs +++ b/src/csharp/Microsoft.Spark.Worker.UnitTest/PayloadProcessorTests.cs @@ -14,6 +14,7 @@ namespace Microsoft.Spark.Worker.UnitTest { + [Collection("Spark Unit Tests")] public class PayloadProcessorTests { [Theory] diff --git a/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj b/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj index cff20b084..1be21a7ac 100644 --- a/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj +++ b/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj @@ -6,6 +6,9 @@ netcoreapp3.1 Microsoft.Spark.Worker true + + + https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet3.1/nuget/v3/index.json @@ -13,6 +16,7 @@ + diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/PayloadProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/PayloadProcessor.cs index 2acc89933..58dd588aa 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/PayloadProcessor.cs +++ 
b/src/csharp/Microsoft.Spark.Worker/Processor/PayloadProcessor.cs @@ -7,12 +7,7 @@ using System.Collections.Generic; using System.IO; using Microsoft.Spark.Interop.Ipc; -using Microsoft.Spark.Utils; - -#if NETCOREAPP -using System.Reflection; -using System.Runtime.Loader; -#endif +using Microsoft.Spark.Worker.Utils; namespace Microsoft.Spark.Worker.Processor { @@ -28,20 +23,6 @@ internal PayloadProcessor(Version version) _version = version; } - static PayloadProcessor() - { -#if NETCOREAPP - AssemblyLoader.LoadFromFile = AssemblyLoadContext.Default.LoadFromAssemblyPath; - AssemblyLoader.LoadFromName = (asmName) => - AssemblyLoadContext.Default.LoadFromAssemblyName(new AssemblyName(asmName)); - AssemblyLoadContext.Default.Resolving += (assemblyLoadContext, assemblyName) => - AssemblyLoader.ResolveAssembly(assemblyName.FullName); -#else - AppDomain.CurrentDomain.AssemblyResolve += (object sender, ResolveEventArgs args) => - AssemblyLoader.ResolveAssembly(args.Name); -#endif - } - /// /// Processes the given stream to construct a Payload object. /// @@ -79,8 +60,15 @@ internal Payload Process(Stream stream) TaskContextHolder.Set(payload.TaskContext); payload.SparkFilesDir = SerDe.ReadString(stream); + SparkFiles.SetRootDirectory(payload.SparkFilesDir); + + // Register additional assembly handlers after SparkFilesDir has been set + // and before any deserialization occurs. BroadcastVariableProcessor may + // deserialize objects from assemblies that are not currently loaded within + // our current context. + AssemblyLoaderHelper.RegisterAssemblyHandler(); - if (Utils.SettingUtils.IsDatabricks) + if (SettingUtils.IsDatabricks) { SerDe.ReadString(stream); SerDe.ReadString(stream); diff --git a/src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs b/src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs new file mode 100644 index 000000000..b74228073 --- /dev/null +++ b/src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs @@ -0,0 +1,93 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Concurrent; +using System.IO; +using Microsoft.Spark.Services; +using Microsoft.Spark.Utils; + +#if NETCOREAPP +using System.Runtime.Loader; +#endif + +namespace Microsoft.Spark.Worker.Utils +{ + internal static class AssemblyLoaderHelper + { + private static readonly ILoggerService s_logger = + LoggerServiceFactory.GetLogger(typeof(AssemblyLoaderHelper)); + + // A mapping between a metadata file's path to its respective DependencyProvider. + private static readonly ConcurrentDictionary> + s_dependencyProviders = new ConcurrentDictionary>(); + + private static readonly bool s_runningREPL = + EnvironmentUtils.GetEnvironmentVariableAsBool("DOTNET_SPARK_RUNNING_REPL"); + + /// + /// Register the AssemblyLoader.ResolveAssembly handler to handle the + /// event when assemblies fail to load in the current assembly load context. 
+ /// + static AssemblyLoaderHelper() + { +#if NETCOREAPP + AssemblyLoader.LoadFromFile = AssemblyLoadContext.Default.LoadFromAssemblyPath; + AssemblyLoadContext.Default.Resolving += (assemblyLoadContext, assemblyName) => + AssemblyLoader.ResolveAssembly(assemblyName.FullName); +#else + AppDomain.CurrentDomain.AssemblyResolve += (object sender, ResolveEventArgs args) => + AssemblyLoader.ResolveAssembly(args.Name); +#endif + } + + /// + /// In a dotnet-interactive REPL session (driver), nuget dependencies will be + /// systematically added using . + /// + /// These files include: + /// - "{packagename}.{version}.nupkg" + /// The nuget packages + /// - + /// Serialized object. + /// + /// On the Worker, in order to resolve the nuget dependencies referenced by + /// the dotnet-interactive session, we instantiate a + /// . + /// This provider will register an event handler to the Assembly Load Resolving event. + /// By using , we can access the + /// required files added to the . + /// + internal static void RegisterAssemblyHandler() + { + if (!s_runningREPL) + { + return; + } + + string sparkFilesPath = SparkFiles.GetRootDirectory(); + string[] metadataFiles = + DependencyProviderUtils.GetMetadataFiles(sparkFilesPath); + foreach (string metdatafile in metadataFiles) + { + // The execution of the delegate passed to GetOrAdd is not guaranteed to run once. + // Multiple Lazy objects may be created, but only one of them will be added to the + // ConcurrentDictionary. The Lazy value is retrieved to materialize the + // DependencyProvider object if it hasn't already been created. + Lazy dependecyProvider = s_dependencyProviders.GetOrAdd( + metdatafile, + mdf => new Lazy( + () => + { + s_logger.LogInfo($"Creating {nameof(DependencyProvider)} using {mdf}"); + return new DependencyProvider( + mdf, + sparkFilesPath, + Directory.GetCurrentDirectory()); + })); + _ = dependecyProvider.Value; + } + } + } +} diff --git a/src/csharp/Microsoft.Spark.Worker/Utils/DependencyProvider.cs b/src/csharp/Microsoft.Spark.Worker/Utils/DependencyProvider.cs new file mode 100644 index 000000000..d15bda3a1 --- /dev/null +++ b/src/csharp/Microsoft.Spark.Worker/Utils/DependencyProvider.cs @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.IO.Compression; +using Microsoft.Spark.Utils; +using DepManager = Microsoft.DotNet.DependencyManager; + +namespace Microsoft.Spark.Worker.Utils +{ + /// + /// sets up and creates a new + /// . + /// + /// The following steps outline the process: + /// - Deserializes a . + /// - Uses to unpack required + /// nugets. + /// - Uses and + /// to construct + /// a . 
+ /// + internal class DependencyProvider : IDisposable + { + private readonly DepManager.DependencyProvider _dependencyProvider; + + internal DependencyProvider(string metadataFilePath, string srcPath, string dstPath) + { + DependencyProviderUtils.Metadata metadata = + DependencyProviderUtils.Metadata.Deserialize(metadataFilePath); + + string unpackPath = Path.Combine(dstPath, ".nuget", "packages"); + Directory.CreateDirectory(unpackPath); + + UnpackPackages(srcPath, unpackPath, metadata.NuGets); + + _dependencyProvider = CreateDependencyProvider(unpackPath, metadata); + } + + public void Dispose() + { + (_dependencyProvider as IDisposable)?.Dispose(); + } + + private DepManager.DependencyProvider CreateDependencyProvider( + string basePath, + DependencyProviderUtils.Metadata metadata) + { + IEnumerable AssemblyProbingPaths() + { + foreach (string dependency in metadata.AssemblyProbingPaths) + { + yield return Path.Combine(basePath, dependency); + } + } + + IEnumerable NativeProbingRoots() + { + foreach (string dependency in metadata.NativeProbingPaths) + { + yield return Path.Combine(basePath, dependency); + } + } + + return new DepManager.DependencyProvider( + AssemblyProbingPaths, + NativeProbingRoots); + } + + private void UnpackPackages( + string src, + string dst, + DependencyProviderUtils.NuGetMetadata[] nugetMetadata) + { + foreach (DependencyProviderUtils.NuGetMetadata metadata in nugetMetadata) + { + var packageDirectory = new DirectoryInfo( + Path.Combine(dst, metadata.PackageName.ToLower(), metadata.PackageVersion)); + if (!packageDirectory.Exists) + { + ZipFile.ExtractToDirectory( + Path.Combine(src, metadata.FileName), + packageDirectory.FullName); + } + } + } + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/IJvmBridgeFactory.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/IJvmBridgeFactory.cs new file mode 100644 index 000000000..428565527 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/IJvmBridgeFactory.cs @@ -0,0 +1,11 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.Spark.Interop.Ipc +{ + internal interface IJvmBridgeFactory + { + IJvmBridge Create(int portNumber); + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridgeFactory.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridgeFactory.cs new file mode 100644 index 000000000..9c9f4ca43 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridgeFactory.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.Spark.Interop.Ipc +{ + internal class JvmBridgeFactory : IJvmBridgeFactory + { + public IJvmBridge Create(int portNumber) + { + return new JvmBridge(portNumber); + } + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs b/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs index 2d19fd185..f2523d065 100644 --- a/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs +++ b/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. 
using System; -using System.Dynamic; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Services; @@ -46,17 +45,26 @@ internal static Version SparkVersion } } + private static IJvmBridgeFactory s_jvmBridgeFactory; + internal static IJvmBridgeFactory JvmBridgeFactory + { + get + { + return s_jvmBridgeFactory ??= new JvmBridgeFactory(); + } + set + { + s_jvmBridgeFactory = value; + } + } + private static IJvmBridge s_jvmBridge; internal static IJvmBridge JvmBridge { get { - if (s_jvmBridge == null) - { - s_jvmBridge = new JvmBridge(ConfigurationService.GetBackendPortNumber()); - } - - return s_jvmBridge; + return s_jvmBridge ??= + JvmBridgeFactory.Create(ConfigurationService.GetBackendPortNumber()); } set { @@ -69,8 +77,7 @@ internal static IConfigurationService ConfigurationService { get { - return s_configurationService ?? - (s_configurationService = new ConfigurationService()); + return s_configurationService ??= new ConfigurationService(); } set { diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 823f13c1a..924c8b362 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -151,7 +151,7 @@ public Bucketizer SetInputCols(IEnumerable value) /// Gets the name of the column the output data will be written to. This is set by /// SetInputCol /// - // string, the output column + /// string, the output column public string GetOutputCol() { return (string)_jvmObject.Invoke("getOutputCol"); diff --git a/src/csharp/Microsoft.Spark/SparkFiles.cs b/src/csharp/Microsoft.Spark/SparkFiles.cs index 8b09933a7..8c6f6af4b 100644 --- a/src/csharp/Microsoft.Spark/SparkFiles.cs +++ b/src/csharp/Microsoft.Spark/SparkFiles.cs @@ -2,33 +2,62 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; +using System.IO; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; namespace Microsoft.Spark { /// - /// Resolves paths to files added through `SparkContext.addFile()`. + /// Resolves paths to files added through . /// public static class SparkFiles { private static IJvmBridge Jvm { get; } = SparkEnvironment.JvmBridge; private static readonly string s_sparkFilesClassName = "org.apache.spark.SparkFiles"; + [ThreadStatic] + private static string s_rootDirectory; + + [ThreadStatic] + private static bool s_isRunningOnWorker; + /// - /// Get the absolute path of a file added through `SparkContext.addFile()`. + /// Get the absolute path of a file added through + /// . /// - /// The name of the file added through `SparkContext.addFile()` + /// The name of the file added through + /// . /// /// The absolute path of the file. public static string Get(string fileName) => - (string)Jvm.CallStaticJavaMethod(s_sparkFilesClassName, "get", fileName); + Path.GetFullPath(Path.Combine(GetRootDirectory(), fileName)); /// - /// Get the root directory that contains files added through `SparkContext.addFile()`. + /// Get the root directory that contains files added through + /// . /// /// The root directory that contains the files. public static string GetRootDirectory() => + s_isRunningOnWorker ? + s_rootDirectory : (string)Jvm.CallStaticJavaMethod(s_sparkFilesClassName, "getRootDirectory"); + + /// + /// Set the root directory that contains files added through + /// . + /// + /// This should only be called from the Microsoft.Spark.Worker. 
+ /// + /// + /// Root directory that contains files added + /// through . + /// + internal static void SetRootDirectory(string path) + { + s_isRunningOnWorker = true; + s_rootDirectory = path; + } } } diff --git a/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs b/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs index 621a81881..94a37dbb5 100644 --- a/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs +++ b/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs @@ -8,6 +8,7 @@ using System.Reflection; using System.Runtime.InteropServices; using System.Text.RegularExpressions; +using Microsoft.Spark.Services; namespace Microsoft.Spark.Utils { @@ -20,8 +21,10 @@ internal static class AssemblySearchPathResolver /// precedence: /// 1) Comma-separated paths specified in DOTNET_ASSEMBLY_SEARCH_PATHS environment /// variable. Note that if a path starts with ".", the working directory will be prepended. - /// 2) The working directory. - /// 3) The directory of the application. + /// 2) The path of the files added through + /// . + /// 3) The working directory. + /// 4) The directory of the application. /// /// /// The reason that the working directory has higher precedence than the directory @@ -54,6 +57,12 @@ internal static string[] GetAssemblySearchPaths() } } + string sparkFilesPath = SparkFiles.GetRootDirectory(); + if (!string.IsNullOrWhiteSpace(sparkFilesPath)) + { + searchPaths.Add(sparkFilesPath); + } + searchPaths.Add(Directory.GetCurrentDirectory()); searchPaths.Add(AppDomain.CurrentDomain.BaseDirectory); @@ -65,13 +74,15 @@ internal static class AssemblyLoader { internal static Func LoadFromFile { get; set; } = Assembly.LoadFrom; - internal static Func LoadFromName { get; set; } = Assembly.Load; + private static readonly ILoggerService s_logger = + LoggerServiceFactory.GetLogger(typeof(AssemblyLoader)); private static readonly Dictionary s_assemblyCache = new Dictionary(); - private static readonly string[] s_searchPaths = - AssemblySearchPathResolver.GetAssemblySearchPaths(); + // Lazily evaluate the assembly search paths because it has a dependency on SparkFiles. + private static readonly Lazy s_searchPaths = + new Lazy(() => AssemblySearchPathResolver.GetAssemblySearchPaths()); private static readonly string[] s_extensions = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? @@ -95,9 +106,7 @@ internal static class AssemblyLoader /// /// The full name of the assembly /// Name of the file that contains the assembly - /// Cached or Loaded Assembly - /// Thrown if the assembly is not - /// found. + /// Cached or Loaded Assembly or null if not found internal static Assembly LoadAssembly(string assemblyName, string assemblyFileName) { // assemblyFileName is empty when serializing a UDF from within the REPL. @@ -119,7 +128,14 @@ internal static Assembly LoadAssembly(string assemblyName, string assemblyFileNa return assembly; } - throw new FileNotFoundException($"Assembly '{assemblyName}' file not found '{assemblyFileName}' in '{string.Join(",", s_searchPaths)}'"); + s_logger.LogWarn( + string.Format( + "Assembly '{0}' file not found '{1}' in '{2}'", + assemblyName, + assemblyFileName, + string.Join(",", s_searchPaths.Value))); + + return null; } } @@ -129,9 +145,7 @@ internal static Assembly LoadAssembly(string assemblyName, string assemblyFileNa /// s_extension combination. /// /// The fullname of the assembly to load - /// The loaded assembly - /// Thrown if the assembly is not - /// found. 
+ /// The loaded assembly or null if not found internal static Assembly ResolveAssembly(string assemblyName) { lock (s_cacheLock) @@ -153,7 +167,15 @@ internal static Assembly ResolveAssembly(string assemblyName) } } - throw new FileNotFoundException($"Assembly '{assemblyName}' file not found '{simpleAsmName}[{string.Join(",", s_extensions)}]' in '{string.Join(",", s_searchPaths)}'"); + s_logger.LogWarn( + string.Format( + "Assembly '{0}' file not found '{1}[{2}]' in '{3}'", + assemblyName, + simpleAsmName, + string.Join(",", s_extensions), + string.Join(",", s_searchPaths.Value))); + + return null; } } @@ -165,7 +187,7 @@ internal static Assembly ResolveAssembly(string assemblyName) /// True if assembly is loaded, false otherwise. private static bool TryLoadAssembly(string assemblyFileName, ref Assembly assembly) { - foreach (string searchPath in s_searchPaths) + foreach (string searchPath in s_searchPaths.Value) { string assemblyPath = Path.Combine(searchPath, assemblyFileName); if (File.Exists(assemblyPath)) diff --git a/src/csharp/Microsoft.Spark/Utils/CollectionUtils.cs b/src/csharp/Microsoft.Spark/Utils/CollectionUtils.cs new file mode 100644 index 000000000..774e20835 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Utils/CollectionUtils.cs @@ -0,0 +1,18 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Linq; + +namespace Microsoft.Spark.Utils +{ + internal static class CollectionUtils + { + internal static bool ArrayEquals(T[] array1, T[] array2) + { + return (array1?.Length == array2?.Length) && + ((array1 == null) || array1.SequenceEqual(array2)); + } + } +} diff --git a/src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs b/src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs new file mode 100644 index 000000000..f379cfe24 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs @@ -0,0 +1,99 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using System.Runtime.Serialization.Formatters.Binary; + +namespace Microsoft.Spark.Utils +{ + internal class DependencyProviderUtils + { + private static readonly string s_filePattern = "dependencyProviderMetadata_*"; + + internal static string[] GetMetadataFiles(string path) => + Directory.GetFiles(path, s_filePattern); + + // Create the dependency provider metadata filename based on the number passed into the + // function. + // + // number => filename + // 0 => dependencyProviderMetadata_00000000000000000000 + // 1 => dependencyProviderMetadata_00000000000000000001 + // ... 
+ // 20 => dependencyProviderMetadata_00000000000000000020 + internal static string CreateFileName(ulong number) => + s_filePattern.Replace("*", $"{number:D20}"); + + [Serializable] + internal class NuGetMetadata + { + public string FileName { get; set; } + public string PackageName { get; set; } + public string PackageVersion { get; set; } + + public override int GetHashCode() + { + return base.GetHashCode(); + } + + public override bool Equals(object obj) + { + return (obj is NuGetMetadata nugetMetadata) && + Equals(nugetMetadata); + } + + private bool Equals(NuGetMetadata other) + { + return (other != null) && + (FileName == other.FileName) && + (PackageName == other.PackageName) && + (PackageVersion == other.PackageVersion); + } + } + + [Serializable] + internal class Metadata + { + public string[] AssemblyProbingPaths { get; set; } + public string[] NativeProbingPaths { get; set; } + public NuGetMetadata[] NuGets { get; set; } + + public override int GetHashCode() + { + return base.GetHashCode(); + } + + public override bool Equals(object obj) + { + return (obj is Metadata metadata) && + Equals(metadata); + } + + internal static Metadata Deserialize(string path) + { + using FileStream fileStream = File.OpenRead(path); + var formatter = new BinaryFormatter(); + return (Metadata)formatter.Deserialize(fileStream); + } + + internal void Serialize(string path) + { + using FileStream fileStream = File.OpenWrite(path); + var formatter = new BinaryFormatter(); + formatter.Serialize(fileStream, this); + } + + private bool Equals(Metadata other) + { + return (other != null) && + CollectionUtils.ArrayEquals( + AssemblyProbingPaths, + other.AssemblyProbingPaths) && + CollectionUtils.ArrayEquals(NativeProbingPaths, other.NativeProbingPaths) && + CollectionUtils.ArrayEquals(NuGets, other.NuGets); + } + } + } +} diff --git a/src/csharp/Microsoft.Spark/Utils/UdfSerDe.cs b/src/csharp/Microsoft.Spark/Utils/UdfSerDe.cs index 638838b9f..d338ddbdb 100644 --- a/src/csharp/Microsoft.Spark/Utils/UdfSerDe.cs +++ b/src/csharp/Microsoft.Spark/Utils/UdfSerDe.cs @@ -257,8 +257,21 @@ private static TypeData SerializeType(Type type) private static Type DeserializeType(TypeData typeData) => s_typeCache.GetOrAdd( typeData, - td => AssemblyLoader.LoadAssembly( - td.AssemblyName, - td.AssemblyFileName).GetType(td.Name)); + td => + { + Type type = AssemblyLoader.LoadAssembly( + td.AssemblyName, + td.AssemblyFileName).GetType(td.Name); + if (type == null) + { + throw new FileNotFoundException( + string.Format( + "Assembly '{0}' file not found '{1}'", + td.AssemblyName, + td.AssemblyFileName)); + } + + return type; + }); } } diff --git a/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs b/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs index b012794ba..a4c09ae3b 100644 --- a/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs +++ b/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs @@ -5,7 +5,6 @@ using System; using System.Collections.Generic; using System.Diagnostics; -using System.Linq; using Apache.Arrow; using Microsoft.Data.Analysis; using Microsoft.Spark.Interop; @@ -183,22 +182,24 @@ internal static JvmObjectReference CreatePythonFunction(IJvmBridge jvm, byte[] c private static IJvmObjectReferenceProvider CreateEnvVarsForPythonFunction(IJvmBridge jvm) { var environmentVars = new Hashtable(jvm); - string assemblySearchPath = string.Join(",", - new[] - { - Environment.GetEnvironmentVariable( - AssemblySearchPathResolver.AssemblySearchPathsEnvVarName), - SparkFiles.GetRootDirectory() - }.Where(s => !string.IsNullOrWhiteSpace(s))); - + 
string assemblySearchPath = Environment.GetEnvironmentVariable( + AssemblySearchPathResolver.AssemblySearchPathsEnvVarName); if (!string.IsNullOrEmpty(assemblySearchPath)) { environmentVars.Put( AssemblySearchPathResolver.AssemblySearchPathsEnvVarName, assemblySearchPath); } - // DOTNET_WORKER_SPARK_VERSION is used to handle different versions of Spark on the worker. - environmentVars.Put("DOTNET_WORKER_SPARK_VERSION", SparkEnvironment.SparkVersion.ToString()); + // DOTNET_WORKER_SPARK_VERSION is used to handle different versions + // of Spark on the worker. + environmentVars.Put( + "DOTNET_WORKER_SPARK_VERSION", + SparkEnvironment.SparkVersion.ToString()); + + if (EnvironmentUtils.GetEnvironmentVariableAsBool("DOTNET_SPARK_RUNNING_REPL")) + { + environmentVars.Put("DOTNET_SPARK_RUNNING_REPL", "true"); + } return environmentVars; } From 4be3dce24a4020d6dbbc738fed05d60be4e4407b Mon Sep 17 00:00:00 2001 From: Steve Suh Date: Fri, 12 Jun 2020 21:14:26 -0700 Subject: [PATCH 04/27] dotnet-interactive assembly extension (#517) --- ...ensions.DotNet.Interactive.UnitTest.csproj | 25 +++ .../PackageResolverTests.cs | 95 ++++++++++ .../AssemblyKernelExtension.cs | 156 +++++++++++++++++ ...Spark.Extensions.DotNet.Interactive.csproj | 40 +++++ .../PackageResolver.cs | 165 ++++++++++++++++++ .../PackageRestoreContextWrapper.cs | 14 ++ .../ResolvedNugetPackage.cs | 15 ++ .../DependencyProviderUtilsTests.cs | 8 +- .../Utils/AssemblyLoaderHelper.cs | 4 +- src/csharp/Microsoft.Spark.sln | 14 ++ src/csharp/Microsoft.Spark/Constants.cs | 11 ++ .../Microsoft.Spark/Microsoft.Spark.csproj | 2 + .../Microsoft.Spark/Utils/AssemblyLoader.cs | 2 +- .../Utils/DependencyProviderUtils.cs | 10 +- src/csharp/Microsoft.Spark/Utils/UdfUtils.cs | 4 +- 15 files changed, 551 insertions(+), 14 deletions(-) create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageRestoreContextWrapper.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/ResolvedNugetPackage.cs create mode 100644 src/csharp/Microsoft.Spark/Constants.cs diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj new file mode 100644 index 000000000..b0af57cf2 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj @@ -0,0 +1,25 @@ + + + + netcoreapp3.1 + Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest + false + + https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet3.1/nuget/v3/index.json + + + + + + + + + + + + + + + + + diff --git 
a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs new file mode 100644 index 000000000..8fac95d7a --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs @@ -0,0 +1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Microsoft.DotNet.Interactive.Utility; +using Microsoft.Spark.UnitTest.TestUtils; +using Microsoft.Spark.Utils; +using Moq; +using Xunit; + +namespace Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest +{ + public class PackageResolverTests + { + [Fact] + public void TestPackageResolver() + { + using var tempDir = new TemporaryDirectory(); + + string packageName = "package.name"; + string packageVersion = "0.1.0"; + string packageRootPath = + Path.Combine(tempDir.Path, "path", "to", "packages", packageName, packageVersion); + string packageFrameworkPath = Path.Combine(packageRootPath, "lib", "framework"); + + Directory.CreateDirectory(packageRootPath); + var nugetFile = new FileInfo( + Path.Combine(packageRootPath, $"{packageName}.{packageVersion}.nupkg")); + using (File.Create(nugetFile.FullName)) + { + } + + var assemblyPaths = new List + { + new FileInfo(Path.Combine(packageFrameworkPath, "1.dll")), + new FileInfo(Path.Combine(packageFrameworkPath, "2.dll")) + }; + var probingPaths = new List { new DirectoryInfo(packageRootPath) }; + + var mockPackageRestoreContextWrapper = new Mock(); + mockPackageRestoreContextWrapper + .SetupGet(m => m.ResolvedPackageReferences) + .Returns(new ResolvedPackageReference[] + { + new ResolvedPackageReference( + packageName, + packageVersion, + assemblyPaths, + new DirectoryInfo(packageRootPath), + probingPaths) + }); + + var packageResolver = new PackageResolver(mockPackageRestoreContextWrapper.Object); + IEnumerable actualFiles = packageResolver.GetFiles(tempDir.Path); + + string metadataFilePath = + Path.Combine(tempDir.Path, DependencyProviderUtils.CreateFileName(1)); + var expectedFiles = new string[] + { + nugetFile.FullName, + metadataFilePath + }; + Assert.True(expectedFiles.SequenceEqual(actualFiles)); + Assert.True(File.Exists(metadataFilePath)); + + DependencyProviderUtils.Metadata actualMetadata = + DependencyProviderUtils.Metadata.Deserialize(metadataFilePath); + var expectedMetadata = new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = new string[] + { + Path.Combine(packageName, packageVersion, "lib", "framework", "1.dll"), + Path.Combine(packageName, packageVersion, "lib", "framework", "2.dll") + }, + NativeProbingPaths = new string[] + { + Path.Combine(packageName, packageVersion) + }, + NuGets = new DependencyProviderUtils.NuGetMetadata[] + { + new DependencyProviderUtils.NuGetMetadata + { + FileName = $"{packageName}.{packageVersion}.nupkg", + PackageName = packageName, + PackageVersion = packageVersion + } + } + }; + Assert.True(expectedMetadata.Equals(actualMetadata)); + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs new file mode 100644 index 000000000..80977c46e --- /dev/null +++ 
b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs @@ -0,0 +1,156 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using Microsoft.CodeAnalysis; +using Microsoft.DotNet.Interactive; +using Microsoft.DotNet.Interactive.Commands; +using Microsoft.DotNet.Interactive.CSharp; +using Microsoft.DotNet.Interactive.Utility; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Utils; + +namespace Microsoft.Spark.Extensions.DotNet.Interactive +{ + /// + /// A kernel extension when using .NET for Apache Spark with Microsoft.DotNet.Interactive + /// Adds nuget and assembly dependencies to the default + /// using . + /// + public class AssemblyKernelExtension : IKernelExtension + { + private const string TempDirEnvVar = "DOTNET_SPARK_EXTENSION_INTERACTIVE_TMPDIR"; + + private readonly PackageResolver _packageResolver = + new PackageResolver(new PackageRestoreContextWrapper()); + + /// + /// Called by the Microsoft.DotNet.Interactive Assembly Extension Loader. + /// + /// The kernel calling this method. + /// when extension is loaded. + public Task OnLoadAsync(IKernel kernel) + { + if (kernel is CompositeKernel kernelBase) + { + Environment.SetEnvironmentVariable(Constants.RunningREPLEnvVar, "true"); + + DirectoryInfo tempDir = CreateTempDirectory(); + kernelBase.RegisterForDisposal(new DisposableDirectory(tempDir)); + + kernelBase.AddMiddleware(async (command, context, next) => + { + if ((context.HandlingKernel is CSharpKernel kernel) && + (command is SubmitCode) && + TryGetSparkSession(out SparkSession sparkSession) && + TryEmitAssembly(kernel, tempDir.FullName, out string assemblyPath)) + { + sparkSession.SparkContext.AddFile(assemblyPath); + + foreach (string filePath in GetPackageFiles(tempDir.FullName)) + { + sparkSession.SparkContext.AddFile(filePath); + } + } + + await next(command, context); + }); + } + + return Task.CompletedTask; + } + + private DirectoryInfo CreateTempDirectory() + { + string envTempDir = Environment.GetEnvironmentVariable(TempDirEnvVar); + string tempDirBasePath = string.IsNullOrEmpty(envTempDir) ? + Directory.GetCurrentDirectory() : + envTempDir; + + if (!IsPathValid(tempDirBasePath)) + { + throw new Exception($"[{GetType().Name}] Spaces in " + + $"'{tempDirBasePath}' is unsupported. Set the {TempDirEnvVar} " + + "environment variable to control the base path. 
Please see " + + "https://issues.apache.org/jira/browse/SPARK-30126 and " + + "https://github.com/apache/spark/pull/26773 for more details."); + } + + return Directory.CreateDirectory( + Path.Combine(tempDirBasePath, Path.GetRandomFileName())); + } + + private bool TryEmitAssembly(CSharpKernel kernel, string dstPath, out string assemblyPath) + { + Compilation compilation = kernel.ScriptState.Script.GetCompilation(); + string assemblyName = + AssemblyLoader.NormalizeAssemblyName(compilation.AssemblyName); + assemblyPath = Path.Combine(dstPath, $"{assemblyName}.dll"); + if (!File.Exists(assemblyPath)) + { + FileSystemExtensions.Emit(compilation, assemblyPath); + return true; + } + + throw new Exception( + $"TryEmitAssembly() unexpected duplicate assembly: ${assemblyPath}"); + } + + private bool TryGetSparkSession(out SparkSession sparkSession) + { + sparkSession = SparkSession.GetDefaultSession(); + return sparkSession != null; + } + + private IEnumerable GetPackageFiles(string path) + { + foreach (string filePath in _packageResolver.GetFiles(path)) + { + if (IsPathValid(filePath)) + { + yield return filePath; + } + else + { + // Copy file to a path without spaces. + string fileDestPath = Path.Combine( + path, + Path.GetFileName(filePath).Replace(" ", string.Empty)); + File.Copy(filePath, fileDestPath); + yield return fileDestPath; + } + } + } + + /// + /// In some versions of Spark, spaces is unsupported when using + /// . + /// + /// For more details please see: + /// - https://issues.apache.org/jira/browse/SPARK-30126 + /// - https://github.com/apache/spark/pull/26773 + /// + /// The path to validate. + /// true if the path is supported by Spark, false otherwise. + private bool IsPathValid(string path) + { + if (!path.Contains(" ")) + { + return true; + } + + Version version = SparkEnvironment.SparkVersion; + return (version.Major, version.Minor, version.Build) switch + { + (2, _, _) => false, + (3, 0, _) => true, + _ => throw new NotSupportedException($"Spark {version} not supported.") + }; + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj new file mode 100644 index 000000000..6966e3390 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj @@ -0,0 +1,40 @@ + + + + Library + netcoreapp3.1 + Microsoft.Spark.Extensions.DotNet.Interactive + true + true + + NU5100;$(NoWarn) + + DotNet Interactive Extension for .NET for Apache Spark + https://github.com/dotnet/spark/tree/master/docs/release-notes + spark;dotnet;csharp;interactive;dotnet-interactive + + https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet3.1/nuget/v3/index.json + + + + + + + + + + all + + + + + + + + + + + + diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs new file mode 100644 index 000000000..4e91156ba --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs @@ -0,0 +1,165 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
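The net effect of the middleware above is that code defined interactively works against Spark without a separate build step: each submission's compilation is emitted to the temp directory and shipped with `SparkContext.AddFile`, together with any NuGet files the session resolved, so executors can load the UDF assembly. A hedged sketch of what a notebook cell might then look like (the cell contents below are illustrative, not part of this patch):

```csharp
// In a .NET Interactive C# cell, after referencing Microsoft.Spark and this extension:
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

SparkSession spark = SparkSession.Builder().GetOrCreate();
DataFrame df = spark.Range(0, 3);

// The lambda below is compiled into this cell's assembly; the kernel extension
// calls SparkContext.AddFile on that assembly so workers can deserialize the UDF.
Func<Column, Column> formatId = Udf<long, string>(id => $"id = {id}");
df.Select(formatId(df["id"])).Show();
```

The `IsPathValid` guard above matters here because `SparkContext.AddFile` does not accept paths containing spaces on Spark 2.x (SPARK-30126), which is why the extension rejects a space-containing temp directory and copies package files to space-free names before adding them.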
+ +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Threading; +using Microsoft.DotNet.Interactive.Utility; +using Microsoft.Spark.Utils; + +namespace Microsoft.Spark.Extensions.DotNet.Interactive +{ + internal class PackageResolver + { + private readonly PackageRestoreContextWrapper _packageRestoreContextWrapper; + private readonly ConcurrentDictionary _filesCopied; + private long _metadataCounter; + + internal PackageResolver(PackageRestoreContextWrapper packageRestoreContextWrapper) + { + _packageRestoreContextWrapper = packageRestoreContextWrapper; + _filesCopied = new ConcurrentDictionary(); + _metadataCounter = 0; + } + + /// + /// Generates and serializes a to + /// . Returns a list of file paths which include the + /// the serialized and nuget file + /// dependencies. + /// + /// Path to write metadata. + /// + /// List of file paths of the serialized + /// and nuget file dependencies. + /// + internal IEnumerable GetFiles(string writePath) + { + IEnumerable nugetPackagesToCopy = GetNewPackages(); + + var assemblyProbingPaths = new List(); + var nativeProbingPaths = new List(); + var nugetMetadata = new List(); + + foreach (ResolvedNuGetPackage package in nugetPackagesToCopy) + { + ResolvedPackageReference resolvedPackage = package.ResolvedPackage; + + foreach (FileInfo asmPath in resolvedPackage.AssemblyPaths) + { + // asmPath.FullName + // /path/to/packages/package.name/package.version/lib/framework/1.dll + // resolvedPackage.PackageRoot + // /path/to/packages/package.name/package.version/ + // GetRelativeToPackages(..) + // package.name/package.version/lib/framework/1.dll + assemblyProbingPaths.Add( + GetPathRelativeToPackages( + asmPath.FullName, + resolvedPackage.PackageRoot)); + } + + foreach (DirectoryInfo probePath in resolvedPackage.ProbingPaths) + { + // probePath.FullName + // /path/to/packages/package.name/package.version/ + // resolvedPackage.PackageRoot + // /path/to/packages/package.name/package.version/ + // GetRelativeToPackages(..) + // package.name/package.version + nativeProbingPaths.Add( + GetPathRelativeToPackages( + probePath.FullName, + resolvedPackage.PackageRoot)); + } + + nugetMetadata.Add( + new DependencyProviderUtils.NuGetMetadata + { + FileName = package.NuGetFile.Name, + PackageName = resolvedPackage.PackageName, + PackageVersion = resolvedPackage.PackageVersion + }); + + yield return package.NuGetFile.FullName; + } + + if (nugetMetadata.Count > 0) + { + var metadataPath = + Path.Combine( + writePath, + DependencyProviderUtils.CreateFileName( + Interlocked.Increment(ref _metadataCounter))); + new DependencyProviderUtils.Metadata + { + AssemblyProbingPaths = assemblyProbingPaths.ToArray(), + NativeProbingPaths = nativeProbingPaths.ToArray(), + NuGets = nugetMetadata.ToArray() + }.Serialize(metadataPath); + + yield return metadataPath; + } + } + + /// + /// Return the delta of the list of packages that have been introduced + /// since the last call. + /// + /// The delta of the list of packages. 
+ private IEnumerable GetNewPackages() + { + IEnumerable packages = + _packageRestoreContextWrapper.ResolvedPackageReferences; + foreach (ResolvedPackageReference package in packages) + { + IEnumerable files = + package.PackageRoot.EnumerateFiles("*.nupkg", SearchOption.AllDirectories); + + foreach (FileInfo file in files) + { + if (_filesCopied.TryAdd(file.Name, 1)) + { + yield return new ResolvedNuGetPackage + { + ResolvedPackage = package, + NuGetFile = file + }; + } + } + } + } + + /// + /// Given a , get the relative path to the packages directory. + /// The package is a subfolder within the packages directory. + /// + /// Examples: + /// path: + /// /path/to/packages/package.name/package.version/lib/framework/1.dll + /// directory: + /// /path/to/packages/package.name/package.version/ + /// relative path: + /// package.name/package.version/lib/framework/1.dll + /// + /// path: + /// /path/to/packages/package.name/package.version/ + /// directory: + /// /path/to/packages/package.name/package.version/ + /// relative path: + /// package.name/package.version + /// + /// The full path used to determine the relative path. + /// The package directory. + /// The relative path to the packages directory. + private string GetPathRelativeToPackages(string path, DirectoryInfo directory) + { + string strippedRoot = path + .Substring(directory.FullName.Length) + .Trim(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar); + return Path.Combine(directory.Parent.Name, directory.Name, strippedRoot); + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageRestoreContextWrapper.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageRestoreContextWrapper.cs new file mode 100644 index 000000000..259088d7a --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageRestoreContextWrapper.cs @@ -0,0 +1,14 @@ +using System.Collections.Generic; +using Microsoft.DotNet.Interactive; +using Microsoft.DotNet.Interactive.Utility; + +namespace Microsoft.Spark.Extensions.DotNet.Interactive +{ + internal class PackageRestoreContextWrapper + { + internal virtual IEnumerable ResolvedPackageReferences => + ((ISupportNuget)KernelInvocationContext.Current.HandlingKernel) + .PackageRestoreContext + .ResolvedPackageReferences; + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/ResolvedNugetPackage.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/ResolvedNugetPackage.cs new file mode 100644 index 000000000..57106c16a --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/ResolvedNugetPackage.cs @@ -0,0 +1,15 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
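The relative paths computed by `GetPathRelativeToPackages` are the same strings that `DependencyProvider` on the worker later re-roots under its local unpack directory, so the two sides agree on layout without sharing absolute paths. A small sketch of that round trip, using placeholder paths and hypothetical variable names:

```csharp
using System;
using System.IO;

// Driver / notebook side (PackageResolver): probing path relative to the packages root.
string relativeAssemblyPath =
    Path.Combine("package.name", "0.1.0", "lib", "framework", "1.dll");

// Worker side (DependencyProvider): the shipped .nupkg files are unpacked under
// "<dstPath>/.nuget/packages" and the relative path is re-rooted there.
string dstPath = Path.Combine(Path.GetTempPath(), "dotnet-spark-worker"); // placeholder
string unpackPath = Path.Combine(dstPath, ".nuget", "packages");
string resolvedAssemblyPath = Path.Combine(unpackPath, relativeAssemblyPath);

Console.WriteLine(resolvedAssemblyPath);
```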
+ +using System.IO; +using Microsoft.DotNet.Interactive.Utility; + +namespace Microsoft.Spark.Extensions.DotNet.Interactive +{ + internal class ResolvedNuGetPackage + { + public ResolvedPackageReference ResolvedPackage { get; set; } + public FileInfo NuGetFile { get; set; } + } +} diff --git a/src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs b/src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs index ccfc4890b..ad01e3724 100644 --- a/src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/DependencyProviderUtilsTests.cs @@ -131,7 +131,7 @@ public void TestMetadataSerDe() public void TestFileNames() { using var tempDir = new TemporaryDirectory(); - foreach (ulong num in Enumerable.Range(0, 3).Select(x => System.Math.Pow(10, x))) + foreach (long num in Enumerable.Range(0, 3).Select(x => System.Math.Pow(10, x))) { string filePath = Path.Combine(tempDir.Path, DependencyProviderUtils.CreateFileName(num)); @@ -140,9 +140,9 @@ public void TestFileNames() var expectedFiles = new string[] { - "dependencyProviderMetadata_00000000000000000001", - "dependencyProviderMetadata_00000000000000000010", - "dependencyProviderMetadata_00000000000000000100", + "dependencyProviderMetadata_0000000000000000001", + "dependencyProviderMetadata_0000000000000000010", + "dependencyProviderMetadata_0000000000000000100", }; IOrderedEnumerable actualFiles = DependencyProviderUtils .GetMetadataFiles(tempDir.Path) diff --git a/src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs b/src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs index b74228073..1443165bc 100644 --- a/src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs +++ b/src/csharp/Microsoft.Spark.Worker/Utils/AssemblyLoaderHelper.cs @@ -24,7 +24,7 @@ private static readonly ConcurrentDictionary> s_dependencyProviders = new ConcurrentDictionary>(); private static readonly bool s_runningREPL = - EnvironmentUtils.GetEnvironmentVariableAsBool("DOTNET_SPARK_RUNNING_REPL"); + EnvironmentUtils.GetEnvironmentVariableAsBool(Constants.RunningREPLEnvVar); /// /// Register the AssemblyLoader.ResolveAssembly handler to handle the @@ -49,7 +49,7 @@ static AssemblyLoaderHelper() /// These files include: /// - "{packagename}.{version}.nupkg" /// The nuget packages - /// - + /// - /// Serialized object. /// /// On the Worker, in order to resolve the nuget dependencies referenced by diff --git a/src/csharp/Microsoft.Spark.sln b/src/csharp/Microsoft.Spark.sln index 49eac3fc7..73047bff3 100644 --- a/src/csharp/Microsoft.Spark.sln +++ b/src/csharp/Microsoft.Spark.sln @@ -35,6 +35,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions. 
EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions.Azure.Synapse.Analytics", "Extensions\Microsoft.Spark.Extensions.Azure.Synapse.Analytics\Microsoft.Spark.Extensions.Azure.Synapse.Analytics.csproj", "{47652C7D-B076-4FD9-98AC-959E38BE18E3}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions.DotNet.Interactive", "Extensions\Microsoft.Spark.Extensions.DotNet.Interactive\Microsoft.Spark.Extensions.DotNet.Interactive.csproj", "{9C32014D-8C0C-40F1-9ABA-C3BF19687E5C}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest", "Extensions\Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest\Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj", "{7BDE09ED-04B3-41B2-A466-3D6F7225291E}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -89,6 +93,14 @@ Global {47652C7D-B076-4FD9-98AC-959E38BE18E3}.Debug|Any CPU.Build.0 = Debug|Any CPU {47652C7D-B076-4FD9-98AC-959E38BE18E3}.Release|Any CPU.ActiveCfg = Release|Any CPU {47652C7D-B076-4FD9-98AC-959E38BE18E3}.Release|Any CPU.Build.0 = Release|Any CPU + {9C32014D-8C0C-40F1-9ABA-C3BF19687E5C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {9C32014D-8C0C-40F1-9ABA-C3BF19687E5C}.Debug|Any CPU.Build.0 = Debug|Any CPU + {9C32014D-8C0C-40F1-9ABA-C3BF19687E5C}.Release|Any CPU.ActiveCfg = Release|Any CPU + {9C32014D-8C0C-40F1-9ABA-C3BF19687E5C}.Release|Any CPU.Build.0 = Release|Any CPU + {7BDE09ED-04B3-41B2-A466-3D6F7225291E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {7BDE09ED-04B3-41B2-A466-3D6F7225291E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7BDE09ED-04B3-41B2-A466-3D6F7225291E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7BDE09ED-04B3-41B2-A466-3D6F7225291E}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -99,6 +111,8 @@ Global {2048446B-45AB-4304-B230-50EDF6E8E6A4} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} {47652C7D-B076-4FD9-98AC-959E38BE18E3} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} + {9C32014D-8C0C-40F1-9ABA-C3BF19687E5C} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} + {7BDE09ED-04B3-41B2-A466-3D6F7225291E} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FD15FFDB-EA1B-436F-841D-3386DDF94538} diff --git a/src/csharp/Microsoft.Spark/Constants.cs b/src/csharp/Microsoft.Spark/Constants.cs new file mode 100644 index 000000000..c346aadd3 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Constants.cs @@ -0,0 +1,11 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +namespace Microsoft.Spark +{ + internal class Constants + { + internal const string RunningREPLEnvVar = "DOTNET_SPARK_RUNNING_REPL"; + } +} diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index f3d3f1ffd..050a43493 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -17,6 +17,8 @@ + + diff --git a/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs b/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs index 94a37dbb5..3b9b34f5e 100644 --- a/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs +++ b/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs @@ -217,7 +217,7 @@ ex is FileLoadException || /// /// Assembly name /// Normalized assembly name - private static string NormalizeAssemblyName(string assemblyName) + internal static string NormalizeAssemblyName(string assemblyName) { // Check if the assembly name follows the Roslyn naming convention. // Roslyn assembly name: "\u211B*4b31b71b-d4bd-4642-9f63-eef5f5d99197#1-14" diff --git a/src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs b/src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs index f379cfe24..3954151d1 100644 --- a/src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs +++ b/src/csharp/Microsoft.Spark/Utils/DependencyProviderUtils.cs @@ -19,12 +19,12 @@ internal static string[] GetMetadataFiles(string path) => // function. // // number => filename - // 0 => dependencyProviderMetadata_00000000000000000000 - // 1 => dependencyProviderMetadata_00000000000000000001 + // 0 => dependencyProviderMetadata_0000000000000000000 + // 1 => dependencyProviderMetadata_0000000000000000001 // ... - // 20 => dependencyProviderMetadata_00000000000000000020 - internal static string CreateFileName(ulong number) => - s_filePattern.Replace("*", $"{number:D20}"); + // 20 => dependencyProviderMetadata_0000000000000000020 + internal static string CreateFileName(long number) => + s_filePattern.Replace("*", $"{number:D19}"); [Serializable] internal class NuGetMetadata diff --git a/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs b/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs index a4c09ae3b..ccb5e5209 100644 --- a/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs +++ b/src/csharp/Microsoft.Spark/Utils/UdfUtils.cs @@ -196,9 +196,9 @@ private static IJvmObjectReferenceProvider CreateEnvVarsForPythonFunction(IJvmBr "DOTNET_WORKER_SPARK_VERSION", SparkEnvironment.SparkVersion.ToString()); - if (EnvironmentUtils.GetEnvironmentVariableAsBool("DOTNET_SPARK_RUNNING_REPL")) + if (EnvironmentUtils.GetEnvironmentVariableAsBool(Constants.RunningREPLEnvVar)) { - environmentVars.Put("DOTNET_SPARK_RUNNING_REPL", "true"); + environmentVars.Put(Constants.RunningREPLEnvVar, "true"); } return environmentVars; From e43a0a465ada4eec837322d54d03a15c88d9e74f Mon Sep 17 00:00:00 2001 From: elvaliuliuliu <47404285+elvaliuliuliu@users.noreply.github.com> Date: Sat, 13 Jun 2020 15:12:59 -0700 Subject: [PATCH 05/27] Add multiple stages for Azure Pipeline (#524) --- azure-pipelines.yml | 876 +++++++++++++++++++++++--------------------- 1 file changed, 466 insertions(+), 410 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 20215a7b2..14b6e689a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,9 +11,9 @@ variables: _SignType: real _TeamName: DotNetSpark MSBUILDSINGLELOADCONTEXT: 1 - # backwardCompatibleRelease/forwardCompatibleRelease is the "oldest" releases that work with the current release - 
backwardCompatibleRelease: '0.9.0' + # forwardCompatibleRelease/backwardCompatibleRelease is the "oldest" releases that work with the current release forwardCompatibleRelease: '0.9.0' + backwardCompatibleRelease: '0.9.0' TestsToFilterOut: "(FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameGroupedMapUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameVectorUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroy)&\ @@ -22,7 +22,8 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" - LatestDotnetWorkerDir: '$(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp3.1\win-x64' + ArtifactPath: '$(Build.ArtifactStagingDirectory)\Microsoft.Spark.Binaries' + CurrentDotnetWorkerDir: '$(ArtifactPath)\Microsoft.Spark.Worker\netcoreapp3.1\win-x64' BackwardCompatibleDotnetWorkerDir: $(Build.BinariesDirectory)\Microsoft.Spark.Worker-$(backwardCompatibleRelease) # Azure DevOps variables are transformed into environment variables, with these variables we @@ -38,404 +39,131 @@ resources: name: dotnet/spark ref: refs/tags/v$(forwardCompatibleRelease) -jobs: -- job: Build - displayName: Build and Test Sources - pool: Hosted VS2017 - - variables: - ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: - _OfficialBuildIdArgs: /p:OfficialBuildId=$(BUILD.BUILDNUMBER) - HADOOP_HOME: $(Build.BinariesDirectory)\hadoop - - steps: - - checkout: self - path: s\master - - checkout: forwardCompatibleRelease - path: s\$(forwardCompatibleRelease) - - - task: Maven@3 - displayName: 'Maven build src' - inputs: - mavenPomFile: master/src/scala/pom.xml - - - task: Maven@3 - displayName: 'Maven build benchmark' - inputs: - mavenPomFile: master/benchmark/scala/pom.xml - - - task: BatchScript@1 - displayName: Download Spark Distros & Winutils.exe - inputs: - filename: master\script\download-spark-distros.cmd - arguments: $(Build.BinariesDirectory) - - - task: BatchScript@1 - displayName: Download backward compatible worker v$(backwardCompatibleRelease) - inputs: - filename: master\script\download-worker-release.cmd - arguments: '$(Build.BinariesDirectory) $(backwardCompatibleRelease)' - - - script: master\build.cmd -pack - -c $(buildConfiguration) - -ci - $(_OfficialBuildIdArgs) - /p:PublishSparkWorker=true - /p:SparkWorkerPublishDir=$(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker - displayName: '.NET build' - - - task: DotNetCoreCLI@2 - displayName: '.NET unit tests' - inputs: - command: test - projects: 'master/**/*UnitTest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.0' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.1' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: 
$(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.2' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.3' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.4' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.4-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.0' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.1' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.3' - inputs: - command: test - projects: 'master/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.3-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.4' - inputs: - command: test - projects: 'master/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.4-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.5' - inputs: - command: test - projects: 'master/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.0 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.1 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7 - DOTNET_WORKER_DIR: 
$(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.2 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.3 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.4 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.4-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.0 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.1 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.3 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.3-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.4 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.4-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.5 with backward compatible worker v$(backwardCompatibleRelease)' - inputs: - command: test - projects: 'master/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) - - - task: Maven@3 - displayName: 'Maven build src for forward compatible 
release v$(forwardCompatibleRelease)' - inputs: - mavenPomFile: $(forwardCompatibleRelease)/src/scala/pom.xml - - - script: $(forwardCompatibleRelease)\build.cmd - -c $(buildConfiguration) - -ci - $(_OfficialBuildIdArgs) - /p:PublishSparkWorker=false - displayName: '.NET build for forward compatible release v$(forwardCompatibleRelease)' - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.0 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.1 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.2 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.3 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.3.4 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.4-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.0 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.1 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.3 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: 
$(Build.BinariesDirectory)\spark-2.4.3-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.4 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.4-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) - - - task: DotNetCoreCLI@2 - displayName: 'E2E tests for Spark 2.4.5 from forward compatible release v$(forwardCompatibleRelease)' - inputs: - command: test - projects: '$(forwardCompatibleRelease)/**/Microsoft.Spark*.E2ETest/*.csproj' - arguments: '--configuration $(buildConfiguration)' - env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 - DOTNET_WORKER_DIR: $(LatestDotnetWorkerDir) +stages: +- stage: Build + displayName: Build Sources + jobs: + - job: Build + pool: Hosted VS2017 - - ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: - - task: CopyFiles@2 - displayName: Stage .NET artifacts + variables: + ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: + _OfficialBuildIdArgs: /p:OfficialBuildId=$(BUILD.BUILDNUMBER) + + steps: + - task: Maven@3 + displayName: 'Maven build src' + inputs: + mavenPomFile: src/scala/pom.xml + + - task: Maven@3 + displayName: 'Maven build benchmark' + inputs: + mavenPomFile: benchmark/scala/pom.xml + + - script: build.cmd -pack + -c $(buildConfiguration) + -ci + $(_OfficialBuildIdArgs) + /p:PublishSparkWorker=true + /p:SparkWorkerPublishDir=$(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker + displayName: '.NET build' + + - task: DotNetCoreCLI@2 + displayName: '.NET unit tests' inputs: - sourceFolder: $(Build.SourcesDirectory)/master/artifacts/packages/$(buildConfiguration)/Shipping - contents: | - **/*.nupkg - **/*.snupkg - targetFolder: $(Build.ArtifactStagingDirectory)/BuildArtifacts/artifacts/packages/$(buildConfiguration)/Shipping + command: test + projects: '**/*UnitTest/*.csproj' + arguments: '--configuration $(buildConfiguration)' - task: CopyFiles@2 - displayName: Stage build logs + displayName: Stage Maven build jars inputs: - sourceFolder: $(Build.SourcesDirectory)/master/artifacts/log - targetFolder: $(Build.ArtifactStagingDirectory)/BuildArtifacts/artifacts/log + sourceFolder: $(Build.SourcesDirectory)/src/scala + contents: '**/*.jar' + targetFolder: $(Build.ArtifactStagingDirectory)/Jars + + - ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: + - task: CopyFiles@2 + displayName: Stage .NET artifacts + inputs: + sourceFolder: $(Build.SourcesDirectory)/artifacts/packages/$(buildConfiguration)/Shipping + contents: | + **/*.nupkg + **/*.snupkg + targetFolder: $(Build.ArtifactStagingDirectory)/BuildArtifacts/artifacts/packages/$(buildConfiguration)/Shipping + + - task: CopyFiles@2 + displayName: Stage build logs + inputs: + sourceFolder: $(Build.SourcesDirectory)/artifacts/log + targetFolder: $(Build.ArtifactStagingDirectory)/BuildArtifacts/artifacts/log - task: PublishBuildArtifacts@1 inputs: pathtoPublish: '$(Build.ArtifactStagingDirectory)' artifactName: Microsoft.Spark.Binaries -- ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: - - job: SignPublish - dependsOn: - - Build - 
displayName: Sign and Publish Artifacts - pool: - name: NetCoreInternal-Pool - queue: buildpool.windows.10.amd64.vs2017 + - ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: + - job: SignPublish + dependsOn: + - Build + displayName: Sign and Publish Artifacts + pool: + name: NetCoreInternal-Pool + queue: buildpool.windows.10.amd64.vs2017 + + variables: + ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: + _OfficialBuildIdArgs: /p:OfficialBuildId=$(BUILD.BUILDNUMBER) + + steps: + - task: DownloadBuildArtifacts@0 + displayName: Download Build Artifacts + inputs: + artifactName: Microsoft.Spark.Binaries + downloadPath: $(Build.ArtifactStagingDirectory) + + - task: MicroBuildSigningPlugin@2 + displayName: Install MicroBuild plugin + inputs: + signType: $(_SignType) + zipSources: false + feedSource: https://dnceng.pkgs.visualstudio.com/_packaging/MicroBuildToolset/nuget/v3/index.json + env: + TeamName: $(_TeamName) + condition: and(succeeded(), in(variables['_SignType'], 'real', 'test'), eq(variables['Agent.Os'], 'Windows_NT')) + + - task: PowerShell@2 + displayName: Sign artifacts and Package Microsoft.Spark.Worker + inputs: + filePath: eng\common\build.ps1 + arguments: -restore -sign -publish + -c $(buildConfiguration) + -ci + $(_OfficialBuildIdArgs) + /p:DotNetSignType=$(_SignType) + /p:SparkPackagesDir=$(ArtifactPath)\BuildArtifacts\artifacts\packages + /p:SparkWorkerPublishDir=$(ArtifactPath)\Microsoft.Spark.Worker + /p:SparkWorkerPackageOutputDir=$(ArtifactPath) + + - task: PublishBuildArtifacts@1 + inputs: + pathtoPublish: '$(ArtifactPath)' + artifactName: Microsoft.Spark.Binaries + +- stage: Test + displayName: E2E Tests + dependsOn: Build + jobs: + - job: Run + pool: Hosted VS2017 variables: ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: _OfficialBuildIdArgs: /p:OfficialBuildId=$(BUILD.BUILDNUMBER) + HADOOP_HOME: $(Build.BinariesDirectory)\hadoop + DOTNET_WORKER_DIR: $(CurrentDotnetWorkerDir) steps: - task: DownloadBuildArtifacts@0 @@ -443,31 +171,359 @@ jobs: inputs: artifactName: Microsoft.Spark.Binaries downloadPath: $(Build.ArtifactStagingDirectory) - - - task: MicroBuildSigningPlugin@2 - displayName: Install MicroBuild plugin + + - task: CopyFiles@2 + displayName: Copy jars + inputs: + sourceFolder: $(ArtifactPath)/Jars + contents: '**/*.jar' + targetFolder: $(Build.SourcesDirectory)/src/scala + + - task: BatchScript@1 + displayName: Download Spark Distros & Winutils.exe + inputs: + filename: script\download-spark-distros.cmd + arguments: $(Build.BinariesDirectory) + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.0' inputs: - signType: $(_SignType) - zipSources: false - feedSource: https://dnceng.pkgs.visualstudio.com/_packaging/MicroBuildToolset/nuget/v3/index.json + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' env: - TeamName: $(_TeamName) - condition: and(succeeded(), in(variables['_SignType'], 'real', 'test'), eq(variables['Agent.Os'], 'Windows_NT')) - - - task: PowerShell@2 - displayName: Sign artifacts and Package Microsoft.Spark.Worker - inputs: - filePath: eng\common\build.ps1 - arguments: -restore -sign -publish - -c $(buildConfiguration) - -ci - $(_OfficialBuildIdArgs) - /p:DotNetSignType=$(_SignType) - 
/p:SparkPackagesDir=$(Build.ArtifactStagingDirectory)\Microsoft.Spark.Binaries\BuildArtifacts\artifacts\packages - /p:SparkWorkerPublishDir=$(Build.ArtifactStagingDirectory)\Microsoft.Spark.Binaries\Microsoft.Spark.Worker - /p:SparkWorkerPackageOutputDir=$(Build.ArtifactStagingDirectory)\Microsoft.Spark.Binaries + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7 - - task: PublishBuildArtifacts@1 + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.1' inputs: - pathtoPublish: '$(Build.ArtifactStagingDirectory)/Microsoft.Spark.Binaries' - artifactName: Microsoft.Spark.Binaries + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.2' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.3' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.4' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.4-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.0' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.1' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.3' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.3-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.4' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.4-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.5' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 + +- stage: ForwardCompatibility + displayName: E2E Forward Compatibility Tests + dependsOn: Build + jobs: + - job: Run + pool: Hosted VS2017 + + variables: + ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: + _OfficialBuildIdArgs: /p:OfficialBuildId=$(BUILD.BUILDNUMBER) + HADOOP_HOME: $(Build.BinariesDirectory)\hadoop + DOTNET_WORKER_DIR: $(CurrentDotnetWorkerDir) + + steps: + - checkout: forwardCompatibleRelease + path: s\$(forwardCompatibleRelease) + + - task: Maven@3 + displayName: 
'Maven build src for forward compatible release v$(forwardCompatibleRelease)' + inputs: + mavenPomFile: src/scala/pom.xml + + - task: DownloadBuildArtifacts@0 + displayName: Download Build Artifacts + inputs: + artifactName: Microsoft.Spark.Binaries + downloadPath: $(Build.ArtifactStagingDirectory) + + - task: BatchScript@1 + displayName: Download Spark Distros & Winutils.exe + inputs: + filename: script\download-spark-distros.cmd + arguments: $(Build.BinariesDirectory) + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.0' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.1' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.2' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.3' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.4' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.4-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.0' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.1' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.3' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.3-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.4' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.4-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.5' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 + +- stage: BackwardCompatibility + displayName: E2E Backward Compatibility Tests + dependsOn: Build + jobs: + - job: Run + pool: Hosted VS2017 + + variables: + ${{ if and(ne(variables['System.TeamProject'], 'public'), notin(variables['Build.Reason'], 'PullRequest')) }}: + _OfficialBuildIdArgs: /p:OfficialBuildId=$(BUILD.BUILDNUMBER) 
+ HADOOP_HOME: $(Build.BinariesDirectory)\hadoop + DOTNET_WORKER_DIR: $(BackwardCompatibleDotnetWorkerDir) + + steps: + - task: DownloadBuildArtifacts@0 + displayName: Download Build Artifacts + inputs: + artifactName: Microsoft.Spark.Binaries + downloadPath: $(Build.ArtifactStagingDirectory) + + - task: CopyFiles@2 + displayName: Copy jars + inputs: + sourceFolder: $(ArtifactPath)/Jars + contents: '**/*.jar' + targetFolder: $(Build.SourcesDirectory)/src/scala + + - task: BatchScript@1 + displayName: Download Spark Distros & Winutils.exe + inputs: + filename: script\download-spark-distros.cmd + arguments: $(Build.BinariesDirectory) + + - task: BatchScript@1 + displayName: Download backward compatible worker v$(backwardCompatibleRelease) + inputs: + filename: script\download-worker-release.cmd + arguments: '$(Build.BinariesDirectory) $(backwardCompatibleRelease)' + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.0' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.1' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.2' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.3' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.3.4' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.4-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.0' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.1' + inputs: + command: test + projects: '**/Microsoft.Spark.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.3' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.3-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.4' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: 
$(Build.BinariesDirectory)\spark-2.4.4-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.5' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 \ No newline at end of file From c8992e0787c6f50f2aa3572839fe67b872a3363a Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Sun, 14 Jun 2020 14:30:50 -0700 Subject: [PATCH 06/27] Support Spark 2.4.6 (#547) --- azure-pipelines.yml | 21 ++++++++++++++++++- script/download-spark-distros.cmd | 4 +++- .../spark/deploy/dotnet/DotnetRunner.scala | 3 ++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 14b6e689a..8ba73e0c1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -275,6 +275,15 @@ stages: env: SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.6' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.6-bin-hadoop2.7 + - stage: ForwardCompatibility displayName: E2E Forward Compatibility Tests dependsOn: Build @@ -526,4 +535,14 @@ stages: projects: '**/Microsoft.Spark*.E2ETest/*.csproj' arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' env: - SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 \ No newline at end of file + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.5-bin-hadoop2.7 + + - task: DotNetCoreCLI@2 + displayName: 'E2E tests for Spark 2.4.6' + inputs: + command: test + projects: '**/Microsoft.Spark*.E2ETest/*.csproj' + arguments: '--configuration $(buildConfiguration) --filter $(TestsToFilterOut)' + env: + SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.6-bin-hadoop2.7 + diff --git a/script/download-spark-distros.cmd b/script/download-spark-distros.cmd index d02bb49a7..0d2435a00 100644 --- a/script/download-spark-distros.cmd +++ b/script/download-spark-distros.cmd @@ -23,5 +23,7 @@ curl -k -L -o spark-2.4.1.tgz https://archive.apache.org/dist/spark/spark-2.4.1/ curl -k -L -o spark-2.4.3.tgz https://archive.apache.org/dist/spark/spark-2.4.3/spark-2.4.3-bin-hadoop2.7.tgz && tar xzvf spark-2.4.3.tgz curl -k -L -o spark-2.4.4.tgz https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz && tar xzvf spark-2.4.4.tgz curl -k -L -o spark-2.4.5.tgz https://archive.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz && tar xzvf spark-2.4.5.tgz +curl -k -L -o spark-2.4.6.tgz https://archive.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz && tar xzvf spark-2.4.6.tgz + +endlocal -endlocal \ No newline at end of file diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/deploy/dotnet/DotnetRunner.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/deploy/dotnet/DotnetRunner.scala index 65a56e3e8..5925dcca9 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/deploy/dotnet/DotnetRunner.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/deploy/dotnet/DotnetRunner.scala @@ -34,7 +34,8 @@ import scala.util.Try */ object DotnetRunner extends Logging { private val DEBUG_PORT = 5567 - private val supportedSparkVersions = Set[String]("2.4.0", "2.4.1", "2.4.3", "2.4.4", "2.4.5") + 
private val supportedSparkVersions = + Set[String]("2.4.0", "2.4.1", "2.4.3", "2.4.4", "2.4.5", "2.4.6") val SPARK_VERSION = DotnetUtils.normalizeSparkVersion(spark.SPARK_VERSION) From 9240bfaf721443a2c5556034ce126f7adbb6272d Mon Sep 17 00:00:00 2001 From: JavierAndres Date: Mon, 15 Jun 2020 19:47:31 +0200 Subject: [PATCH 07/27] Fix AppName in examples to follow Spark naming convention (#548) --- .../MachineLearning/Sentiment/Program.cs | 2 +- examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs | 2 +- .../Microsoft.Spark.CSharp.Examples/Sql/Batch/Datasource.cs | 2 +- .../Sql/Batch/VectorDataFrameUdfs.cs | 2 +- .../Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorUdfs.cs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/Microsoft.Spark.CSharp.Examples/MachineLearning/Sentiment/Program.cs b/examples/Microsoft.Spark.CSharp.Examples/MachineLearning/Sentiment/Program.cs index efb85e468..51f63078d 100644 --- a/examples/Microsoft.Spark.CSharp.Examples/MachineLearning/Sentiment/Program.cs +++ b/examples/Microsoft.Spark.CSharp.Examples/MachineLearning/Sentiment/Program.cs @@ -27,7 +27,7 @@ public void Run(string[] args) SparkSession spark = SparkSession .Builder() - .AppName(".NET for Apache Spark Sentiment Analysis") + .AppName("Sentiment Analysis using .NET for Apache Spark") .GetOrCreate(); // Read in and display Yelp reviews diff --git a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs index 6ef95eefa..fe57f7d1b 100644 --- a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs +++ b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs @@ -26,7 +26,7 @@ public void Run(string[] args) SparkSession spark = SparkSession .Builder() - .AppName(".NET Spark SQL basic example") + .AppName("SQL basic example using .NET for Apache Spark") .Config("spark.some.config.option", "some-value") .GetOrCreate(); diff --git a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Datasource.cs b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Datasource.cs index cf41eeceb..0945df791 100644 --- a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Datasource.cs +++ b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Datasource.cs @@ -32,7 +32,7 @@ public void Run(string[] args) SparkSession spark = SparkSession .Builder() - .AppName(".NET Spark SQL Datasource example") + .AppName("SQL Datasource example using .NET for Apache Spark") .Config("spark.some.config.option", "some-value") .GetOrCreate(); diff --git a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorDataFrameUdfs.cs b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorDataFrameUdfs.cs index 697301733..aafea7256 100644 --- a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorDataFrameUdfs.cs +++ b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorDataFrameUdfs.cs @@ -31,7 +31,7 @@ public void Run(string[] args) .Builder() // Lower the shuffle partitions to speed up groupBy() operations. 
.Config("spark.sql.shuffle.partitions", "3") - .AppName(".NET Spark SQL VectorUdfs example") + .AppName("SQL VectorUdfs example using .NET for Apache Spark") .GetOrCreate(); DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]); diff --git a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorUdfs.cs b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorUdfs.cs index 369cc3aff..2497d5ef3 100644 --- a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorUdfs.cs +++ b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/VectorUdfs.cs @@ -29,7 +29,7 @@ public void Run(string[] args) .Builder() // Lower the shuffle partitions to speed up groupBy() operations. .Config("spark.sql.shuffle.partitions", "3") - .AppName(".NET Spark SQL VectorUdfs example") + .AppName("SQL VectorUdfs example using .NET for Apache Spark") .GetOrCreate(); DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]); From ff7c7640d5aad9d53eb127f71e28d04f8cc8a88d Mon Sep 17 00:00:00 2001 From: Steve Suh Date: Fri, 19 Jun 2020 17:12:02 -0700 Subject: [PATCH 08/27] Microsoft.Spark.Extensions.DotNet.Interactive support latest changes to ISupportNuget (#554) --- NuGet.config | 2 ++ ...ft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj | 4 +--- .../PackageResolverTests.cs | 6 +++--- .../AssemblyKernelExtension.cs | 2 +- .../Microsoft.Spark.Extensions.DotNet.Interactive.csproj | 6 ++---- .../PackageResolver.cs | 8 ++++---- ...ageRestoreContextWrapper.cs => SupportNugetWrapper.cs} | 3 +-- .../Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj | 3 --- 8 files changed, 14 insertions(+), 20 deletions(-) rename src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/{PackageRestoreContextWrapper.cs => SupportNugetWrapper.cs} (82%) diff --git a/NuGet.config b/NuGet.config index 7b7b765e2..9d2866825 100644 --- a/NuGet.config +++ b/NuGet.config @@ -6,5 +6,7 @@ + + diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj index b0af57cf2..391582751 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj @@ -4,13 +4,11 @@ netcoreapp3.1 Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest false - - https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet3.1/nuget/v3/index.json - + diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs index 8fac95d7a..219c533ff 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest/PackageResolverTests.cs @@ -40,8 +40,8 @@ public void TestPackageResolver() }; var probingPaths = new List { new DirectoryInfo(packageRootPath) }; - var mockPackageRestoreContextWrapper = new Mock(); - mockPackageRestoreContextWrapper + var mockSupportNugetWrapper = new Mock(); + mockSupportNugetWrapper .SetupGet(m => m.ResolvedPackageReferences) .Returns(new ResolvedPackageReference[] { @@ -53,7 
+53,7 @@ public void TestPackageResolver() probingPaths) }); - var packageResolver = new PackageResolver(mockPackageRestoreContextWrapper.Object); + var packageResolver = new PackageResolver(mockSupportNugetWrapper.Object); IEnumerable actualFiles = packageResolver.GetFiles(tempDir.Path); string metadataFilePath = diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs index 80977c46e..2deff5869 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs @@ -27,7 +27,7 @@ public class AssemblyKernelExtension : IKernelExtension private const string TempDirEnvVar = "DOTNET_SPARK_EXTENSION_INTERACTIVE_TMPDIR"; private readonly PackageResolver _packageResolver = - new PackageResolver(new PackageRestoreContextWrapper()); + new PackageResolver(new SupportNugetWrapper()); /// /// Called by the Microsoft.DotNet.Interactive Assembly Extension Loader. diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj index 6966e3390..da330c762 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/Microsoft.Spark.Extensions.DotNet.Interactive.csproj @@ -6,14 +6,12 @@ Microsoft.Spark.Extensions.DotNet.Interactive true true - + NU5100;$(NoWarn) DotNet Interactive Extension for .NET for Apache Spark https://github.com/dotnet/spark/tree/master/docs/release-notes spark;dotnet;csharp;interactive;dotnet-interactive - - https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet3.1/nuget/v3/index.json @@ -22,7 +20,7 @@ - + all diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs index 4e91156ba..f9a76e43f 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageResolver.cs @@ -13,13 +13,13 @@ namespace Microsoft.Spark.Extensions.DotNet.Interactive { internal class PackageResolver { - private readonly PackageRestoreContextWrapper _packageRestoreContextWrapper; + private readonly SupportNugetWrapper _supportNugetWrapper; private readonly ConcurrentDictionary _filesCopied; private long _metadataCounter; - internal PackageResolver(PackageRestoreContextWrapper packageRestoreContextWrapper) + internal PackageResolver(SupportNugetWrapper supportNugetWrapper) { - _packageRestoreContextWrapper = packageRestoreContextWrapper; + _supportNugetWrapper = supportNugetWrapper; _filesCopied = new ConcurrentDictionary(); _metadataCounter = 0; } @@ -112,7 +112,7 @@ internal IEnumerable GetFiles(string writePath) private IEnumerable GetNewPackages() { IEnumerable packages = - _packageRestoreContextWrapper.ResolvedPackageReferences; + _supportNugetWrapper.ResolvedPackageReferences; foreach (ResolvedPackageReference package in packages) { IEnumerable files = diff --git 
a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageRestoreContextWrapper.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/SupportNugetWrapper.cs similarity index 82% rename from src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageRestoreContextWrapper.cs rename to src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/SupportNugetWrapper.cs index 259088d7a..489e39e94 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/PackageRestoreContextWrapper.cs +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/SupportNugetWrapper.cs @@ -4,11 +4,10 @@ namespace Microsoft.Spark.Extensions.DotNet.Interactive { - internal class PackageRestoreContextWrapper + internal class SupportNugetWrapper { internal virtual IEnumerable ResolvedPackageReferences => ((ISupportNuget)KernelInvocationContext.Current.HandlingKernel) - .PackageRestoreContext .ResolvedPackageReferences; } } diff --git a/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj b/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj index 1be21a7ac..f18f41963 100644 --- a/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj +++ b/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj @@ -6,9 +6,6 @@ netcoreapp3.1 Microsoft.Spark.Worker true - - - https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet3.1/nuget/v3/index.json From 3701d2e9c3076d75c6dc7a9a71a229175cf39501 Mon Sep 17 00:00:00 2001 From: Laneser Date: Mon, 22 Jun 2020 01:17:22 +0800 Subject: [PATCH 09/27] Fix broken maven link in building/ubuntu-instructions.md (#558) --- docs/building/ubuntu-instructions.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/building/ubuntu-instructions.md b/docs/building/ubuntu-instructions.md index 8bb11b163..0e3dbdf40 100644 --- a/docs/building/ubuntu-instructions.md +++ b/docs/building/ubuntu-instructions.md @@ -35,14 +35,14 @@ If you already have all the pre-requisites, skip to the [build](ubuntu-instructi ```bash sudo update-alternatives --config java ``` - 3. Install **[Apache Maven 3.6.0+](https://maven.apache.org/download.cgi)** + 3. 
Install **[Apache Maven 3.6.3+](https://maven.apache.org/download.cgi)** - Run the following command: ```bash mkdir -p ~/bin/maven cd ~/bin/maven - wget https://www-us.apache.org/dist/maven/maven-3/3.6.0/binaries/apache-maven-3.6.0-bin.tar.gz - tar -xvzf apache-maven-3.6.0-bin.tar.gz - ln -s apache-maven-3.6.0 current + wget https://www-us.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz + tar -xvzf apache-maven-3.6.3-bin.tar.gz + ln -s apache-maven-3.6.3 current export M2_HOME=~/bin/maven/current export PATH=${M2_HOME}/bin:${PATH} source ~/.bashrc @@ -54,11 +54,11 @@ If you already have all the pre-requisites, skip to the [build](ubuntu-instructi 📙 Click to see sample mvn -version output ``` - Apache Maven 3.6.0 (97c98ec64a1fdfee7767ce5ffb20918da4f719f3; 2018-10-24T18:41:47Z) - Maven home: ~/bin/apache-maven-3.6.0 - Java version: 1.8.0_191, vendor: Oracle Corporation, runtime: /usr/lib/jvm/java-8-openjdk-amd64/jre - Default locale: en, platform encoding: UTF-8 - OS name: "linux", version: "4.4.0-17763-microsoft", arch: "amd64", family: "unix" + Apache Maven 3.6.3 (cecedd343002696d0abb50b32b541b8a6ba2883f) + Maven home: ~/bin/apache-maven-3.6.3 + Java version: 1.8.0_242, vendor: Oracle Corporation, runtime: /usr/lib/jvm/java-8-openjdk-amd64/jre + Default locale: en_US, platform encoding: ANSI_X3.4-1968 + OS name: "linux", version: "4.4.0-142-generic", arch: "amd64", family: "unix" ``` 4. Install **[Apache Spark 2.3+](https://spark.apache.org/downloads.html)** - Download [Apache Spark 2.3+](https://spark.apache.org/downloads.html) and extract it into a local folder (e.g., `~/bin/spark-2.3.2-bin-hadoop2.7`) From 6f835a5e0291ffed5ddcc449378fe76bfd722714 Mon Sep 17 00:00:00 2001 From: Laneser Date: Mon, 22 Jun 2020 11:17:03 +0800 Subject: [PATCH 10/27] Fix maven broken link in windows build doc (#560) --- docs/building/windows-instructions.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/building/windows-instructions.md b/docs/building/windows-instructions.md index 84874a129..aad141b68 100644 --- a/docs/building/windows-instructions.md +++ b/docs/building/windows-instructions.md @@ -30,10 +30,10 @@ If you already have all the pre-requisites, skip to the [build](windows-instruct 3. Install **[Java 1.8](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html)** - Select the appropriate version for your operating system e.g., jdk-8u201-windows-x64.exe for Win x64 machine. - Install using the installer and verify you are able to run `java` from your command-line - 4. Install **[Apache Maven 3.6.0+](https://maven.apache.org/download.cgi)** - - Download [Apache Maven 3.6.0](http://mirror.metrocast.net/apache/maven/maven-3/3.6.0/binaries/apache-maven-3.6.0-bin.zip) - - Extract to a local directory e.g., `c:\bin\apache-maven-3.6.0\` - - Add Apache Maven to your [PATH environment variable](https://www.java.com/en/download/help/path.xml) e.g., `c:\bin\apache-maven-3.6.0\bin` + 4. Install **[Apache Maven 3.6.3+](https://maven.apache.org/download.cgi)** + - Download [Apache Maven 3.6.3](http://mirror.metrocast.net/apache/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.zip) + - Extract to a local directory e.g., `c:\bin\apache-maven-3.6.3\` + - Add Apache Maven to your [PATH environment variable](https://www.java.com/en/download/help/path.xml) e.g., `c:\bin\apache-maven-3.6.3\bin` - Verify you are able to run `mvn` from your command-line 5. 
Install **[Apache Spark 2.3+](https://spark.apache.org/downloads.html)** - Download [Apache Spark 2.3+](https://spark.apache.org/downloads.html) and extract it into a local folder (e.g., `c:\bin\spark-2.3.2-bin-hadoop2.7\`) using [7-zip](https://www.7-zip.org/). From 7bb3dd1817095bb4685311be48bbf4ebab21739e Mon Sep 17 00:00:00 2001 From: Andrew Fogarty Date: Sun, 21 Jun 2020 23:56:49 -0700 Subject: [PATCH 11/27] Hyperspace Extension (#555) --- .../Constants.cs | 14 ++ .../HyperspaceFixture.cs | 32 ++++ .../HyperspaceTests.cs | 141 ++++++++++++++++++ .../Index/IndexConfigTests.cs | 86 +++++++++++ ...Spark.Extensions.Hyperspace.E2ETest.csproj | 13 ++ .../Hyperspace.cs | 113 ++++++++++++++ .../HyperspaceSparkSessionExtensions.cs | 55 +++++++ .../Index/Builder.cs | 74 +++++++++ .../Index/IndexConfig.cs | 92 ++++++++++++ ...crosoft.Spark.Extensions.Hyperspace.csproj | 13 ++ .../Microsoft.Spark.E2ETest.csproj | 1 + src/csharp/Microsoft.Spark.sln | 14 ++ .../Interop/Internal/Scala/Seq.cs | 41 +++++ .../Microsoft.Spark/Microsoft.Spark.csproj | 2 + 14 files changed, 691 insertions(+) create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Constants.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceFixture.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceTests.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Index/IndexConfigTests.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Microsoft.Spark.Extensions.Hyperspace.E2ETest.csproj create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Hyperspace.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/HyperspaceSparkSessionExtensions.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/Builder.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/IndexConfig.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Microsoft.Spark.Extensions.Hyperspace.csproj create mode 100644 src/csharp/Microsoft.Spark/Interop/Internal/Scala/Seq.cs diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Constants.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Constants.cs new file mode 100644 index 000000000..969dd85f1 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Constants.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.Spark.Extensions.Hyperspace.E2ETest +{ + /// + /// Constants related to the Hyperspace test suite. + /// + internal class Constants + { + public const string HyperspaceTestContainerName = "Hyperspace Tests"; + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceFixture.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceFixture.cs new file mode 100644 index 000000000..8578c77f0 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceFixture.cs @@ -0,0 +1,32 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.Spark.E2ETest; +using Xunit; + +namespace Microsoft.Spark.Extensions.Hyperspace.E2ETest +{ + public class HyperspaceFixture + { + public HyperspaceFixture() + { + Environment.SetEnvironmentVariable( + SparkFixture.EnvironmentVariableNames.ExtraSparkSubmitArgs, + "--packages com.microsoft.hyperspace:hyperspace-core_2.11:0.1.0"); + + SparkFixture = new SparkFixture(); + } + + public SparkFixture SparkFixture { get; private set; } + } + + [CollectionDefinition(Constants.HyperspaceTestContainerName)] + public class HyperspaceTestCollection : ICollectionFixture + { + // This class has no code, and is never created. Its purpose is simply + // to be the place to apply [CollectionDefinition] and all the + // ICollectionFixture<> interfaces. + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceTests.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceTests.cs new file mode 100644 index 000000000..12e8bca60 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/HyperspaceTests.cs @@ -0,0 +1,141 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.Spark.E2ETest.Utils; +using Microsoft.Spark.Extensions.Hyperspace.Index; +using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; +using Xunit; + +namespace Microsoft.Spark.Extensions.Hyperspace.E2ETest +{ + /// + /// Test suite for Hyperspace index management APIs. + /// + [Collection(Constants.HyperspaceTestContainerName)] + public class HyperspaceTests : IDisposable + { + private readonly SparkSession _spark; + private readonly TemporaryDirectory _hyperspaceSystemDirectory; + private readonly Hyperspace _hyperspace; + + // Fields needed for sample DataFrame. + private readonly DataFrame _sampleDataFrame; + private readonly string _sampleIndexName; + private readonly IndexConfig _sampleIndexConfig; + + public HyperspaceTests(HyperspaceFixture fixture) + { + _spark = fixture.SparkFixture.Spark; + _hyperspaceSystemDirectory = new TemporaryDirectory(); + _spark.Conf().Set("spark.hyperspace.system.path", _hyperspaceSystemDirectory.Path); + _hyperspace = new Hyperspace(_spark); + + _sampleDataFrame = _spark.Read() + .Option("header", true) + .Option("delimiter", ";") + .Csv("Resources\\people.csv"); + _sampleIndexName = "sample_dataframe"; + _sampleIndexConfig = new IndexConfig(_sampleIndexName, new[] { "job" }, new[] { "name" }); + _hyperspace.CreateIndex(_sampleDataFrame, _sampleIndexConfig); + } + + /// + /// Clean up the Hyperspace system directory in between tests. + /// + public void Dispose() + { + _hyperspaceSystemDirectory.Dispose(); + } + + /// + /// Test the method signatures for all Hyperspace APIs. + /// + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void TestSignatures() + { + // Indexes API. + Assert.IsType(_hyperspace.Indexes()); + + // Delete and Restore APIs. + _hyperspace.DeleteIndex(_sampleIndexName); + _hyperspace.RestoreIndex(_sampleIndexName); + + // Refresh API. + _hyperspace.RefreshIndex(_sampleIndexName); + + // Cancel API. + Assert.Throws(() => _hyperspace.Cancel(_sampleIndexName)); + + // Explain API. 
+ _hyperspace.Explain(_sampleDataFrame, true); + _hyperspace.Explain(_sampleDataFrame, true, s => Console.WriteLine(s)); + + // Delete and Vacuum APIs. + _hyperspace.DeleteIndex(_sampleIndexName); + _hyperspace.VacuumIndex(_sampleIndexName); + + // Enable and disable Hyperspace. + Assert.IsType(_spark.EnableHyperspace()); + Assert.IsType(_spark.DisableHyperspace()); + Assert.IsType(_spark.IsHyperspaceEnabled()); + } + + /// + /// Test E2E functionality of index CRUD APIs. + /// + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void TestIndexCreateAndDelete() + { + // Should be one active index. + DataFrame indexes = _hyperspace.Indexes(); + Assert.Equal(1, indexes.Count()); + Assert.Equal(_sampleIndexName, indexes.SelectExpr("name").First()[0]); + Assert.Equal(States.Active, indexes.SelectExpr("state").First()[0]); + + // Delete the index then verify it has been deleted. + _hyperspace.DeleteIndex(_sampleIndexName); + indexes = _hyperspace.Indexes(); + Assert.Equal(1, indexes.Count()); + Assert.Equal(States.Deleted, indexes.SelectExpr("state").First()[0]); + + // Restore the index to active state and verify it is back. + _hyperspace.RestoreIndex(_sampleIndexName); + indexes = _hyperspace.Indexes(); + Assert.Equal(1, indexes.Count()); + Assert.Equal(States.Active, indexes.SelectExpr("state").First()[0]); + + // Delete and vacuum the index, then verify it is gone. + _hyperspace.DeleteIndex(_sampleIndexName); + _hyperspace.VacuumIndex(_sampleIndexName); + Assert.Equal(0, _hyperspace.Indexes().Count()); + } + + /// + /// Test that the explain API generates the expected string. + /// + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void TestExplainAPI() + { + // Run a query that hits the index. + DataFrame queryDataFrame = _sampleDataFrame + .Where("job == 'Developer'") + .Select("name"); + + string explainString = string.Empty; + _hyperspace.Explain(queryDataFrame, true, s => explainString = s); + Assert.False(string.IsNullOrEmpty(explainString)); + } + + /// + /// Index states used in testing. + /// + private static class States + { + public const string Active = "ACTIVE"; + public const string Deleted = "DELETED"; + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Index/IndexConfigTests.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Index/IndexConfigTests.cs new file mode 100644 index 000000000..b96f85432 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Index/IndexConfigTests.cs @@ -0,0 +1,86 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.Spark.E2ETest.Utils; +using Microsoft.Spark.Extensions.Hyperspace.Index; +using Xunit; + +namespace Microsoft.Spark.Extensions.Hyperspace.E2ETest.Index +{ + /// + /// Test suite for Hyperspace IndexConfig tests. + /// + [Collection(Constants.HyperspaceTestContainerName)] + public class IndexConfigTests + { + public IndexConfigTests(HyperspaceFixture fixture) + { + } + + /// + /// Test the method signatures for IndexConfig and IndexConfigBuilder APIs. 
+ /// + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void TestSignatures() + { + string indexName = "testIndexName"; + var indexConfig = new IndexConfig(indexName, new[] { "Id" }, new string[] { }); + Assert.IsType(indexConfig.IndexName); + Assert.IsType>(indexConfig.IndexedColumns); + Assert.IsType>(indexConfig.IncludedColumns); + Assert.IsType(IndexConfig.Builder()); + Assert.IsType(indexConfig.Equals(indexConfig)); + Assert.IsType(indexConfig.GetHashCode()); + Assert.IsType(indexConfig.ToString()); + + Builder builder = IndexConfig.Builder(); + Assert.IsType(builder); + Assert.IsType(builder.IndexName("indexName")); + Assert.IsType(builder.IndexBy("indexed1", "indexed2")); + Assert.IsType(builder.Include("included1")); + Assert.IsType(builder.Create()); + } + + /// + /// Test creating an IndexConfig using its class constructor. + /// + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void TestIndexConfigConstructor() + { + string indexName = "indexName"; + string[] indexedColumns = { "idx1" }; + string[] includedColumns = { "inc1", "inc2", "inc3" }; + var config = new IndexConfig(indexName, indexedColumns, includedColumns); + + // Validate that the config was built correctly. + Assert.Equal(indexName, config.IndexName); + Assert.Equal(indexedColumns, config.IndexedColumns); + Assert.Equal(includedColumns, config.IncludedColumns); + } + + /// + /// Test creating an IndexConfig using the builder pattern. + /// + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void TestIndexConfigBuilder() + { + string indexName = "indexName"; + string[] indexedColumns = { "idx1" }; + string[] includedColumns = { "inc1", "inc2", "inc3" }; + + Builder builder = IndexConfig.Builder(); + builder.IndexName(indexName); + builder.Include(includedColumns[0], includedColumns[1], includedColumns[2]); + builder.IndexBy(indexedColumns[0]); + + // Validate that the config was built correctly. + IndexConfig config = builder.Create(); + Assert.Equal(indexName, config.IndexName); + Assert.Equal(indexedColumns, config.IndexedColumns); + Assert.Equal(includedColumns, config.IncludedColumns); + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Microsoft.Spark.Extensions.Hyperspace.E2ETest.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Microsoft.Spark.Extensions.Hyperspace.E2ETest.csproj new file mode 100644 index 000000000..231022e4b --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace.E2ETest/Microsoft.Spark.Extensions.Hyperspace.E2ETest.csproj @@ -0,0 +1,13 @@ + + + + netcoreapp3.1 + false + + + + + + + + diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Hyperspace.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Hyperspace.cs new file mode 100644 index 000000000..13509779d --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Hyperspace.cs @@ -0,0 +1,113 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.Spark.Extensions.Hyperspace.Index; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.Extensions.Hyperspace +{ + /// + /// .Net for Spark binding for Hyperspace index management APIs. 
+ /// + public class Hyperspace : IJvmObjectReferenceProvider + { + private static readonly string s_hyperspaceClassName = + "com.microsoft.hyperspace.Hyperspace"; + private readonly SparkSession _spark; + private readonly IJvmBridge _jvmBridge; + private readonly JvmObjectReference _jvmObject; + + public Hyperspace(SparkSession spark) + { + _spark = spark; + _jvmBridge = ((IJvmObjectReferenceProvider)spark).Reference.Jvm; + _jvmObject = _jvmBridge.CallConstructor(s_hyperspaceClassName, spark); + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Collect all the index metadata. + /// + /// All index metadata as a . + public DataFrame Indexes() => + new DataFrame((JvmObjectReference)_jvmObject.Invoke("indexes")); + + /// + /// Create index. + /// + /// The DataFrame object to build index on. + /// The configuration of index to be created. + public void CreateIndex(DataFrame df, IndexConfig indexConfig) => + _jvmObject.Invoke("createIndex", df, indexConfig); + + /// + /// Soft deletes the index with given index name. + /// + /// The name of index to delete. + public void DeleteIndex(string indexName) => _jvmObject.Invoke("deleteIndex", indexName); + + /// + /// Restores index with given index name. + /// + /// Name of the index to restore. + public void RestoreIndex(string indexName) => _jvmObject.Invoke("restoreIndex", indexName); + + /// + /// Does hard delete of indexes marked as DELETED. + /// + /// Name of the index to restore. + public void VacuumIndex(string indexName) => _jvmObject.Invoke("vacuumIndex", indexName); + + /// + /// Update indexes for the latest version of the data. + /// + /// Name of the index to refresh. + public void RefreshIndex(string indexName) => _jvmObject.Invoke("refreshIndex", indexName); + + /// + /// Cancel api to bring back index from an inconsistent state to the last known stable + /// state. + /// + /// E.g. if index fails during creation, in CREATING state. + /// The index will not allow any index modifying operations unless a cancel is called. + /// + /// Note: Cancel from VACUUMING state will move it forward to DOESNOTEXIST + /// state. + /// + /// Note: If no previous stable state exists, cancel will move it to DOESNOTEXIST + /// state. + /// + /// Name of the index to cancel. + public void Cancel(string indexName) => _jvmObject.Invoke("cancel", indexName); + + /// + /// Explains how indexes will be applied to the given dataframe. + /// + /// dataFrame + /// Flag to enable verbose mode. + public void Explain(DataFrame df, bool verbose) => + Explain(df, verbose, s => Console.WriteLine(s)); + + /// + /// Explains how indexes will be applied to the given dataframe. + /// + /// dataFrame + /// Flag to enable verbose mode. + /// Function to redirect output of explain. + public void Explain(DataFrame df, bool verbose, Action redirectFunc) + { + var explainString = (string)_jvmBridge.CallStaticJavaMethod( + "com.microsoft.hyperspace.index.plananalysis.PlanAnalyzer", + "explainString", + df, + _spark, + Indexes(), + verbose); + redirectFunc(explainString); + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/HyperspaceSparkSessionExtensions.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/HyperspaceSparkSessionExtensions.cs new file mode 100644 index 000000000..3c43f369c --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/HyperspaceSparkSessionExtensions.cs @@ -0,0 +1,55 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.Extensions.Hyperspace +{ + /// + /// Hyperspace-specific extension methods on . + /// + public static class HyperspaceSparkSessionExtensions + { + private static readonly string s_pythonUtilsClassName = + "com.microsoft.hyperspace.util.PythonUtils"; + + /// + /// Plug in Hyperspace-specific rules. + /// + /// A spark session that does not contain Hyperspace-specific rules. + /// + /// A spark session that contains Hyperspace-specific rules. + public static SparkSession EnableHyperspace(this SparkSession session) => + new SparkSession( + (JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_pythonUtilsClassName, + "enableHyperspace", + session)); + + /// + /// Plug out Hyperspace-specific rules. + /// + /// A spark session that contains Hyperspace-specific rules. + /// A spark session that does not contain Hyperspace-specific rules. + public static SparkSession DisableHyperspace(this SparkSession session) => + new SparkSession( + (JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_pythonUtilsClassName, + "disableHyperspace", + session)); + + /// + /// Checks if Hyperspace is enabled or not. + /// + /// + /// True if Hyperspace is enabled or false otherwise. + public static bool IsHyperspaceEnabled(this SparkSession session) => + (bool)SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_pythonUtilsClassName, + "isHyperspaceEnabled", + session); + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/Builder.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/Builder.cs new file mode 100644 index 000000000..4623de3e7 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/Builder.cs @@ -0,0 +1,74 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark.Extensions.Hyperspace.Index +{ + /// + /// Builder for . + /// + public sealed class Builder : IJvmObjectReferenceProvider + { + private readonly JvmObjectReference _jvmObject; + + internal Builder(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Updates index name for . + /// + /// Index name for the . + /// An object with updated indexname. + public Builder IndexName(string indexName) + { + _jvmObject.Invoke("indexName", indexName); + return this; + } + + /// + /// Updates column names for . + /// + /// Note: API signature supports passing one or more argument. + /// + /// Indexed column for the + /// . + /// Indexed columns for the + /// . + /// An object with updated indexed columns. + public Builder IndexBy(string indexedColumn, params string[] indexedColumns) + { + _jvmObject.Invoke("indexBy", indexedColumn, indexedColumns); + return this; + } + + /// + /// Updates included columns for . + /// + /// Note: API signature supports passing one or more argument. + /// + /// Included column for . + /// + /// Included columns for . + /// + /// An object with updated included columns. 
+ public Builder Include(string includedColumn, params string[] includedColumns) + { + _jvmObject.Invoke("include", includedColumn, includedColumns); + return this; + } + + /// + /// Creates IndexConfig from supplied index name, indexed columns and included columns + /// to . + /// + /// An object. + public IndexConfig Create() => + new IndexConfig((JvmObjectReference)_jvmObject.Invoke("create")); + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/IndexConfig.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/IndexConfig.cs new file mode 100644 index 000000000..030dda2ca --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Index/IndexConfig.cs @@ -0,0 +1,92 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Internal.Scala; +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark.Extensions.Hyperspace.Index +{ + /// + /// specifies the configuration of an index. + /// + public sealed class IndexConfig : IJvmObjectReferenceProvider + { + private static readonly string s_className = "com.microsoft.hyperspace.index.IndexConfig"; + private readonly JvmObjectReference _jvmObject; + + /// + /// specifies the configuration of an index. + /// + /// Index name. + /// Columns from which an index is created. + public IndexConfig(string indexName, IEnumerable indexedColumns) + : this(indexName, indexedColumns, new string[] { }) + { + } + + /// + /// specifies the configuration of an index. + /// + /// Index name. + /// Columns from which an index is created. + /// Columns to be included in the index. + public IndexConfig( + string indexName, + IEnumerable indexedColumns, + IEnumerable includedColumns) + { + IndexName = indexName; + IndexedColumns = new List(indexedColumns); + IncludedColumns = new List(includedColumns); + + _jvmObject = (JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_className, + "apply", + IndexName, + IndexedColumns, + IncludedColumns); + } + + /// + /// specifies the configuration of an index. + /// + /// JVM object reference. + internal IndexConfig(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + IndexName = (string)_jvmObject.Invoke("indexName"); + IndexedColumns = new List( + new Seq((JvmObjectReference)_jvmObject.Invoke("indexedColumns"))); + IncludedColumns = new List( + new Seq((JvmObjectReference)_jvmObject.Invoke("includedColumns"))); + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + public string IndexName { get; private set; } + + public List IndexedColumns { get; private set; } + + public List IncludedColumns { get; private set; } + + /// + /// Creates new for constructing an + /// . + /// + /// An object. 
+ public static Builder Builder() => + new Builder( + (JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_className, + "builder")); + + public override bool Equals(object that) => (bool)_jvmObject.Invoke("equals", that); + + public override int GetHashCode() => (int)_jvmObject.Invoke("hashCode"); + + public override string ToString() => (string)_jvmObject.Invoke("toString"); + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Microsoft.Spark.Extensions.Hyperspace.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Microsoft.Spark.Extensions.Hyperspace.csproj new file mode 100644 index 000000000..d85c62f71 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Hyperspace/Microsoft.Spark.Extensions.Hyperspace.csproj @@ -0,0 +1,13 @@ + + + + netstandard2.0;netstandard2.1 + true + true + + + + + + + diff --git a/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj b/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj index e03519853..7a6240ecc 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj +++ b/src/csharp/Microsoft.Spark.E2ETest/Microsoft.Spark.E2ETest.csproj @@ -12,6 +12,7 @@ + diff --git a/src/csharp/Microsoft.Spark.sln b/src/csharp/Microsoft.Spark.sln index 73047bff3..75c071377 100644 --- a/src/csharp/Microsoft.Spark.sln +++ b/src/csharp/Microsoft.Spark.sln @@ -39,6 +39,10 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions. EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest", "Extensions\Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest\Microsoft.Spark.Extensions.DotNet.Interactive.UnitTest.csproj", "{7BDE09ED-04B3-41B2-A466-3D6F7225291E}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Spark.Extensions.Hyperspace", "Extensions\Microsoft.Spark.Extensions.Hyperspace\Microsoft.Spark.Extensions.Hyperspace.csproj", "{70DDA4E9-1195-4A29-9AA1-96A8223A6D4F}" +EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions.Hyperspace.E2ETest", "Extensions\Microsoft.Spark.Extensions.Hyperspace.E2ETest\Microsoft.Spark.Extensions.Hyperspace.E2ETest.csproj", "{C6019E44-C777-4DE2-B70E-EA025B7D044D}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -101,6 +105,14 @@ Global {7BDE09ED-04B3-41B2-A466-3D6F7225291E}.Debug|Any CPU.Build.0 = Debug|Any CPU {7BDE09ED-04B3-41B2-A466-3D6F7225291E}.Release|Any CPU.ActiveCfg = Release|Any CPU {7BDE09ED-04B3-41B2-A466-3D6F7225291E}.Release|Any CPU.Build.0 = Release|Any CPU + {70DDA4E9-1195-4A29-9AA1-96A8223A6D4F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {70DDA4E9-1195-4A29-9AA1-96A8223A6D4F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {70DDA4E9-1195-4A29-9AA1-96A8223A6D4F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {70DDA4E9-1195-4A29-9AA1-96A8223A6D4F}.Release|Any CPU.Build.0 = Release|Any CPU + {C6019E44-C777-4DE2-B70E-EA025B7D044D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {C6019E44-C777-4DE2-B70E-EA025B7D044D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {C6019E44-C777-4DE2-B70E-EA025B7D044D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {C6019E44-C777-4DE2-B70E-EA025B7D044D}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -113,6 +125,8 @@ Global {47652C7D-B076-4FD9-98AC-959E38BE18E3} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} 
{9C32014D-8C0C-40F1-9ABA-C3BF19687E5C} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} {7BDE09ED-04B3-41B2-A466-3D6F7225291E} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} + {70DDA4E9-1195-4A29-9AA1-96A8223A6D4F} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} + {C6019E44-C777-4DE2-B70E-EA025B7D044D} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FD15FFDB-EA1B-436F-841D-3386DDF94538} diff --git a/src/csharp/Microsoft.Spark/Interop/Internal/Scala/Seq.cs b/src/csharp/Microsoft.Spark/Interop/Internal/Scala/Seq.cs new file mode 100644 index 000000000..9d9ed3bc1 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Internal/Scala/Seq.cs @@ -0,0 +1,41 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections; +using System.Collections.Generic; +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark.Interop.Internal.Scala +{ + /// + /// Limited read-only implementation of Scala Seq[T] so that Seq objects can be read + /// into POCO collection types such as List. + /// + /// + internal sealed class Seq : IJvmObjectReferenceProvider, IEnumerable + { + private readonly JvmObjectReference _jvmObject; + + internal Seq(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); + + public int Size => (int)_jvmObject.Invoke("size"); + + public IEnumerator GetEnumerator() + { + for (int i = 0; i < Size; ++i) + { + yield return Apply(i); + } + } + + public T Apply(int index) => (T)_jvmObject.Invoke("apply", index); + } +} diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 050a43493..2cddc5627 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -19,6 +19,8 @@ + + From 1cd9ccab7ccf62f4483004a8245e42c77cf9ef61 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Mon, 22 Jun 2020 15:55:49 -0700 Subject: [PATCH 12/27] UDF bug fix caused by ThreadStatic BroadcastVariablesRegistry (#551) --- .../UdfTests/UdfSimpleTypesTests.cs | 25 +++++++++++++++++++ src/csharp/Microsoft.Spark/Broadcast.cs | 13 +++++----- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs b/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs index e4c4cabb9..92422c205 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Threading; using Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Types; using Xunit; @@ -166,5 +167,29 @@ public void TestUdfWithReturnAsTimestampType() } } } + + /// + /// Test to validate UDFs defined in separate threads work. + /// + [Fact] + public void TestUdfWithMultipleThreads() + { + try + { + void DefineUdf() => Udf(str => str); + + // Define a UDF in the main thread. + Udf(str => str); + + // Verify a UDF can be defined in a separate thread. 
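+ // (With the previous [ThreadStatic] registry in JvmBroadcastRegistry, the field initializer
+ // only ran on the first thread, so defining a UDF on a new thread could fail; the
+ // ThreadLocal<T> change below addresses this.)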
+ Thread t = new Thread(DefineUdf); + t.Start(); + t.Join(); + } + catch (Exception) + { + Assert.True(false); + } + } } } diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 20ae5c869..2791ec546 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -4,6 +4,7 @@ using System.IO; using System.Runtime.Serialization; using System.Runtime.Serialization.Formatters.Binary; +using System.Threading; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Services; @@ -261,28 +262,26 @@ internal static void Remove(long bid) /// internal static class JvmBroadcastRegistry { - [ThreadStatic] - private static readonly List s_jvmBroadcastVariables = - new List(); + private static ThreadLocal> s_jvmBroadcastVariables = + new ThreadLocal>(() => new List()); /// /// Adds a JVMObjectReference object of type to the list. /// /// JVMObjectReference of the Broadcast variable internal static void Add(JvmObjectReference broadcastJvmObject) => - s_jvmBroadcastVariables.Add(broadcastJvmObject); + s_jvmBroadcastVariables.Value.Add(broadcastJvmObject); /// /// Clears s_jvmBroadcastVariables of all the JVMObjectReference objects of type /// . /// - internal static void Clear() => s_jvmBroadcastVariables.Clear(); + internal static void Clear() => s_jvmBroadcastVariables.Value.Clear(); /// /// Returns the static member s_jvmBroadcastVariables. /// /// A list of all broadcast objects of type - internal static List GetAll() => s_jvmBroadcastVariables; + internal static List GetAll() => s_jvmBroadcastVariables.Value; } } - From 29ad2cb4f4d9c83c8e97e41c614ed115706425a2 Mon Sep 17 00:00:00 2001 From: Steve Suh Date: Tue, 23 Jun 2020 10:03:28 -0700 Subject: [PATCH 13/27] Expose DataStreamWriter.ForeachBatch API (#549) --- .../Sql/Streaming/DataStreamWriterTests.cs | 73 +++++ .../Microsoft.Spark.UnitTest/CallbackTests.cs | 239 +++++++++++++++ .../Microsoft.Spark.UnitTest/SparkFixture.cs | 1 - .../TestUtils/XunitConsoleOutHelper.cs | 34 +++ .../CommandExecutorTests.cs | 1 - .../DaemonWorkerTests.cs | 4 - .../TaskRunnerTests.cs | 3 - .../Interop/Ipc/CallbackConnection.cs | 280 ++++++++++++++++++ .../Interop/Ipc/CallbackServer.cs | 256 ++++++++++++++++ .../Ipc/ForeachBatchCallbackHandler.cs | 36 +++ .../Interop/Ipc/ICallbackHandler.cs | 16 + .../Interop/SparkEnvironment.cs | 9 + .../Network/DefaultSocketWrapper.cs | 5 + .../Microsoft.Spark/Network/ISocketWrapper.cs | 5 + .../Sql/Streaming/DataStreamWriter.cs | 28 ++ .../Sql/Streaming/StreamingQuery.cs | 10 + .../spark/api/dotnet/CallbackClient.scala | 73 +++++ .../spark/api/dotnet/CallbackConnection.scala | 112 +++++++ .../spark/api/dotnet/DotnetBackend.scala | 46 ++- .../api/dotnet/DotnetBackendHandler.scala | 29 +- .../spark/api/dotnet/DotnetException.scala | 13 + .../spark/api/dotnet/CallbackClient.scala | 73 +++++ .../spark/api/dotnet/CallbackConnection.scala | 112 +++++++ .../spark/api/dotnet/DotnetBackend.scala | 46 ++- .../api/dotnet/DotnetBackendHandler.scala | 29 +- .../spark/api/dotnet/DotnetException.scala | 13 + .../sql/api/dotnet/DotnetForeachBatch.scala | 34 +++ .../spark/api/dotnet/CallbackClient.scala | 73 +++++ .../spark/api/dotnet/CallbackConnection.scala | 112 +++++++ .../spark/api/dotnet/DotnetBackend.scala | 46 ++- .../api/dotnet/DotnetBackendHandler.scala | 29 +- .../spark/api/dotnet/DotnetException.scala | 13 + .../sql/api/dotnet/DotnetForeachBatch.scala | 34 +++ 33 files changed, 1731 insertions(+), 156 deletions(-) create mode 
100644 src/csharp/Microsoft.Spark.UnitTest/CallbackTests.cs create mode 100644 src/csharp/Microsoft.Spark.UnitTest/TestUtils/XunitConsoleOutHelper.cs create mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/CallbackConnection.cs create mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/CallbackServer.cs create mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/ForeachBatchCallbackHandler.cs create mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/ICallbackHandler.cs create mode 100644 src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala create mode 100644 src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala create mode 100644 src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala create mode 100644 src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala create mode 100644 src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala create mode 100644 src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala create mode 100644 src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala create mode 100644 src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala create mode 100644 src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala create mode 100644 src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala create mode 100644 src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs index 15c2a22a7..0983035f4 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/Streaming/DataStreamWriterTests.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Threading; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Streaming; @@ -67,6 +68,69 @@ public void TestSignaturesV2_3_X() Assert.IsType(dsw.Trigger(Trigger.Once())); } + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void TestForeachBatch() + { + // Temporary folder to put our test stream input. + using var srcTempDirectory = new TemporaryDirectory(); + // Temporary folder to write ForeachBatch output. + using var dstTempDirectory = new TemporaryDirectory(); + + Func outerUdf = Udf(i => i + 100); + + // id column: [0, 1, ..., 9] + WriteCsv(0, 10, Path.Combine(srcTempDirectory.Path, "input1.csv")); + + DataStreamWriter dsw = _spark + .ReadStream() + .Schema("id INT") + .Csv(srcTempDirectory.Path) + .WriteStream() + .ForeachBatch((df, id) => + { + Func innerUdf = Udf(i => i + 200); + df.Select(outerUdf(innerUdf(Col("id")))) + .Write() + .Csv(Path.Combine(dstTempDirectory.Path, id.ToString())); + }); + + StreamingQuery sq = dsw.Start(); + + // Process until all available data in the source has been processed and committed + // to the ForeachBatch sink. + sq.ProcessAllAvailable(); + + // Add new file to the source path. The spark stream will read any new files + // added to the source path. 
+ // id column: [10, 11, ..., 19] + WriteCsv(10, 10, Path.Combine(srcTempDirectory.Path, "input2.csv")); + + // Process until all available data in the source has been processed and committed + // to the ForeachBatch sink. + sq.ProcessAllAvailable(); + sq.Stop(); + + // Verify folders in the destination path. + string[] csvPaths = + Directory.GetDirectories(dstTempDirectory.Path).OrderBy(s => s).ToArray(); + var expectedPaths = new string[] + { + Path.Combine(dstTempDirectory.Path, "0"), + Path.Combine(dstTempDirectory.Path, "1"), + }; + Assert.True(expectedPaths.SequenceEqual(csvPaths)); + + // Read the generated csv paths and verify contents. + DataFrame df = _spark + .Read() + .Schema("id INT") + .Csv(csvPaths[0], csvPaths[1]) + .Sort("id"); + + IEnumerable actualIds = df.Collect().Select(r => r.GetAs("id")); + Assert.True(Enumerable.Range(300, 20).SequenceEqual(actualIds)); + } + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] public void TestForeach() { @@ -200,6 +264,15 @@ private void TestAndValidateForeach( foreachWriterOutputDF.Collect().Select(r => r.Values)); } + private void WriteCsv(int start, int count, string path) + { + using var streamWriter = new StreamWriter(path); + foreach (int i in Enumerable.Range(start, count)) + { + streamWriter.WriteLine(i); + } + } + [Serializable] private class TestForeachWriter : IForeachWriter { diff --git a/src/csharp/Microsoft.Spark.UnitTest/CallbackTests.cs b/src/csharp/Microsoft.Spark.UnitTest/CallbackTests.cs new file mode 100644 index 000000000..04266e814 --- /dev/null +++ b/src/csharp/Microsoft.Spark.UnitTest/CallbackTests.cs @@ -0,0 +1,239 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
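+// These tests cover callback id registration, the CallbackServer socket loop, and the
+// CallbackConnection handler protocol (END_OF_STREAM / DOTNET_EXCEPTION_THROWN handling).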
+ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Net; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Network; +using Moq; +using Xunit; + +namespace Microsoft.Spark.UnitTest +{ + [Collection("Spark Unit Tests")] + public class CallbackTests + { + private readonly Mock _mockJvm; + + public CallbackTests(SparkFixture fixture) + { + _mockJvm = fixture.MockJvm; + } + + [Fact] + public async Task TestCallbackIds() + { + int numToRegister = 100; + var callbackServer = new CallbackServer(_mockJvm.Object, false); + var callbackHandler = new TestCallbackHandler(); + + var ids = new ConcurrentBag(); + var tasks = new List(); + for (int i = 0; i < numToRegister; ++i) + { + tasks.Add( + Task.Run(() => ids.Add(callbackServer.RegisterCallback(callbackHandler)))); + } + + await Task.WhenAll(tasks); + + IOrderedEnumerable actualIds = ids.OrderBy(i => i); + IEnumerable expectedIds = Enumerable.Range(1, numToRegister); + Assert.True(expectedIds.SequenceEqual(actualIds)); + } + + [Fact] + public void TestCallbackServer() + { + var callbackServer = new CallbackServer(_mockJvm.Object, false); + var callbackHandler = new TestCallbackHandler(); + + callbackHandler.Id = callbackServer.RegisterCallback(callbackHandler); + Assert.Equal(1, callbackHandler.Id); + + using ISocketWrapper callbackSocket = SocketFactory.CreateSocket(); + callbackServer.Run(callbackSocket); + + int connectionNumber = 10; + for (int i = 0; i < connectionNumber; ++i) + { + var ipEndpoint = (IPEndPoint)callbackSocket.LocalEndPoint; + ISocketWrapper clientSocket = SocketFactory.CreateSocket(); + clientSocket.Connect(ipEndpoint.Address, ipEndpoint.Port); + + WriteAndReadTestData(clientSocket, callbackHandler, i, new CancellationToken()); + } + + Assert.Equal(connectionNumber, callbackServer.CurrentNumConnections); + + IOrderedEnumerable actualValues = callbackHandler.Inputs.OrderBy(i => i); + IEnumerable expectedValues = Enumerable + .Range(0, connectionNumber) + .Select(i => callbackHandler.Apply(i)) + .OrderBy(i => i); + Assert.True(expectedValues.SequenceEqual(actualValues)); + } + + [Fact] + public void TestCallbackHandlers() + { + var tokenSource = new CancellationTokenSource(); + var callbackHandlersDict = new ConcurrentDictionary(); + int inputToHandler = 1; + { + // Test CallbackConnection using a ICallbackHandler that runs + // normally without error. + var callbackHandler = new TestCallbackHandler + { + Id = 1 + }; + callbackHandlersDict[callbackHandler.Id] = callbackHandler; + TestCallbackConnection( + callbackHandlersDict, + callbackHandler, + inputToHandler, + tokenSource.Token); + Assert.Single(callbackHandler.Inputs); + Assert.Equal( + callbackHandler.Apply(inputToHandler), + callbackHandler.Inputs.First()); + } + { + // Test CallbackConnection using a ICallbackHandler that + // throws an exception. + var callbackHandler = new ThrowsExceptionHandler + { + Id = 2 + }; + callbackHandlersDict[callbackHandler.Id] = callbackHandler; + TestCallbackConnection( + callbackHandlersDict, + callbackHandler, + inputToHandler, + tokenSource.Token); + Assert.Empty(callbackHandler.Inputs); + } + { + // Test CallbackConnection when cancellation has been requested for the token. 
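+ // (CallbackConnection.Run registers Stop() on the token, so with a cancelled token the
+ // connection shuts down without invoking the handler; the assertions below verify this.)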
+ tokenSource.Cancel(); + var callbackHandler = new TestCallbackHandler + { + Id = 3 + }; + callbackHandlersDict[callbackHandler.Id] = callbackHandler; + TestCallbackConnection( + callbackHandlersDict, + callbackHandler, + inputToHandler, + tokenSource.Token); + Assert.Empty(callbackHandler.Inputs); + } + } + + private void TestCallbackConnection( + ConcurrentDictionary callbackHandlersDict, + ITestCallbackHandler callbackHandler, + int inputToHandler, + CancellationToken token) + { + using ISocketWrapper serverListener = SocketFactory.CreateSocket(); + serverListener.Listen(); + + var ipEndpoint = (IPEndPoint)serverListener.LocalEndPoint; + ISocketWrapper clientSocket = SocketFactory.CreateSocket(); + clientSocket.Connect(ipEndpoint.Address, ipEndpoint.Port); + + var callbackConnection = new CallbackConnection(0, clientSocket, callbackHandlersDict); + Task.Run(() => callbackConnection.Run(token)); + + using ISocketWrapper serverSocket = serverListener.Accept(); + WriteAndReadTestData(serverSocket, callbackHandler, inputToHandler, token); + } + + private void WriteAndReadTestData( + ISocketWrapper socket, + ITestCallbackHandler callbackHandler, + int inputToHandler, + CancellationToken token) + { + Stream inputStream = socket.InputStream; + Stream outputStream = socket.OutputStream; + + SerDe.Write(outputStream, (int)CallbackFlags.CALLBACK); + SerDe.Write(outputStream, callbackHandler.Id); + SerDe.Write(outputStream, sizeof(int)); + SerDe.Write(outputStream, inputToHandler); + SerDe.Write(outputStream, (int)CallbackFlags.END_OF_STREAM); + outputStream.Flush(); + + if (token.IsCancellationRequested) + { + Assert.Throws(() => SerDe.ReadInt32(inputStream)); + } + else + { + int callbackFlag = SerDe.ReadInt32(inputStream); + if (callbackFlag == (int)CallbackFlags.DOTNET_EXCEPTION_THROWN) + { + string exceptionMessage = SerDe.ReadString(inputStream); + Assert.False(string.IsNullOrEmpty(exceptionMessage)); + Assert.Contains(callbackHandler.ExceptionMessage, exceptionMessage); + } + else + { + Assert.Equal((int)CallbackFlags.END_OF_STREAM, callbackFlag); + } + } + } + + private class TestCallbackHandler : ICallbackHandler, ITestCallbackHandler + { + public void Run(Stream inputStream) => Inputs.Add(Apply(SerDe.ReadInt32(inputStream))); + + public ConcurrentBag Inputs { get; } = new ConcurrentBag(); + + public int Id { get; set; } + + public bool Throws { get; } = false; + + public string ExceptionMessage => throw new NotImplementedException(); + + public int Apply(int i) => 10 * i; + } + + private class ThrowsExceptionHandler : ICallbackHandler, ITestCallbackHandler + { + public void Run(Stream inputStream) => throw new Exception(ExceptionMessage); + + public ConcurrentBag Inputs { get; } = new ConcurrentBag(); + + public int Id { get; set; } + + public bool Throws { get; } = true; + + public string ExceptionMessage { get; } = "Dotnet Callback Handler Exception Message"; + + public int Apply(int i) => throw new NotImplementedException(); + } + + private interface ITestCallbackHandler + { + ConcurrentBag Inputs { get; } + + int Id { get; set; } + + bool Throws { get; } + + string ExceptionMessage { get; } + + int Apply(int i); + } + } +} diff --git a/src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs b/src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs index 02f2c8b3b..06c9a3fe2 100644 --- a/src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/SparkFixture.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. 
using System; -using System.IO; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Moq; diff --git a/src/csharp/Microsoft.Spark.UnitTest/TestUtils/XunitConsoleOutHelper.cs b/src/csharp/Microsoft.Spark.UnitTest/TestUtils/XunitConsoleOutHelper.cs new file mode 100644 index 000000000..f71630671 --- /dev/null +++ b/src/csharp/Microsoft.Spark.UnitTest/TestUtils/XunitConsoleOutHelper.cs @@ -0,0 +1,34 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using Xunit.Abstractions; + +namespace Microsoft.Spark.UnitTest.TestUtils +{ + // Tests can subclass this to get Console output to display when using + // xUnit testing framework. + // Workaround found at https://github.com/microsoft/vstest/issues/799 + public class XunitConsoleOutHelper : IDisposable + { + private readonly ITestOutputHelper _output; + private readonly TextWriter _originalOut; + private readonly TextWriter _textWriter; + + public XunitConsoleOutHelper(ITestOutputHelper output) + { + _output = output; + _originalOut = Console.Out; + _textWriter = new StringWriter(); + Console.SetOut(_textWriter); + } + + public void Dispose() + { + _output.WriteLine(_textWriter.ToString()); + Console.SetOut(_originalOut); + } + } +} diff --git a/src/csharp/Microsoft.Spark.Worker.UnitTest/CommandExecutorTests.cs b/src/csharp/Microsoft.Spark.Worker.UnitTest/CommandExecutorTests.cs index 589e0ea0b..8978e321e 100644 --- a/src/csharp/Microsoft.Spark.Worker.UnitTest/CommandExecutorTests.cs +++ b/src/csharp/Microsoft.Spark.Worker.UnitTest/CommandExecutorTests.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; using System.Collections; using System.Collections.Generic; using System.IO; diff --git a/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs b/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs index 5fac38035..d5800bb1e 100644 --- a/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs +++ b/src/csharp/Microsoft.Spark.Worker.UnitTest/DaemonWorkerTests.cs @@ -3,14 +3,10 @@ // See the LICENSE file in the project root for more information. using System; -using System.Collections; using System.Collections.Generic; -using System.IO; using System.Net; using System.Threading.Tasks; -using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Network; -using Razorvine.Pickle; using Xunit; namespace Microsoft.Spark.Worker.UnitTest diff --git a/src/csharp/Microsoft.Spark.Worker.UnitTest/TaskRunnerTests.cs b/src/csharp/Microsoft.Spark.Worker.UnitTest/TaskRunnerTests.cs index 436a45940..86a254f4b 100644 --- a/src/csharp/Microsoft.Spark.Worker.UnitTest/TaskRunnerTests.cs +++ b/src/csharp/Microsoft.Spark.Worker.UnitTest/TaskRunnerTests.cs @@ -2,13 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
-using System.Collections; using System.Collections.Generic; using System.Net; using System.Threading.Tasks; -using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Network; -using Razorvine.Pickle; using Xunit; namespace Microsoft.Spark.Worker.UnitTest diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/CallbackConnection.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/CallbackConnection.cs new file mode 100644 index 000000000..512318429 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/CallbackConnection.cs @@ -0,0 +1,280 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Buffers.Binary; +using System.Collections.Concurrent; +using System.IO; +using System.Threading; +using Microsoft.Spark.Network; +using Microsoft.Spark.Services; + +namespace Microsoft.Spark.Interop.Ipc +{ + /// + /// CallbackConnection is used to process the callback communication between + /// Dotnet and the JVM. It uses a TCP socket to communicate with the JVM side + /// and the socket is expected to be reused. + /// + internal sealed class CallbackConnection + { + private static readonly ILoggerService s_logger = + LoggerServiceFactory.GetLogger(typeof(CallbackConnection)); + + private readonly ISocketWrapper _socket; + + /// + /// Keeps track of all s by its Id. This is accessed + /// by the and the . + /// + private readonly ConcurrentDictionary _callbackHandlers; + + private volatile bool _isRunning = false; + + private int _numCallbacksRun = 0; + + internal CallbackConnection( + long connectionId, + ISocketWrapper socket, + ConcurrentDictionary callbackHandlers) + { + ConnectionId = connectionId; + _socket = socket; + _callbackHandlers = callbackHandlers; + + s_logger.LogInfo( + $"[{ConnectionId}] Connected with RemoteEndPoint: {socket.RemoteEndPoint}"); + } + + private enum ConnectionStatus + { + /// + /// Connection is normal. + /// + OK, + + /// + /// Socket is closed by the JVM. + /// + SOCKET_CLOSED, + + /// + /// Request to close connection. + /// + REQUEST_CLOSE + } + + internal long ConnectionId { get; } + + /// + /// Run and start processing the callback connection. + /// + /// Cancellation token used to stop the connection. + internal void Run(CancellationToken token) + { + _isRunning = true; + Stream inputStream = _socket.InputStream; + Stream outputStream = _socket.OutputStream; + + token.Register(() => Stop()); + + try + { + while (_isRunning) + { + ConnectionStatus connectionStatus = + ProcessStream(inputStream, outputStream, out bool readComplete); + + if (connectionStatus == ConnectionStatus.OK) + { + outputStream.Flush(); + + ++_numCallbacksRun; + + // If the socket is not read through completely, then it cannot be reused. + if (!readComplete) + { + _isRunning = false; + + // Wait for server to complete to avoid 'connection reset' exception. 
+ s_logger.LogInfo( + $"[{ConnectionId}] Sleep 500 millisecond to close socket."); + Thread.Sleep(500); + } + } + else if (connectionStatus == ConnectionStatus.REQUEST_CLOSE) + { + _isRunning = false; + s_logger.LogInfo( + $"[{ConnectionId}] Request to close connection received."); + } + else + { + _isRunning = false; + s_logger.LogWarn($"[{ConnectionId}] Socket is closed by JVM."); + } + } + } + catch (Exception e) + { + _isRunning = false; + s_logger.LogError($"[{ConnectionId}] Exiting with exception: {e}"); + } + finally + { + try + { + _socket.Dispose(); + } + catch (Exception e) + { + s_logger.LogWarn($"[{ConnectionId}] Exception while closing socket {e}"); + } + + s_logger.LogInfo( + $"[{ConnectionId}] Finished running {_numCallbacksRun} callback(s)."); + } + } + + private void Stop() + { + _isRunning = false; + s_logger.LogInfo($"[{ConnectionId}] Stopping CallbackConnection."); + } + + /// + /// Process the input and output streams. + /// + /// The input stream. + /// The output stream. + /// True if stream is read completely, false otherwise. + /// The connection status. + private ConnectionStatus ProcessStream( + Stream inputStream, + Stream outputStream, + out bool readComplete) + { + readComplete = false; + + try + { + byte[] requestFlagBytes = SerDe.ReadBytes(inputStream, sizeof(int)); + // For socket stream, read on the stream returns 0, which + // SerDe.ReadBytes() returns as null to denote the stream is closed. + if (requestFlagBytes == null) + { + return ConnectionStatus.SOCKET_CLOSED; + } + + // Check value of the initial request. Expected values are: + // - CallbackFlags.CLOSE + // - CallbackFlags.CALLBACK + int requestFlag = BinaryPrimitives.ReadInt32BigEndian(requestFlagBytes); + if (requestFlag == (int)CallbackFlags.CLOSE) { + return ConnectionStatus.REQUEST_CLOSE; + } + else if (requestFlag != (int)CallbackFlags.CALLBACK) + { + throw new Exception( + string.Format( + "Unexpected callback flag received. Expected: {0}, Received: {1}.", + CallbackFlags.CALLBACK, + requestFlag)); + } + + // Use callback id to get the registered handler. + int callbackId = SerDe.ReadInt32(inputStream); + if (!_callbackHandlers.TryGetValue( + callbackId, + out ICallbackHandler callbackHandler)) + { + throw new Exception($"Unregistered callback id: {callbackId}"); + } + + s_logger.LogInfo( + string.Format( + "[{0}] Received request for callback id: {1}, callback handler: {2}", + ConnectionId, + callbackId, + callbackHandler)); + + // Save contents of callback handler data to be used later. + using var callbackDataStream = + new MemoryStream(SerDe.ReadBytes(inputStream, SerDe.ReadInt32(inputStream))); + + // Check the end of stream. + int endOfStream = SerDe.ReadInt32(inputStream); + if (endOfStream == (int)CallbackFlags.END_OF_STREAM) + { + s_logger.LogDebug($"[{ConnectionId}] Received END_OF_STREAM signal."); + + // Run callback handler. + callbackHandler.Run(callbackDataStream); + + SerDe.Write(outputStream, (int)CallbackFlags.END_OF_STREAM); + readComplete = true; + } + else + { + // This may happen when the input data is not read completely. + s_logger.LogWarn( + $"[{ConnectionId}] Unexpected end of stream: {endOfStream}."); + + // Write flag to indicate the connection should be closed. 
+ SerDe.Write(outputStream, (int)CallbackFlags.CLOSE); + } + + return ConnectionStatus.OK; + } + catch (Exception e) + { + s_logger.LogError($"[{ConnectionId}] ProcessStream() failed with exception: {e}"); + + try + { + SerDe.Write(outputStream, (int)CallbackFlags.DOTNET_EXCEPTION_THROWN); + SerDe.Write(outputStream, e.ToString()); + } + catch (IOException) + { + // JVM closed the socket. + } + catch (Exception ex) + { + s_logger.LogError( + $"[{ConnectionId}] Writing exception to stream failed with exception: {ex}"); + } + + throw; + } + } + } + + /// + /// Enums with which the Dotnet CallbackConnection communicates with + /// the JVM CallbackConnection. + /// + internal enum CallbackFlags : int + { + /// + /// Flag to indicate connection should be closed. + /// + CLOSE = -1, + + /// + /// Flag to indiciate callback should be called. + /// + CALLBACK = -2, + + /// + /// Flag to indicate an exception thrown from dotnet. + /// + DOTNET_EXCEPTION_THROWN = -3, + + /// + /// Flag to indicate end of stream. + /// + END_OF_STREAM = -4 + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/CallbackServer.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/CallbackServer.cs new file mode 100644 index 000000000..ef6c0407a --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/CallbackServer.cs @@ -0,0 +1,256 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Concurrent; +using System.Net; +using System.Threading; +using Microsoft.Spark.Network; +using Microsoft.Spark.Services; + +namespace Microsoft.Spark.Interop.Ipc +{ + /// + /// CallbackServer services callback requests from the JVM. + /// + internal sealed class CallbackServer + { + private static readonly ILoggerService s_logger = + LoggerServiceFactory.GetLogger(typeof(CallbackServer)); + + private readonly IJvmBridge _jvm; + + /// + /// Keeps track of all s by its Id. This is accessed + /// by the and the + /// running in the worker threads. + /// + private readonly ConcurrentDictionary _callbackHandlers = + new ConcurrentDictionary(); + + /// + /// Keeps track of all objects identified by its + /// . The main thread creates a + /// each time it receives a new socket connection + /// from the JVM side and inserts it into . Each worker + /// thread calls and removes the connection + /// once this call is finished. will not return + /// unless the needs to be closed. + /// Also, is used to bound the number of worker threads + /// since it gives you the total number of active s. + /// + private readonly ConcurrentDictionary _connections = + new ConcurrentDictionary(); + + /// + /// Each worker thread picks up a CallbackConnection from _waitingConnections + /// and runs it. + /// + private readonly BlockingCollection _waitingConnections = + new BlockingCollection(); + + /// + /// A used to notify threads that operations + /// should be canceled. + /// + private readonly CancellationTokenSource _tokenSource = new CancellationTokenSource(); + + /// + /// Counter used to generate a unique id when registering a . 
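+ /// (The counter is advanced with Interlocked.Increment in RegisterCallback, so ids stay unique across threads.)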
+ /// + private int _callbackCounter = 0; + + private bool _isRunning = false; + + private ISocketWrapper _listener; + + internal int CurrentNumConnections => _connections.Count; + + internal CallbackServer(IJvmBridge jvm, bool run = true) + { + AppDomain.CurrentDomain.ProcessExit += (s, e) => Shutdown(); + _jvm = jvm; + + if (run) + { + Run(); + } + } + + /// + /// Produce a unique id and register a with it. + /// + /// The handler to register. + /// A unique id associated with the handler. + internal int RegisterCallback(ICallbackHandler callbackHandler) + { + int callbackId = Interlocked.Increment(ref _callbackCounter); + _callbackHandlers[callbackId] = callbackHandler; + + return callbackId; + } + + /// + /// Runs the callback server. + /// + /// The listening socket. + internal void Run(ISocketWrapper listener) + { + if (_isRunning) + { + s_logger.LogWarn("CallbackServer is already running."); + return; + } + + s_logger.LogInfo($"Starting CallbackServer."); + _isRunning = true; + + try + { + _listener = listener; + _listener.Listen(); + + // Communicate with the JVM the callback server's address and port. + var localEndPoint = (IPEndPoint)_listener.LocalEndPoint; + _jvm.CallStaticJavaMethod( + "DotnetHandler", + "connectCallback", + localEndPoint.Address.ToString(), + localEndPoint.Port); + + s_logger.LogInfo($"Started CallbackServer on {localEndPoint}"); + + // Start accepting connections from JVM. + new Thread(() => StartServer(_listener)) + { + IsBackground = true + }.Start(); + } + catch (Exception e) + { + s_logger.LogError($"CallbackServer exiting with exception: {e}"); + Shutdown(); + } + } + + /// + /// Runs the callback server. + /// + private void Run() + { + Run(SocketFactory.CreateSocket()); + } + + /// + /// Starts listening to any connection from JVM. + /// + /// + private void StartServer(ISocketWrapper listener) + { + try + { + long connectionId = 1; + int numWorkerThreads = 0; + + while (_isRunning) + { + ISocketWrapper socket = listener.Accept(); + var connection = + new CallbackConnection(connectionId, socket, _callbackHandlers); + + _waitingConnections.Add(connection); + _connections[connectionId] = connection; + ++connectionId; + + int numConnections = CurrentNumConnections; + + // Start worker thread until there are at least as many worker threads + // as there are CallbackConnections. CallbackConnections are expected + // to stay open and reuse the socket to service repeated callback + // requests. However, if there is an issue with a connection, then + // CallbackConnection.Run will return, freeing up extra worker threads + // to service any _waitingConnections. + // + // For example, + // Assume there were 5 worker threads, each servicing a CallbackConnection + // (5 total healthy connections). If 2 CallbackConnection sockets closed + // unexpectedly, then there would be 5 worker threads and 3 healthy + // connections. If a new connection request arrived, then the + // CallbackConnection would be added to the _waitingConnections collection + // and no new worker threads would be started (2 worker threads are already + // waiting to take CallbackConnections from _waitingConnections). 
+ while (numWorkerThreads < numConnections) + { + new Thread(RunWorkerThread) + { + IsBackground = true + }.Start(); + ++numWorkerThreads; + } + + s_logger.LogInfo( + $"Pool snapshot: [NumThreads:{numWorkerThreads}], " + + $"[NumConnections:{numConnections}]"); + } + } + catch (Exception e) + { + s_logger.LogError($"StartServer() exits with exception: {e}"); + Shutdown(); + } + } + + /// + /// is called for each worker thread when it starts. + /// doesn't return (except for the error cases), and + /// keeps pulling from and runs the retrieved + /// . + /// + private void RunWorkerThread() + { + try + { + while (_isRunning) + { + if (_waitingConnections.TryTake( + out CallbackConnection connection, + Timeout.Infinite)) + { + // The connection will only return when the connection is closing + // (via CancellationToken) or there are error cases. + connection.Run(_tokenSource.Token); + + // Assume the connection is in a bad state, and do not reuse it. + // Remove it from _connections list to prevent the server thread from + // creating more threads than needed. + _connections.TryRemove(connection.ConnectionId, out CallbackConnection _); + } + } + } + catch (Exception e) + { + s_logger.LogError($"RunWorkerThread() exits with an exception: {e}"); + Shutdown(); + } + } + + /// + /// Shuts down the by canceling any running threads + /// and disposing of resources. + /// + private void Shutdown() + { + s_logger.LogInfo("Shutting down CallbackServer"); + + _tokenSource.Cancel(); + _waitingConnections.Dispose(); + _connections.Clear(); + _callbackHandlers.Clear(); + _listener?.Dispose(); + _isRunning = false; + + _jvm.CallStaticJavaMethod("DotnetHandler", "closeCallback"); + } + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/ForeachBatchCallbackHandler.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/ForeachBatchCallbackHandler.cs new file mode 100644 index 000000000..9ba6ee24a --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/ForeachBatchCallbackHandler.cs @@ -0,0 +1,36 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.IO; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Streaming; + +namespace Microsoft.Spark.Interop.Ipc +{ + /// + /// callback handler. + /// + internal sealed class ForeachBatchCallbackHandler : ICallbackHandler + { + private readonly IJvmBridge _jvm; + + private readonly Action _func; + + internal ForeachBatchCallbackHandler(IJvmBridge jvm, Action func) + { + _jvm = jvm; + _func = func; + } + + public void Run(Stream inputStream) + { + var batchDf = + new DataFrame(new JvmObjectReference(SerDe.ReadString(inputStream), _jvm)); + long batchId = SerDe.ReadInt64(inputStream); + + _func(batchDf, batchId); + } + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/ICallbackHandler.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/ICallbackHandler.cs new file mode 100644 index 000000000..0bd280b01 --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/ICallbackHandler.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.IO; + +namespace Microsoft.Spark.Interop.Ipc +{ + /// + /// Interface for handling callbacks between the JVM and Dotnet. 
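+ /// Implementations read their callback arguments from the provided stream;
+ /// ForeachBatchCallbackHandler above is one example, deserializing a DataFrame
+ /// reference and a batch id before invoking the user function.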
+ /// + internal interface ICallbackHandler + { + void Run(Stream inputStream); + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs b/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs index f2523d065..95fa5d586 100644 --- a/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs +++ b/src/csharp/Microsoft.Spark/Interop/SparkEnvironment.cs @@ -84,5 +84,14 @@ internal static IConfigurationService ConfigurationService s_configurationService = value; } } + + private static CallbackServer s_callbackServer; + internal static CallbackServer CallbackServer + { + get + { + return s_callbackServer ??= new CallbackServer(JvmBridge); + } + } } } diff --git a/src/csharp/Microsoft.Spark/Network/DefaultSocketWrapper.cs b/src/csharp/Microsoft.Spark/Network/DefaultSocketWrapper.cs index 8647a14cb..296bb67df 100644 --- a/src/csharp/Microsoft.Spark/Network/DefaultSocketWrapper.cs +++ b/src/csharp/Microsoft.Spark/Network/DefaultSocketWrapper.cs @@ -137,5 +137,10 @@ private Stream CreateStream(string bufferSizeEnvVarName) /// Returns the local endpoint. /// public EndPoint LocalEndPoint => _innerSocket.LocalEndPoint; + + /// + /// Returns the remote endpoint. + /// + public EndPoint RemoteEndPoint => _innerSocket.RemoteEndPoint; } } diff --git a/src/csharp/Microsoft.Spark/Network/ISocketWrapper.cs b/src/csharp/Microsoft.Spark/Network/ISocketWrapper.cs index c29d7637c..1dbba3c47 100644 --- a/src/csharp/Microsoft.Spark/Network/ISocketWrapper.cs +++ b/src/csharp/Microsoft.Spark/Network/ISocketWrapper.cs @@ -50,5 +50,10 @@ internal interface ISocketWrapper : IDisposable /// Returns the local endpoint. /// EndPoint LocalEndPoint { get; } + + /// + /// Returns the remote endpoint. + /// + EndPoint RemoteEndPoint { get; } } } diff --git a/src/csharp/Microsoft.Spark/Sql/Streaming/DataStreamWriter.cs b/src/csharp/Microsoft.Spark/Sql/Streaming/DataStreamWriter.cs index 2cf752459..f371b0665 100644 --- a/src/csharp/Microsoft.Spark/Sql/Streaming/DataStreamWriter.cs +++ b/src/csharp/Microsoft.Spark/Sql/Streaming/DataStreamWriter.cs @@ -2,7 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; using System.Collections.Generic; +using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql.Types; using Microsoft.Spark.Utils; @@ -205,6 +207,32 @@ public DataStreamWriter Foreach(IForeachWriter writer) return this; } + /// + /// Sets the output of the streaming query to be processed using the provided + /// function. This is supported only in the micro-batch execution modes (that + /// is, when the trigger is not continuous). In every micro-batch, the provided + /// function will be called in every micro-batch with (i) the output rows as a + /// and (ii) the batch identifier. The batchId can be used + /// to deduplicate and transactionally write the output (that is, the provided + /// Dataset) to external systems. The output is guaranteed + /// to exactly same for the same batchId (assuming all operations are deterministic + /// in the query). 
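+ /// An illustrative sketch, mirroring the E2E test added in this change (output path is hypothetical):
+ /// df.WriteStream().ForeachBatch((batchDf, batchId) => batchDf.Write().Csv($"/output/{batchId}")).Start();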
+ /// + /// The function to apply to the DataFrame + /// This DataStreamWriter object + [Since(Versions.V2_4_0)] + public DataStreamWriter ForeachBatch(Action func) + { + int callbackId = SparkEnvironment.CallbackServer.RegisterCallback( + new ForeachBatchCallbackHandler(_jvmObject.Jvm, func)); + _jvmObject.Jvm.CallStaticJavaMethod( + "org.apache.spark.sql.api.dotnet.DotnetForeachBatchHelper", + "callForeachBatch", + this, + callbackId); + return this; + } + /// /// Helper function to add given key/value pair as a new option. /// diff --git a/src/csharp/Microsoft.Spark/Sql/Streaming/StreamingQuery.cs b/src/csharp/Microsoft.Spark/Sql/Streaming/StreamingQuery.cs index 7e948c076..645ca0e0c 100644 --- a/src/csharp/Microsoft.Spark/Sql/Streaming/StreamingQuery.cs +++ b/src/csharp/Microsoft.Spark/Sql/Streaming/StreamingQuery.cs @@ -46,6 +46,16 @@ public sealed class StreamingQuery : IJvmObjectReferenceProvider public bool AwaitTermination(long timeoutMs) => (bool)_jvmObject.Invoke("awaitTermination", timeoutMs); + /// + /// Blocks until all available data in the source has been processed and committed to the + /// sink. This method is intended for testing. Note that in the case of continually + /// arriving data, this method may block forever. Additionally, this method is only + /// guaranteed to block until data that has been synchronously appended data to a + /// `org.apache.spark.sql.execution.streaming.Source` prior to invocation. + /// (i.e. `getOffset` must immediately reflect the addition). + /// + public void ProcessAllAvailable() => _jvmObject.Invoke("processAllAvailable"); + /// /// Stops the execution of this query if it is running. This method blocks until the /// threads performing execution stop. diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala new file mode 100644 index 000000000..0026d78df --- /dev/null +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +import java.io.DataOutputStream + +import org.apache.spark.internal.Logging + +import scala.collection.mutable.Queue + +/** + * CallbackClient is used to communicate with the Dotnet CallbackServer. + * The client manages and maintains a pool of open CallbackConnections. + * Any callback request is delegated to a new CallbackConnection or + * unused CallbackConnection. 
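+ * Requests go through send(callbackId, writeBody), which serializes the body with the
+ * supplied function and reuses a pooled connection when one is available.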
+ * @param address The address of the Dotnet CallbackServer + * @param port The port of the Dotnet CallbackServer + */ +class CallbackClient(address: String, port: Int) extends Logging { + private[this] val connectionPool: Queue[CallbackConnection] = Queue[CallbackConnection]() + + private[this] var isShutdown: Boolean = false + + final def send( + callbackId: Int, + writeBody: DataOutputStream => Unit): Unit = + getOrCreateConnection() match { + case Some(connection) => + try { + connection.send(callbackId, writeBody) + } catch { + case e: Exception => + logError(s"Error calling callback [callback id = $callbackId].", e) + connection.close() + throw e + } + case None => throw new Exception("Unable to get or create connection.") + } + + private def getOrCreateConnection(): Option[CallbackConnection] = synchronized { + if (isShutdown) { + logInfo("Cannot get or create connection while client is shutdown.") + return None + } + + if (connectionPool.nonEmpty) { + return Some(connectionPool.dequeue()) + } + + Some(new CallbackConnection(address, port)) + } + + private def addConnection(connection: CallbackConnection): Unit = synchronized { + assert(connection != null) + connectionPool.enqueue(connection) + } + + def shutdown(): Unit = synchronized { + if (isShutdown) { + logInfo("Shutdown called, but already shutdown.") + return + } + + logInfo("Shutting down.") + connectionPool.foreach(_.close) + connectionPool.clear + isShutdown = true + } +} diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala new file mode 100644 index 000000000..36726181e --- /dev/null +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +import java.io.{ByteArrayOutputStream, Closeable, DataInputStream, DataOutputStream} +import java.net.Socket + +import org.apache.spark.internal.Logging + +/** + * CallbackConnection is used to process the callback communication + * between the JVM and Dotnet. It uses a TCP socket to communicate with + * the Dotnet CallbackServer and the socket is expected to be reused. 
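+ * Each send writes CALLBACK, the callback id, a length-prefixed body, then END_OF_STREAM,
+ * and expects END_OF_STREAM (or DOTNET_EXCEPTION_THROWN) back from the .NET side.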
+ * @param address The address of the Dotnet CallbackServer + * @param port The port of the Dotnet CallbackServer + */ +class CallbackConnection(address: String, port: Int) extends Logging { + private[this] val socket: Socket = new Socket(address, port) + private[this] val inputStream: DataInputStream = new DataInputStream(socket.getInputStream) + private[this] val outputStream: DataOutputStream = new DataOutputStream(socket.getOutputStream) + + def send( + callbackId: Int, + writeBody: DataOutputStream => Unit): Unit = { + logInfo(s"Calling callback [callback id = $callbackId] ...") + + try { + SerDe.writeInt(outputStream, CallbackFlags.CALLBACK) + SerDe.writeInt(outputStream, callbackId) + + val byteArrayOutputStream = new ByteArrayOutputStream() + writeBody(new DataOutputStream(byteArrayOutputStream)) + SerDe.writeInt(outputStream, byteArrayOutputStream.size) + byteArrayOutputStream.writeTo(outputStream); + } catch { + case e: Exception => { + throw new Exception("Error writing to stream.", e) + } + } + + logInfo(s"Signaling END_OF_STREAM.") + try { + SerDe.writeInt(outputStream, CallbackFlags.END_OF_STREAM) + outputStream.flush() + + val endOfStreamResponse = readFlag(inputStream) + endOfStreamResponse match { + case CallbackFlags.END_OF_STREAM => + logInfo(s"Received END_OF_STREAM signal. Calling callback [callback id = $callbackId] successful.") + case _ => { + throw new Exception(s"Error verifying end of stream. Expected: ${CallbackFlags.END_OF_STREAM}, " + + s"Received: $endOfStreamResponse") + } + } + } catch { + case e: Exception => { + throw new Exception("Error while verifying end of stream.", e) + } + } + } + + def close(): Unit = { + try { + SerDe.writeInt(outputStream, CallbackFlags.CLOSE) + outputStream.flush() + } catch { + case e: Exception => logInfo("Unable to send close to .NET callback server.", e) + } + + close(socket) + close(outputStream) + close(inputStream) + } + + private def close(s: Socket): Unit = { + try { + assert(s != null) + s.close() + } catch { + case e: Exception => logInfo("Unable to close socket.", e) + } + } + + private def close(c: Closeable): Unit = { + try { + assert(c != null) + c.close() + } catch { + case e: Exception => logInfo("Unable to close closeable.", e) + } + } + + private def readFlag(inputStream: DataInputStream): Int = { + val callbackFlag = SerDe.readInt(inputStream) + if (callbackFlag == CallbackFlags.DOTNET_EXCEPTION_THROWN) { + val exceptionMessage = SerDe.readString(inputStream) + throw new DotnetException(exceptionMessage) + } + callbackFlag + } + + private object CallbackFlags { + val CLOSE: Int = -1 + val CALLBACK: Int = -2 + val DOTNET_EXCEPTION_THROWN: Int = -3 + val END_OF_STREAM: Int = -4 + } +} \ No newline at end of file diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala index 45b3cd5a4..002945bb8 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala @@ -6,9 +6,8 @@ package org.apache.spark.api.dotnet -import java.io.DataOutputStream -import java.net.{InetSocketAddress, Socket} -import java.util.concurrent.{BlockingQueue, LinkedBlockingQueue, TimeUnit} +import java.net.InetSocketAddress +import java.util.concurrent.TimeUnit import io.netty.bootstrap.ServerBootstrap import io.netty.channel.nio.NioEventLoopGroup @@ 
-50,7 +49,6 @@ class DotnetBackend extends Logging { // lengthFieldLength = 4 // lengthAdjustment = 0 // initialBytesToStrip = 4, i.e. strip out the length field itself - // new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) .addLast("decoder", new ByteArrayDecoder()) .addLast("handler", new DotnetBackendHandler(self)) @@ -81,30 +79,26 @@ class DotnetBackend extends Logging { bootstrap = null // Send close to .NET callback server. - logInfo("Requesting to close all call back sockets") - var socket: Socket = null - do { - socket = DotnetBackend.callbackSockets.poll() - if (socket != null) { - try { - val dos = new DataOutputStream(socket.getOutputStream) - SerDe.writeString(dos, "close") - socket.close() - socket = null - } catch { - case e: Exception => logError("Exception when closing socket: ", e) - } - } - } while (socket != null) - DotnetBackend.callbackSocketShutdown = true + DotnetBackend.shutdownCallbackClient() } } -object DotnetBackend { - // Channels to callback server. - private[spark] val callbackSockets: BlockingQueue[Socket] = new LinkedBlockingQueue[Socket]() - @volatile private[spark] var callbackPort: Int = 0 +object DotnetBackend extends Logging { + @volatile private[spark] var callbackClient: CallbackClient = null + + private[spark] def setCallbackClient(address: String, port: Int) = synchronized { + if (DotnetBackend.callbackClient == null) { + logInfo(s"Connecting to a callback server at $address:$port") + DotnetBackend.callbackClient = new CallbackClient(address, port) + } else { + throw new Exception("Callback client already set.") + } + } - // flag to denote whether the callback socket is shutdown explicitly - @volatile private[spark] var callbackSocketShutdown: Boolean = false + private[spark] def shutdownCallbackClient(): Unit = synchronized { + if (callbackClient != null) { + callbackClient.shutdown() + callbackClient = null + } + } } diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala index f4e9490a0..1cde1d1c5 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala @@ -7,7 +7,6 @@ package org.apache.spark.api.dotnet import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} -import java.net.Socket import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} import org.apache.spark.api.dotnet.SerDe._ @@ -67,32 +66,16 @@ class DotnetBackendHandler(server: DotnetBackend) writeInt(dos, -1) } case "connectCallback" => - val t = readObjectType(dis) - assert(t == 'i') + assert(readObjectType(dis) == 'c') + val address = readString(dis) + assert(readObjectType(dis) == 'i') val port = readInt(dis) - logInfo(s"Connecting to a callback server at port $port") - DotnetBackend.callbackPort = port + DotnetBackend.setCallbackClient(address, port); writeInt(dos, 0) writeType(dos, "void") case "closeCallback" => - // Send close to .NET callback server. 
- logInfo("Requesting to close all call back sockets.") - var socket: Socket = null - do { - socket = DotnetBackend.callbackSockets.poll() - if (socket != null) { - val dataOutputStream = new DataOutputStream(socket.getOutputStream) - SerDe.writeString(dataOutputStream, "close") - try { - socket.close() - socket = null - } catch { - case e: Exception => logError("Exception when closing socket: ", e) - } - } - } while (socket != null) - DotnetBackend.callbackSocketShutdown = true - + logInfo("Requesting to close callback client") + DotnetBackend.shutdownCallbackClient() writeInt(dos, 0) writeType(dos, "void") diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala new file mode 100644 index 000000000..c70d16b03 --- /dev/null +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala @@ -0,0 +1,13 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +class DotnetException(message: String, cause: Throwable) + extends Exception(message, cause) { + + def this(message: String) = this(message, null) +} diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala new file mode 100644 index 000000000..0026d78df --- /dev/null +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +import java.io.DataOutputStream + +import org.apache.spark.internal.Logging + +import scala.collection.mutable.Queue + +/** + * CallbackClient is used to communicate with the Dotnet CallbackServer. + * The client manages and maintains a pool of open CallbackConnections. + * Any callback request is delegated to a new CallbackConnection or + * unused CallbackConnection. 
+ * @param address The address of the Dotnet CallbackServer + * @param port The port of the Dotnet CallbackServer + */ +class CallbackClient(address: String, port: Int) extends Logging { + private[this] val connectionPool: Queue[CallbackConnection] = Queue[CallbackConnection]() + + private[this] var isShutdown: Boolean = false + + final def send( + callbackId: Int, + writeBody: DataOutputStream => Unit): Unit = + getOrCreateConnection() match { + case Some(connection) => + try { + connection.send(callbackId, writeBody) + } catch { + case e: Exception => + logError(s"Error calling callback [callback id = $callbackId].", e) + connection.close() + throw e + } + case None => throw new Exception("Unable to get or create connection.") + } + + private def getOrCreateConnection(): Option[CallbackConnection] = synchronized { + if (isShutdown) { + logInfo("Cannot get or create connection while client is shutdown.") + return None + } + + if (connectionPool.nonEmpty) { + return Some(connectionPool.dequeue()) + } + + Some(new CallbackConnection(address, port)) + } + + private def addConnection(connection: CallbackConnection): Unit = synchronized { + assert(connection != null) + connectionPool.enqueue(connection) + } + + def shutdown(): Unit = synchronized { + if (isShutdown) { + logInfo("Shutdown called, but already shutdown.") + return + } + + logInfo("Shutting down.") + connectionPool.foreach(_.close) + connectionPool.clear + isShutdown = true + } +} diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala new file mode 100644 index 000000000..36726181e --- /dev/null +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +import java.io.{ByteArrayOutputStream, Closeable, DataInputStream, DataOutputStream} +import java.net.Socket + +import org.apache.spark.internal.Logging + +/** + * CallbackConnection is used to process the callback communication + * between the JVM and Dotnet. It uses a TCP socket to communicate with + * the Dotnet CallbackServer and the socket is expected to be reused. 
+ * @param address The address of the Dotnet CallbackServer + * @param port The port of the Dotnet CallbackServer + */ +class CallbackConnection(address: String, port: Int) extends Logging { + private[this] val socket: Socket = new Socket(address, port) + private[this] val inputStream: DataInputStream = new DataInputStream(socket.getInputStream) + private[this] val outputStream: DataOutputStream = new DataOutputStream(socket.getOutputStream) + + def send( + callbackId: Int, + writeBody: DataOutputStream => Unit): Unit = { + logInfo(s"Calling callback [callback id = $callbackId] ...") + + try { + SerDe.writeInt(outputStream, CallbackFlags.CALLBACK) + SerDe.writeInt(outputStream, callbackId) + + val byteArrayOutputStream = new ByteArrayOutputStream() + writeBody(new DataOutputStream(byteArrayOutputStream)) + SerDe.writeInt(outputStream, byteArrayOutputStream.size) + byteArrayOutputStream.writeTo(outputStream); + } catch { + case e: Exception => { + throw new Exception("Error writing to stream.", e) + } + } + + logInfo(s"Signaling END_OF_STREAM.") + try { + SerDe.writeInt(outputStream, CallbackFlags.END_OF_STREAM) + outputStream.flush() + + val endOfStreamResponse = readFlag(inputStream) + endOfStreamResponse match { + case CallbackFlags.END_OF_STREAM => + logInfo(s"Received END_OF_STREAM signal. Calling callback [callback id = $callbackId] successful.") + case _ => { + throw new Exception(s"Error verifying end of stream. Expected: ${CallbackFlags.END_OF_STREAM}, " + + s"Received: $endOfStreamResponse") + } + } + } catch { + case e: Exception => { + throw new Exception("Error while verifying end of stream.", e) + } + } + } + + def close(): Unit = { + try { + SerDe.writeInt(outputStream, CallbackFlags.CLOSE) + outputStream.flush() + } catch { + case e: Exception => logInfo("Unable to send close to .NET callback server.", e) + } + + close(socket) + close(outputStream) + close(inputStream) + } + + private def close(s: Socket): Unit = { + try { + assert(s != null) + s.close() + } catch { + case e: Exception => logInfo("Unable to close socket.", e) + } + } + + private def close(c: Closeable): Unit = { + try { + assert(c != null) + c.close() + } catch { + case e: Exception => logInfo("Unable to close closeable.", e) + } + } + + private def readFlag(inputStream: DataInputStream): Int = { + val callbackFlag = SerDe.readInt(inputStream) + if (callbackFlag == CallbackFlags.DOTNET_EXCEPTION_THROWN) { + val exceptionMessage = SerDe.readString(inputStream) + throw new DotnetException(exceptionMessage) + } + callbackFlag + } + + private object CallbackFlags { + val CLOSE: Int = -1 + val CALLBACK: Int = -2 + val DOTNET_EXCEPTION_THROWN: Int = -3 + val END_OF_STREAM: Int = -4 + } +} \ No newline at end of file diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala index 45b3cd5a4..002945bb8 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala @@ -6,9 +6,8 @@ package org.apache.spark.api.dotnet -import java.io.DataOutputStream -import java.net.{InetSocketAddress, Socket} -import java.util.concurrent.{BlockingQueue, LinkedBlockingQueue, TimeUnit} +import java.net.InetSocketAddress +import java.util.concurrent.TimeUnit import io.netty.bootstrap.ServerBootstrap import io.netty.channel.nio.NioEventLoopGroup @@ 
-50,7 +49,6 @@ class DotnetBackend extends Logging { // lengthFieldLength = 4 // lengthAdjustment = 0 // initialBytesToStrip = 4, i.e. strip out the length field itself - // new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) .addLast("decoder", new ByteArrayDecoder()) .addLast("handler", new DotnetBackendHandler(self)) @@ -81,30 +79,26 @@ class DotnetBackend extends Logging { bootstrap = null // Send close to .NET callback server. - logInfo("Requesting to close all call back sockets") - var socket: Socket = null - do { - socket = DotnetBackend.callbackSockets.poll() - if (socket != null) { - try { - val dos = new DataOutputStream(socket.getOutputStream) - SerDe.writeString(dos, "close") - socket.close() - socket = null - } catch { - case e: Exception => logError("Exception when closing socket: ", e) - } - } - } while (socket != null) - DotnetBackend.callbackSocketShutdown = true + DotnetBackend.shutdownCallbackClient() } } -object DotnetBackend { - // Channels to callback server. - private[spark] val callbackSockets: BlockingQueue[Socket] = new LinkedBlockingQueue[Socket]() - @volatile private[spark] var callbackPort: Int = 0 +object DotnetBackend extends Logging { + @volatile private[spark] var callbackClient: CallbackClient = null + + private[spark] def setCallbackClient(address: String, port: Int) = synchronized { + if (DotnetBackend.callbackClient == null) { + logInfo(s"Connecting to a callback server at $address:$port") + DotnetBackend.callbackClient = new CallbackClient(address, port) + } else { + throw new Exception("Callback client already set.") + } + } - // flag to denote whether the callback socket is shutdown explicitly - @volatile private[spark] var callbackSocketShutdown: Boolean = false + private[spark] def shutdownCallbackClient(): Unit = synchronized { + if (callbackClient != null) { + callbackClient.shutdown() + callbackClient = null + } + } } diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala index f4e9490a0..1cde1d1c5 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala @@ -7,7 +7,6 @@ package org.apache.spark.api.dotnet import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} -import java.net.Socket import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} import org.apache.spark.api.dotnet.SerDe._ @@ -67,32 +66,16 @@ class DotnetBackendHandler(server: DotnetBackend) writeInt(dos, -1) } case "connectCallback" => - val t = readObjectType(dis) - assert(t == 'i') + assert(readObjectType(dis) == 'c') + val address = readString(dis) + assert(readObjectType(dis) == 'i') val port = readInt(dis) - logInfo(s"Connecting to a callback server at port $port") - DotnetBackend.callbackPort = port + DotnetBackend.setCallbackClient(address, port); writeInt(dos, 0) writeType(dos, "void") case "closeCallback" => - // Send close to .NET callback server. 
- logInfo("Requesting to close all call back sockets.") - var socket: Socket = null - do { - socket = DotnetBackend.callbackSockets.poll() - if (socket != null) { - val dataOutputStream = new DataOutputStream(socket.getOutputStream) - SerDe.writeString(dataOutputStream, "close") - try { - socket.close() - socket = null - } catch { - case e: Exception => logError("Exception when closing socket: ", e) - } - } - } while (socket != null) - DotnetBackend.callbackSocketShutdown = true - + logInfo("Requesting to close callback client") + DotnetBackend.shutdownCallbackClient() writeInt(dos, 0) writeType(dos, "void") diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala new file mode 100644 index 000000000..c70d16b03 --- /dev/null +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala @@ -0,0 +1,13 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +class DotnetException(message: String, cause: Throwable) + extends Exception(message, cause) { + + def this(message: String) = this(message, null) +} diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala new file mode 100644 index 000000000..c0de9c7bc --- /dev/null +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.sql.api.dotnet + +import org.apache.spark.api.dotnet.{CallbackClient, DotnetBackend, SerDe} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.streaming.DataStreamWriter + +class DotnetForeachBatchFunction(callbackClient: CallbackClient, callbackId: Int) extends Logging { + def call(batchDF: DataFrame, batchId: Long): Unit = + callbackClient.send( + callbackId, + dos => { + SerDe.writeJObj(dos, batchDF) + SerDe.writeLong(dos, batchId) + }) +} + +object DotnetForeachBatchHelper { + def callForeachBatch(dsw: DataStreamWriter[Row], callbackId: Int): Unit = { + val callbackClient = DotnetBackend.callbackClient + if (callbackClient == null) { + throw new Exception("DotnetBackend.callbackClient is null.") + } + + val dotnetForeachFunc = new DotnetForeachBatchFunction(callbackClient, callbackId) + dsw.foreachBatch(dotnetForeachFunc.call _) + } +} diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala new file mode 100644 index 000000000..0026d78df --- /dev/null +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackClient.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. 
+ * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +import java.io.DataOutputStream + +import org.apache.spark.internal.Logging + +import scala.collection.mutable.Queue + +/** + * CallbackClient is used to communicate with the Dotnet CallbackServer. + * The client manages and maintains a pool of open CallbackConnections. + * Any callback request is delegated to a new CallbackConnection or + * unused CallbackConnection. + * @param address The address of the Dotnet CallbackServer + * @param port The port of the Dotnet CallbackServer + */ +class CallbackClient(address: String, port: Int) extends Logging { + private[this] val connectionPool: Queue[CallbackConnection] = Queue[CallbackConnection]() + + private[this] var isShutdown: Boolean = false + + final def send( + callbackId: Int, + writeBody: DataOutputStream => Unit): Unit = + getOrCreateConnection() match { + case Some(connection) => + try { + connection.send(callbackId, writeBody) + } catch { + case e: Exception => + logError(s"Error calling callback [callback id = $callbackId].", e) + connection.close() + throw e + } + case None => throw new Exception("Unable to get or create connection.") + } + + private def getOrCreateConnection(): Option[CallbackConnection] = synchronized { + if (isShutdown) { + logInfo("Cannot get or create connection while client is shutdown.") + return None + } + + if (connectionPool.nonEmpty) { + return Some(connectionPool.dequeue()) + } + + Some(new CallbackConnection(address, port)) + } + + private def addConnection(connection: CallbackConnection): Unit = synchronized { + assert(connection != null) + connectionPool.enqueue(connection) + } + + def shutdown(): Unit = synchronized { + if (isShutdown) { + logInfo("Shutdown called, but already shutdown.") + return + } + + logInfo("Shutting down.") + connectionPool.foreach(_.close) + connectionPool.clear + isShutdown = true + } +} diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala new file mode 100644 index 000000000..36726181e --- /dev/null +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/CallbackConnection.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +import java.io.{ByteArrayOutputStream, Closeable, DataInputStream, DataOutputStream} +import java.net.Socket + +import org.apache.spark.internal.Logging + +/** + * CallbackConnection is used to process the callback communication + * between the JVM and Dotnet. It uses a TCP socket to communicate with + * the Dotnet CallbackServer and the socket is expected to be reused. 
+ * @param address The address of the Dotnet CallbackServer + * @param port The port of the Dotnet CallbackServer + */ +class CallbackConnection(address: String, port: Int) extends Logging { + private[this] val socket: Socket = new Socket(address, port) + private[this] val inputStream: DataInputStream = new DataInputStream(socket.getInputStream) + private[this] val outputStream: DataOutputStream = new DataOutputStream(socket.getOutputStream) + + def send( + callbackId: Int, + writeBody: DataOutputStream => Unit): Unit = { + logInfo(s"Calling callback [callback id = $callbackId] ...") + + try { + SerDe.writeInt(outputStream, CallbackFlags.CALLBACK) + SerDe.writeInt(outputStream, callbackId) + + val byteArrayOutputStream = new ByteArrayOutputStream() + writeBody(new DataOutputStream(byteArrayOutputStream)) + SerDe.writeInt(outputStream, byteArrayOutputStream.size) + byteArrayOutputStream.writeTo(outputStream); + } catch { + case e: Exception => { + throw new Exception("Error writing to stream.", e) + } + } + + logInfo(s"Signaling END_OF_STREAM.") + try { + SerDe.writeInt(outputStream, CallbackFlags.END_OF_STREAM) + outputStream.flush() + + val endOfStreamResponse = readFlag(inputStream) + endOfStreamResponse match { + case CallbackFlags.END_OF_STREAM => + logInfo(s"Received END_OF_STREAM signal. Calling callback [callback id = $callbackId] successful.") + case _ => { + throw new Exception(s"Error verifying end of stream. Expected: ${CallbackFlags.END_OF_STREAM}, " + + s"Received: $endOfStreamResponse") + } + } + } catch { + case e: Exception => { + throw new Exception("Error while verifying end of stream.", e) + } + } + } + + def close(): Unit = { + try { + SerDe.writeInt(outputStream, CallbackFlags.CLOSE) + outputStream.flush() + } catch { + case e: Exception => logInfo("Unable to send close to .NET callback server.", e) + } + + close(socket) + close(outputStream) + close(inputStream) + } + + private def close(s: Socket): Unit = { + try { + assert(s != null) + s.close() + } catch { + case e: Exception => logInfo("Unable to close socket.", e) + } + } + + private def close(c: Closeable): Unit = { + try { + assert(c != null) + c.close() + } catch { + case e: Exception => logInfo("Unable to close closeable.", e) + } + } + + private def readFlag(inputStream: DataInputStream): Int = { + val callbackFlag = SerDe.readInt(inputStream) + if (callbackFlag == CallbackFlags.DOTNET_EXCEPTION_THROWN) { + val exceptionMessage = SerDe.readString(inputStream) + throw new DotnetException(exceptionMessage) + } + callbackFlag + } + + private object CallbackFlags { + val CLOSE: Int = -1 + val CALLBACK: Int = -2 + val DOTNET_EXCEPTION_THROWN: Int = -3 + val END_OF_STREAM: Int = -4 + } +} \ No newline at end of file diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala index 45b3cd5a4..002945bb8 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackend.scala @@ -6,9 +6,8 @@ package org.apache.spark.api.dotnet -import java.io.DataOutputStream -import java.net.{InetSocketAddress, Socket} -import java.util.concurrent.{BlockingQueue, LinkedBlockingQueue, TimeUnit} +import java.net.InetSocketAddress +import java.util.concurrent.TimeUnit import io.netty.bootstrap.ServerBootstrap import io.netty.channel.nio.NioEventLoopGroup @@ 
-50,7 +49,6 @@ class DotnetBackend extends Logging { // lengthFieldLength = 4 // lengthAdjustment = 0 // initialBytesToStrip = 4, i.e. strip out the length field itself - // new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) new LengthFieldBasedFrameDecoder(Integer.MAX_VALUE, 0, 4, 0, 4)) .addLast("decoder", new ByteArrayDecoder()) .addLast("handler", new DotnetBackendHandler(self)) @@ -81,30 +79,26 @@ class DotnetBackend extends Logging { bootstrap = null // Send close to .NET callback server. - logInfo("Requesting to close all call back sockets") - var socket: Socket = null - do { - socket = DotnetBackend.callbackSockets.poll() - if (socket != null) { - try { - val dos = new DataOutputStream(socket.getOutputStream) - SerDe.writeString(dos, "close") - socket.close() - socket = null - } catch { - case e: Exception => logError("Exception when closing socket: ", e) - } - } - } while (socket != null) - DotnetBackend.callbackSocketShutdown = true + DotnetBackend.shutdownCallbackClient() } } -object DotnetBackend { - // Channels to callback server. - private[spark] val callbackSockets: BlockingQueue[Socket] = new LinkedBlockingQueue[Socket]() - @volatile private[spark] var callbackPort: Int = 0 +object DotnetBackend extends Logging { + @volatile private[spark] var callbackClient: CallbackClient = null + + private[spark] def setCallbackClient(address: String, port: Int) = synchronized { + if (DotnetBackend.callbackClient == null) { + logInfo(s"Connecting to a callback server at $address:$port") + DotnetBackend.callbackClient = new CallbackClient(address, port) + } else { + throw new Exception("Callback client already set.") + } + } - // flag to denote whether the callback socket is shutdown explicitly - @volatile private[spark] var callbackSocketShutdown: Boolean = false + private[spark] def shutdownCallbackClient(): Unit = synchronized { + if (callbackClient != null) { + callbackClient.shutdown() + callbackClient = null + } + } } diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala index f4e9490a0..1cde1d1c5 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetBackendHandler.scala @@ -7,7 +7,6 @@ package org.apache.spark.api.dotnet import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} -import java.net.Socket import io.netty.channel.{ChannelHandlerContext, SimpleChannelInboundHandler} import org.apache.spark.api.dotnet.SerDe._ @@ -67,32 +66,16 @@ class DotnetBackendHandler(server: DotnetBackend) writeInt(dos, -1) } case "connectCallback" => - val t = readObjectType(dis) - assert(t == 'i') + assert(readObjectType(dis) == 'c') + val address = readString(dis) + assert(readObjectType(dis) == 'i') val port = readInt(dis) - logInfo(s"Connecting to a callback server at port $port") - DotnetBackend.callbackPort = port + DotnetBackend.setCallbackClient(address, port); writeInt(dos, 0) writeType(dos, "void") case "closeCallback" => - // Send close to .NET callback server. 
- logInfo("Requesting to close all call back sockets.") - var socket: Socket = null - do { - socket = DotnetBackend.callbackSockets.poll() - if (socket != null) { - val dataOutputStream = new DataOutputStream(socket.getOutputStream) - SerDe.writeString(dataOutputStream, "close") - try { - socket.close() - socket = null - } catch { - case e: Exception => logError("Exception when closing socket: ", e) - } - } - } while (socket != null) - DotnetBackend.callbackSocketShutdown = true - + logInfo("Requesting to close callback client") + DotnetBackend.shutdownCallbackClient() writeInt(dos, 0) writeType(dos, "void") diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala new file mode 100644 index 000000000..c70d16b03 --- /dev/null +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/DotnetException.scala @@ -0,0 +1,13 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. + */ + +package org.apache.spark.api.dotnet + +class DotnetException(message: String, cause: Throwable) + extends Exception(message, cause) { + + def this(message: String) = this(message, null) +} diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala new file mode 100644 index 000000000..c0de9c7bc --- /dev/null +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/sql/api/dotnet/DotnetForeachBatch.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the .NET Foundation under one or more agreements. + * The .NET Foundation licenses this file to you under the MIT license. + * See the LICENSE file in the project root for more information. 
+ */ + +package org.apache.spark.sql.api.dotnet + +import org.apache.spark.api.dotnet.{CallbackClient, DotnetBackend, SerDe} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.streaming.DataStreamWriter + +class DotnetForeachBatchFunction(callbackClient: CallbackClient, callbackId: Int) extends Logging { + def call(batchDF: DataFrame, batchId: Long): Unit = + callbackClient.send( + callbackId, + dos => { + SerDe.writeJObj(dos, batchDF) + SerDe.writeLong(dos, batchId) + }) +} + +object DotnetForeachBatchHelper { + def callForeachBatch(dsw: DataStreamWriter[Row], callbackId: Int): Unit = { + val callbackClient = DotnetBackend.callbackClient + if (callbackClient == null) { + throw new Exception("DotnetBackend.callbackClient is null.") + } + + val dotnetForeachFunc = new DotnetForeachBatchFunction(callbackClient, callbackId) + dsw.foreachBatch(dotnetForeachFunc.call _) + } +} From c889f39a2827b94ad1ad6503f18836633f5fd00c Mon Sep 17 00:00:00 2001 From: elvaliuliuliu <47404285+elvaliuliuliu@users.noreply.github.com> Date: Tue, 23 Jun 2020 10:58:15 -0700 Subject: [PATCH 14/27] Fix NRE for TimestampType and DateType and support nullable value types (#530) --- .../IpcTests/Sql/SparkSessionTests.cs | 48 +++++++++++++++---- .../Microsoft.Spark/Sql/SparkSession.cs | 45 ++++++++++++++--- .../Microsoft.Spark/Sql/Types/SimpleTypes.cs | 19 ++++---- 3 files changed, 89 insertions(+), 23 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs index c312ddc6c..5a70a6698 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs @@ -94,7 +94,7 @@ public void TestCreateDataFrame() // Calling CreateDataFrame(IEnumerable _) without schema { - var data = new List(new string[] { "Alice", "Bob" }); + var data = new string[] { "Alice", "Bob", null }; StructType schema = SchemaWithSingleColumn(new StringType()); DataFrame df = _spark.CreateDataFrame(data); @@ -103,7 +103,16 @@ public void TestCreateDataFrame() // Calling CreateDataFrame(IEnumerable _) without schema { - var data = new List(new int[] { 1, 2 }); + var data = new int[] { 1, 2 }; + StructType schema = SchemaWithSingleColumn(new IntegerType(), false); + + DataFrame df = _spark.CreateDataFrame(data); + ValidateDataFrame(df, data.Select(a => new object[] { a }), schema); + } + + // Calling CreateDataFrame(IEnumerable _) without schema + { + var data = new int?[] { 1, 2, null }; StructType schema = SchemaWithSingleColumn(new IntegerType()); DataFrame df = _spark.CreateDataFrame(data); @@ -112,7 +121,16 @@ public void TestCreateDataFrame() // Calling CreateDataFrame(IEnumerable _) without schema { - var data = new List(new double[] { 1.2, 2.3 }); + var data = new double[] { 1.2, 2.3 }; + StructType schema = SchemaWithSingleColumn(new DoubleType(), false); + + DataFrame df = _spark.CreateDataFrame(data); + ValidateDataFrame(df, data.Select(a => new object[] { a }), schema); + } + + // Calling CreateDataFrame(IEnumerable _) without schema + { + var data = new double?[] { 1.2, 2.3, null }; StructType schema = SchemaWithSingleColumn(new DoubleType()); DataFrame df = _spark.CreateDataFrame(data); @@ -121,19 +139,29 @@ public void TestCreateDataFrame() // Calling CreateDataFrame(IEnumerable _) without schema { - var data = new List(new bool[] { true, false }); + var data = new bool[] { true, 
false }; + StructType schema = SchemaWithSingleColumn(new BooleanType(), false); + + DataFrame df = _spark.CreateDataFrame(data); + ValidateDataFrame(df, data.Select(a => new object[] { a }), schema); + } + + // Calling CreateDataFrame(IEnumerable _) without schema + { + var data = new bool?[] { true, false, null }; StructType schema = SchemaWithSingleColumn(new BooleanType()); DataFrame df = _spark.CreateDataFrame(data); ValidateDataFrame(df, data.Select(a => new object[] { a }), schema); } - + // Calling CreateDataFrame(IEnumerable _) without schema { var data = new Date[] { new Date(2020, 1, 1), - new Date(2020, 1, 2) + new Date(2020, 1, 2), + null }; StructType schema = SchemaWithSingleColumn(new DateType()); @@ -151,7 +179,8 @@ public void TestCreateDataFrameWithTimestamp() var data = new Timestamp[] { new Timestamp(2020, 1, 1, 0, 0, 0, 0), - new Timestamp(2020, 1, 2, 15, 30, 30, 0) + new Timestamp(2020, 1, 2, 15, 30, 30, 0), + null }; StructType schema = SchemaWithSingleColumn(new TimestampType()); @@ -172,8 +201,9 @@ private void ValidateDataFrame( /// Returns a single column schema of the given datatype. /// /// Datatype of the column + /// Indicates if values of the column can be null /// Schema as StructType - private StructType SchemaWithSingleColumn(DataType dataType) => - new StructType(new[] { new StructField("_1", dataType) }); + private StructType SchemaWithSingleColumn(DataType dataType, bool isNullable = true) => + new StructType(new[] { new StructField("_1", dataType, isNullable) }); } } diff --git a/src/csharp/Microsoft.Spark/Sql/SparkSession.cs b/src/csharp/Microsoft.Spark/Sql/SparkSession.cs index fc706081f..f0eab693f 100644 --- a/src/csharp/Microsoft.Spark/Sql/SparkSession.cs +++ b/src/csharp/Microsoft.Spark/Sql/SparkSession.cs @@ -151,9 +151,9 @@ public DataFrame Table(string tableName) => new DataFrame((JvmObjectReference)_jvmObject.Invoke("table", tableName)); /// - /// Creates a from an containing + /// Creates a from an containing /// s using the given schema. - /// It is important to make sure that the structure of every of + /// It is important to make sure that the structure of every of /// the provided matches /// the provided schema. Otherwise, there will be runtime exception. 
/// @@ -172,10 +172,21 @@ public DataFrame CreateDataFrame(IEnumerable data, StructType schema /// of type /// Dataframe object public DataFrame CreateDataFrame(IEnumerable data) => + CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new IntegerType(), false)); + + /// + /// Creates a Dataframe given data as of type + /// + /// + /// of type + /// + /// Dataframe object + public DataFrame CreateDataFrame(IEnumerable data) => CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new IntegerType())); /// - /// Creates a Dataframe given data as of type + /// Creates a Dataframe given data as of type + /// /// /// of type /// Dataframe object @@ -183,11 +194,22 @@ public DataFrame CreateDataFrame(IEnumerable data) => CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new StringType())); /// - /// Creates a Dataframe given data as of type + /// Creates a Dataframe given data as of type + /// /// /// of type /// Dataframe object public DataFrame CreateDataFrame(IEnumerable data) => + CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new DoubleType(), false)); + + /// + /// Creates a Dataframe given data as of type + /// + /// + /// of type + /// + /// Dataframe object + public DataFrame CreateDataFrame(IEnumerable data) => CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new DoubleType())); /// @@ -196,6 +218,16 @@ public DataFrame CreateDataFrame(IEnumerable data) => /// of type /// Dataframe object public DataFrame CreateDataFrame(IEnumerable data) => + CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new BooleanType(), false)); + + /// + /// Creates a Dataframe given data as of type + /// + /// + /// of type + /// + /// Dataframe object + public DataFrame CreateDataFrame(IEnumerable data) => CreateDataFrame(ToGenericRows(data), SchemaWithSingleColumn(new BooleanType())); /// @@ -299,9 +331,10 @@ public UdfRegistration Udf() => /// Returns a single column schema of the given datatype. /// /// Datatype of the column + /// Indicates if values of the column can be null /// Schema as StructType - private StructType SchemaWithSingleColumn(DataType dataType) => - new StructType(new[] { new StructField("_1", dataType) }); + private StructType SchemaWithSingleColumn(DataType dataType, bool isNullable = true) => + new StructType(new[] { new StructField("_1", dataType, isNullable) }); /// /// This method is transforming each element of IEnumerable of type T input into a single diff --git a/src/csharp/Microsoft.Spark/Sql/Types/SimpleTypes.cs b/src/csharp/Microsoft.Spark/Sql/Types/SimpleTypes.cs index 7b9bd7a6f..0638fdb60 100644 --- a/src/csharp/Microsoft.Spark/Sql/Types/SimpleTypes.cs +++ b/src/csharp/Microsoft.Spark/Sql/Types/SimpleTypes.cs @@ -81,6 +81,11 @@ public sealed class DateType : AtomicType /// internal override object FromInternal(object obj) { + if (obj == null) + { + return null; + } + return new Date(new DateTime((int)obj * TimeSpan.TicksPerDay + s_unixTimeEpoch.Ticks)); } } @@ -101,16 +106,14 @@ public sealed class TimestampType : AtomicType /// internal override object FromInternal(object obj) { - // Known issue that if the original type is "long" and its value can be fit into the - // "int", Pickler will serialize the value as int. - if (obj is long val) + if (obj == null) { - val = (long)obj; - } - else - { - val = (int)obj; + return null; } + + // Known issue that if the original type is "long" and its value can be fit into the + // "int", Pickler will serialize the value as int. + long val = (obj is long v) ? 
v : (int)obj; return new Timestamp( new DateTime(val * 10 + DateType.s_unixTimeEpoch.Ticks, DateTimeKind.Utc)); } From 0f576e5f51ce2e7be3a2d38074991e30126dcc18 Mon Sep 17 00:00:00 2001 From: Steve Suh Date: Tue, 23 Jun 2020 10:59:06 -0700 Subject: [PATCH 15/27] Prep 0.12.0 release (#564) --- README.md | 2 +- benchmark/scala/pom.xml | 2 +- docs/release-notes/0.12/release-0.12.md | 115 ++++++++++++++++++++++++ eng/Versions.props | 2 +- src/scala/pom.xml | 2 +- 5 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 docs/release-notes/0.12/release-0.12.md diff --git a/README.md b/README.md index 2d5638a97..5b8647ca0 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ 2.3.* - v0.11.0 + v0.12.0 2.4.0 diff --git a/benchmark/scala/pom.xml b/benchmark/scala/pom.xml index 56b8dc1ea..54608d6ef 100644 --- a/benchmark/scala/pom.xml +++ b/benchmark/scala/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.microsoft.spark microsoft-spark-benchmark - 0.11.0 + 0.12.0 2019 UTF-8 diff --git a/docs/release-notes/0.12/release-0.12.md b/docs/release-notes/0.12/release-0.12.md new file mode 100644 index 000000000..7000299ea --- /dev/null +++ b/docs/release-notes/0.12/release-0.12.md @@ -0,0 +1,115 @@ +# .NET for Apache Spark 0.12 Release Notes + +### New Features/Improvements and Bug Fixes + +* Expose `DataStreamWriter.ForeachBatch` API ([#549](https://github.com/dotnet/spark/pull/549)) +* Support for [dotnet-interactive](https://github.com/dotnet/interactive) ([#515](https://github.com/dotnet/spark/pull/515)) ([#517](https://github.com/dotnet/spark/pull/517)) ([#554](https://github.com/dotnet/spark/pull/554)) +* Support for [Hyperspace v0.1.0](https://github.com/microsoft/hyperspace) APIs ([#555](https://github.com/dotnet/spark/pull/555)) +* Support for Spark 2.4.6 ([#547](https://github.com/dotnet/spark/pull/547)) +* Bug fixes: + * Udf bug caused by `BroadcastVariablesRegistry` ([#551](https://github.com/dotnet/spark/pull/551)) + * Null checks for `TimestampType` and `DateType` ([#530](https://github.com/dotnet/spark/pull/530)) +* Update `Microsoft.Data.Analysis` to v`0.4.0` ([#528](https://github.com/dotnet/spark/pull/528)) + +### Infrastructure / Documentation / Etc. + +* Improve build pipeline ([#510](https://github.com/dotnet/spark/pull/510)) ([#511](https://github.com/dotnet/spark/pull/511)) ([#512](https://github.com/dotnet/spark/pull/512)) ([#513](https://github.com/dotnet/spark/pull/513)) ([#524](https://github.com/dotnet/spark/pull/524)) +* Update AppName for the C# Spark Examples ([#548](https://github.com/dotnet/spark/pull/548)) +* Update maven links in build documentation ([#558](https://github.com/dotnet/spark/pull/558)) ([#560](https://github.com/dotnet/spark/pull/560)) + +### Breaking Changes + +* None + +### Known Issues + +* Broadcast variables do not work with [dotnet-interactive](https://github.com/dotnet/interactive) ([#561](https://github.com/dotnet/spark/pull/561)) + +### Compatibility + +#### Backward compatibility + +The following table describes the oldest version of the worker that the current version is compatible with, along with new features that are incompatible with the worker. + + + + + + + + + + + + + + + + + + + + + + + +
| Oldest compatible Microsoft.Spark.Worker version | Incompatible features |
| --- | --- |
| v0.9.0 | DataFrame with Grouped Map UDF (#277) |
| | DataFrame with Vector UDF (#277) |
| | Support for Broadcast Variables (#414) |
| | Support for TimestampType (#428) |
+ +#### Forward compatibility + +The following table describes the oldest version of .NET for Apache Spark release that the current worker is compatible with. + + + + + + + + + + + + +
| Oldest compatible .NET for Apache Spark release version |
| --- |
| v0.9.0 |
+ +### Supported Spark Versions + +The following table outlines the supported Spark versions along with the microsoft-spark JAR to use with: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Spark Version | microsoft-spark JAR |
| --- | --- |
| 2.3.* | microsoft-spark-2.3.x-0.12.0.jar |
| 2.4.0 | microsoft-spark-2.4.x-0.12.0.jar |
| 2.4.1 | microsoft-spark-2.4.x-0.12.0.jar |
| 2.4.3 | microsoft-spark-2.4.x-0.12.0.jar |
| 2.4.4 | microsoft-spark-2.4.x-0.12.0.jar |
| 2.4.5 | microsoft-spark-2.4.x-0.12.0.jar |
| 2.4.6 | microsoft-spark-2.4.x-0.12.0.jar |
| 2.4.2 | Not supported |
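For illustration, here is a minimal C# sketch of the nullable-type handling fixed by #530 (whose test changes appear earlier in this patch series). It assumes an existing `SparkSession`; the app name is only a placeholder:

```csharp
using Microsoft.Spark.Sql;

// Placeholder app name; any existing SparkSession works here.
SparkSession spark = SparkSession.Builder().AppName("NullableExample").GetOrCreate();

// Non-nullable input: the single "_1" column is created as not nullable.
DataFrame ints = spark.CreateDataFrame(new int[] { 1, 2 });

// Nullable input: nulls are preserved and the "_1" column is nullable.
DataFrame nullableInts = spark.CreateDataFrame(new int?[] { 1, 2, null });

nullableInts.PrintSchema();
nullableInts.Show();
```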
diff --git a/eng/Versions.props b/eng/Versions.props index dc954bcc5..b1d1c2efd 100644 --- a/eng/Versions.props +++ b/eng/Versions.props @@ -1,7 +1,7 @@ - 0.11.0 + 0.12.0 prerelease $(RestoreSources); diff --git a/src/scala/pom.xml b/src/scala/pom.xml index 34ee5c338..aacc2da49 100644 --- a/src/scala/pom.xml +++ b/src/scala/pom.xml @@ -7,7 +7,7 @@ ${microsoft-spark.version} UTF-8 - 0.11.0 + 0.12.0 From 0879fd1a7a6d18b061dd97281c9b9342660c87ab Mon Sep 17 00:00:00 2001 From: Steve Suh Date: Tue, 23 Jun 2020 14:23:37 -0700 Subject: [PATCH 16/27] Bugfix for Microsoft.Spark.Extensions.DotNet.Interactive duplicate file exception (#565) --- .../AssemblyKernelExtension.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs index 2deff5869..bb30e4957 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.DotNet.Interactive/AssemblyKernelExtension.cs @@ -45,6 +45,8 @@ public Task OnLoadAsync(IKernel kernel) kernelBase.AddMiddleware(async (command, context, next) => { + await next(command, context); + if ((context.HandlingKernel is CSharpKernel kernel) && (command is SubmitCode) && TryGetSparkSession(out SparkSession sparkSession) && @@ -57,8 +59,6 @@ public Task OnLoadAsync(IKernel kernel) sparkSession.SparkContext.AddFile(filePath); } } - - await next(command, context); }); } From 3106e8e87f811979cd0a5a1957ac6bcf177be2ce Mon Sep 17 00:00:00 2001 From: John Baro Date: Sat, 27 Jun 2020 04:42:29 +1000 Subject: [PATCH 17/27] Expose JVM exceptions (#566) * Wrap spark exceptions per #472 * Extra tests for JvmException Code styling per guidelines from review by @imback82 in #541 * Add JvmBridge doc link Co-authored-by: Steve Suh * Fix per code guidelines Co-authored-by: Steve Suh * Fix cref link * Formatting Co-authored-by: Steve Suh * Add license header Co-authored-by: Steve Suh --- .../IpcTests/JvmBridgeTests.cs | 36 +++++++++++++++++++ .../Microsoft.Spark/Interop/Ipc/JvmBridge.cs | 2 +- src/csharp/Microsoft.Spark/JvmException.cs | 19 ++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/JvmBridgeTests.cs create mode 100644 src/csharp/Microsoft.Spark/JvmException.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/JvmBridgeTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/JvmBridgeTests.cs new file mode 100644 index 000000000..3ae609f5c --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/JvmBridgeTests.cs @@ -0,0 +1,36 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests +{ + [Collection("Spark E2E Tests")] + public class JvmBridgeTests + { + private readonly SparkSession _spark; + + public JvmBridgeTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestInnerJvmException() + { + try + { + _spark.Sql("THROW!!!"); + } + catch (Exception ex) + { + Assert.NotNull(ex.InnerException); + Assert.IsType(ex.InnerException); + Assert.False(string.IsNullOrWhiteSpace(ex.InnerException.Message)); + } + } + } +} diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs index abfa63b19..231263c74 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs @@ -189,7 +189,7 @@ private object CallJavaMethod( args); _logger.LogError(errorMessage); _logger.LogError(jvmFullStackTrace); - throw new Exception(errorMessage); + throw new Exception(errorMessage, new JvmException(jvmFullStackTrace)); } char typeAsChar = Convert.ToChar(inputStream.ReadByte()); diff --git a/src/csharp/Microsoft.Spark/JvmException.cs b/src/csharp/Microsoft.Spark/JvmException.cs new file mode 100644 index 000000000..75e3aec42 --- /dev/null +++ b/src/csharp/Microsoft.Spark/JvmException.cs @@ -0,0 +1,19 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +namespace Microsoft.Spark +{ + /// + /// Contains the message returned from the on an error. + /// + public class JvmException : Exception + { + public JvmException(string message) + : base(message) + { + } + } +} From 886cec05876193d2e7153dd53126deeb88ea22ba Mon Sep 17 00:00:00 2001 From: Steve Suh Date: Sat, 27 Jun 2020 12:24:23 -0700 Subject: [PATCH 18/27] AssemblyLoader should use absolute assembly path when loading assemblies (#570) --- .../AssemblyLoaderTests.cs | 26 +++++++++++++++++++ .../Microsoft.Spark/Utils/AssemblyLoader.cs | 6 ++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs b/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs index f2f0dd30e..c2c5e63ee 100644 --- a/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs +++ b/src/csharp/Microsoft.Spark.UnitTest/AssemblyLoaderTests.cs @@ -4,7 +4,11 @@ using System; using System.IO; +using System.Reflection; +using System.Runtime.Loader; +using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Utils; +using Moq; using Xunit; namespace Microsoft.Spark.UnitTest @@ -12,6 +16,13 @@ namespace Microsoft.Spark.UnitTest [Collection("Spark Unit Tests")] public class AssemblyLoaderTests { + private readonly Mock _mockJvm; + + public AssemblyLoaderTests(SparkFixture _fixture) + { + _mockJvm = _fixture.MockJvm; + } + [Fact] public void TestAssemblySearchPathResolver() { @@ -45,5 +56,20 @@ public void TestAssemblySearchPathResolver() AssemblySearchPathResolver.AssemblySearchPathsEnvVarName, null); } + + [Fact] + public void TestResolveAssemblyWithRelativePath() + { + _mockJvm.Setup(m => m.CallStaticJavaMethod( + "org.apache.spark.SparkFiles", + "getRootDirectory")) + .Returns("."); + + AssemblyLoader.LoadFromFile = AssemblyLoadContext.Default.LoadFromAssemblyPath; + Assembly expectedAssembly = Assembly.GetExecutingAssembly(); + Assembly actualAssembly = 
AssemblyLoader.ResolveAssembly(expectedAssembly.FullName); + + Assert.Equal(expectedAssembly, actualAssembly); + } } } diff --git a/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs b/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs index 3b9b34f5e..fbc6e199a 100644 --- a/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs +++ b/src/csharp/Microsoft.Spark/Utils/AssemblyLoader.cs @@ -189,12 +189,12 @@ private static bool TryLoadAssembly(string assemblyFileName, ref Assembly assemb { foreach (string searchPath in s_searchPaths.Value) { - string assemblyPath = Path.Combine(searchPath, assemblyFileName); - if (File.Exists(assemblyPath)) + var assemblyFile = new FileInfo(Path.Combine(searchPath, assemblyFileName)); + if (assemblyFile.Exists) { try { - assembly = LoadFromFile(assemblyPath); + assembly = LoadFromFile(assemblyFile.FullName); return true; } catch (Exception ex) when ( From 2f90321c3d7b6aea9485c09f1b6383fce5cf4a3b Mon Sep 17 00:00:00 2001 From: Terry Kim Date: Sat, 27 Jun 2020 12:25:27 -0700 Subject: [PATCH 19/27] Prep 0.12.1 Release (#572) --- README.md | 2 +- benchmark/scala/pom.xml | 2 +- docs/release-notes/0.12.1/release-0.12.1.md | 110 ++++++++++++++++++++ eng/Versions.props | 2 +- src/scala/pom.xml | 2 +- 5 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 docs/release-notes/0.12.1/release-0.12.1.md diff --git a/README.md b/README.md index 5b8647ca0..7aef188eb 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ 2.3.* - v0.12.0 + v0.12.1 2.4.0 diff --git a/benchmark/scala/pom.xml b/benchmark/scala/pom.xml index 54608d6ef..3aa7b195a 100644 --- a/benchmark/scala/pom.xml +++ b/benchmark/scala/pom.xml @@ -3,7 +3,7 @@ 4.0.0 com.microsoft.spark microsoft-spark-benchmark - 0.12.0 + 0.12.1 2019 UTF-8 diff --git a/docs/release-notes/0.12.1/release-0.12.1.md b/docs/release-notes/0.12.1/release-0.12.1.md new file mode 100644 index 000000000..53f4d928a --- /dev/null +++ b/docs/release-notes/0.12.1/release-0.12.1.md @@ -0,0 +1,110 @@ +# .NET for Apache Spark 0.12.1 Release Notes + +### New Features/Improvements + +* Expose `JvmException` to capture JVM error messages separately ([#566](https://github.com/dotnet/spark/pull/566)) + +### Bug Fixes + +* AssemblyLoader should use absolute assembly path when loading assemblies ([570](https://github.com/dotnet/spark/pull/570)) + +### Infrastructure / Documentation / Etc. + +* None + +### Breaking Changes + +* None + +### Known Issues + +* Broadcast variables do not work with [dotnet-interactive](https://github.com/dotnet/interactive) ([#561](https://github.com/dotnet/spark/pull/561)) + +### Compatibility + +#### Backward compatibility + +The following table describes the oldest version of the worker that the current version is compatible with, along with new features that are incompatible with the worker. + + + + + + + + + + + + + + + + + + + + + + + +
| Oldest compatible Microsoft.Spark.Worker version | Incompatible features |
| --- | --- |
| v0.9.0 | DataFrame with Grouped Map UDF (#277) |
| | DataFrame with Vector UDF (#277) |
| | Support for Broadcast Variables (#414) |
| | Support for TimestampType (#428) |
+ +#### Forward compatibility + +The following table describes the oldest version of .NET for Apache Spark release that the current worker is compatible with. + + + + + + + + + + + + +
| Oldest compatible .NET for Apache Spark release version |
| --- |
| v0.9.0 |
+ +### Supported Spark Versions + +The following table outlines the supported Spark versions along with the microsoft-spark JAR to use with: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Spark Version | microsoft-spark JAR |
| --- | --- |
| 2.3.* | microsoft-spark-2.3.x-0.12.1.jar |
| 2.4.0 | microsoft-spark-2.4.x-0.12.1.jar |
| 2.4.1 | microsoft-spark-2.4.x-0.12.1.jar |
| 2.4.3 | microsoft-spark-2.4.x-0.12.1.jar |
| 2.4.4 | microsoft-spark-2.4.x-0.12.1.jar |
| 2.4.5 | microsoft-spark-2.4.x-0.12.1.jar |
| 2.4.6 | microsoft-spark-2.4.x-0.12.1.jar |
| 2.4.2 | Not supported |
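To illustrate the `JvmException` change from #566 (its end-to-end test appears earlier in this patch series), a minimal C# sketch; the invalid SQL string is only a stand-in for any call that fails on the JVM side:

```csharp
using System;
using Microsoft.Spark;
using Microsoft.Spark.Sql;

// Placeholder app name; any existing SparkSession works here.
SparkSession spark = SparkSession.Builder().AppName("JvmExceptionExample").GetOrCreate();

try
{
    // Any JVM-side failure works; invalid SQL is used here as a stand-in.
    spark.Sql("THIS IS NOT VALID SQL").Show();
}
catch (Exception ex) when (ex.InnerException is JvmException jvmException)
{
    // The inner JvmException now carries the JVM error message and stack trace,
    // separate from the .NET-side error message.
    Console.WriteLine(jvmException.Message);
}
```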
diff --git a/eng/Versions.props b/eng/Versions.props index b1d1c2efd..1219678bb 100644 --- a/eng/Versions.props +++ b/eng/Versions.props @@ -1,7 +1,7 @@ - 0.12.0 + 0.12.1 prerelease $(RestoreSources); diff --git a/src/scala/pom.xml b/src/scala/pom.xml index aacc2da49..035221cd4 100644 --- a/src/scala/pom.xml +++ b/src/scala/pom.xml @@ -7,7 +7,7 @@ ${microsoft-spark.version} UTF-8 - 0.12.0 + 0.12.1 From 2a597d8a980b25c85780dcb6dead057fef2ece07 Mon Sep 17 00:00:00 2001 From: Andrew Fogarty Date: Tue, 30 Jun 2020 22:47:51 -0700 Subject: [PATCH 20/27] README.md for Extension directory (#569) --- src/csharp/Extensions/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 src/csharp/Extensions/README.md diff --git a/src/csharp/Extensions/README.md b/src/csharp/Extensions/README.md new file mode 100644 index 000000000..fa32b6946 --- /dev/null +++ b/src/csharp/Extensions/README.md @@ -0,0 +1,19 @@ +# .Net for Apache Spark Extensions + +## Table of Contents +* [NuGet Packages](#nuget-packages) + +## NuGet Packages + +The following .Net for Apache Spark extensions are available as NuGet packages: + +### First-Party + +* [Microsoft.Spark.Extensions.Azure.Synapse.Analytics](https://www.nuget.org/packages/Microsoft.Spark.Extensions.Azure.Synapse.Analytics/) +* [Microsoft.Spark.Extensions.Delta](https://www.nuget.org/packages/Microsoft.Spark.Extensions.Delta/) +* [Microsoft.Spark.Extensions.DotNet.Interactive](https://www.nuget.org/packages/Microsoft.Spark.Extensions.DotNet.Interactive/) +* [Microsoft.Spark.Extensions.Hyperspace](https://www.nuget.org/packages/Microsoft.Spark.Extensions.Hyperspace/) + +### Third-Party + +* Community-created extensions can be added here. \ No newline at end of file From ec8189097135ba67b995b7762faf4225646f270b Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jul 2020 18:35:13 +0100 Subject: [PATCH 21/27] Introduce a base class for Spark.ML.Features (#574) --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 143 +++++------------- .../Microsoft.Spark/ML/Feature/FeatureBase.cs | 73 +++++++++ .../Microsoft.Spark/ML/Feature/HashingTF.cs | 104 ++++--------- src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 82 +++------- .../Microsoft.Spark/ML/Feature/IDFModel.cs | 77 +++------- .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 68 ++------- .../Microsoft.Spark/ML/Feature/Word2Vec.cs | 42 ++--- .../ML/Feature/Word2VecModel.cs | 33 +--- .../Microsoft.Spark/ML/Util/Identifiable.cs | 15 ++ 9 files changed, 222 insertions(+), 415 deletions(-) create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Util/Identifiable.cs diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 924c8b362..8b530f66c 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -20,19 +20,16 @@ namespace Microsoft.Spark.ML.Feature /// will be thrown. The splits parameter is only used for single column usage, and splitsArray /// is for multiple columns. 
/// - public class Bucketizer : IJvmObjectReferenceProvider + public class Bucketizer : FeatureBase, IJvmObjectReferenceProvider { private static readonly string s_bucketizerClassName = "org.apache.spark.ml.feature.Bucketizer"; - private readonly JvmObjectReference _jvmObject; - /// /// Create a without any parameters /// - public Bucketizer() + public Bucketizer() : base(s_bucketizerClassName) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName); } /// @@ -40,14 +37,12 @@ public Bucketizer() /// a unique ID /// /// An immutable unique ID for the object and its derivatives. - public Bucketizer(string uid) + public Bucketizer(string uid) : base(s_bucketizerClassName, uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName, uid); } - internal Bucketizer(JvmObjectReference jvmObject) + internal Bucketizer(JvmObjectReference jvmObject) : base(jvmObject) { - _jvmObject = jvmObject; } JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -56,11 +51,8 @@ internal Bucketizer(JvmObjectReference jvmObject) /// Gets the splits that were set using SetSplits /// /// double[], the splits to be used to bucket the input column - public double[] GetSplits() - { - return (double[])_jvmObject.Invoke("getSplits"); - } - + public double[] GetSplits() => (double[])_jvmObject.Invoke("getSplits"); + /// /// Split points for splitting a single column into buckets. To split multiple columns use /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time @@ -72,20 +64,15 @@ public double[] GetSplits() /// increasing. Values outside the splits specified will be treated as errors. /// /// New object - public Bucketizer SetSplits(double[] value) - { - return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); - } + public Bucketizer SetSplits(double[] value) => + WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); /// /// Gets the splits that were set by SetSplitsArray /// /// double[][], the splits to be used to bucket the input columns - public double[][] GetSplitsArray() - { - return (double[][])_jvmObject.Invoke("getSplitsArray"); - } - + public double[][] GetSplitsArray() => (double[][])_jvmObject.Invoke("getSplitsArray"); + /// /// Split points fot splitting multiple columns into buckets. To split a single column use /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. @@ -97,41 +84,32 @@ public double[][] GetSplitsArray() /// includes y. The splits should be of length >= 3 and strictly increasing. /// Values outside the splits specified will be treated as errors. /// New object - public Bucketizer SetSplitsArray(double[][] value) - { - return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", (object)value)); - } + public Bucketizer SetSplitsArray(double[][] value) => + WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", (object)value)); /// /// Gets the column that the should read from and convert into /// buckets. 
This would have been set by SetInputCol /// /// string, the input column - public string GetInputCol() - { - return (string)_jvmObject.Invoke("getInputCol"); - } - + public string GetInputCol() => (string)_jvmObject.Invoke("getInputCol"); + /// /// Sets the column that the should read from and convert into /// buckets /// /// The name of the column to as the source of the buckets /// New object - public Bucketizer SetInputCol(string value) - { - return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); - } - + public Bucketizer SetInputCol(string value) => + WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); + /// /// Gets the columns that should read from and convert into /// buckets. This is set by SetInputCol /// /// IEnumerable<string>, list of input columns - public IEnumerable GetInputCols() - { - return ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); - } + public IEnumerable GetInputCols() => + ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); /// /// Sets the columns that should read from and convert into @@ -142,73 +120,50 @@ public IEnumerable GetInputCols() /// /// List of input columns to use as sources for buckets /// New object - public Bucketizer SetInputCols(IEnumerable value) - { - return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); - } - + public Bucketizer SetInputCols(IEnumerable value) => + WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); + /// /// Gets the name of the column the output data will be written to. This is set by /// SetInputCol /// /// string, the output column - public string GetOutputCol() - { - return (string)_jvmObject.Invoke("getOutputCol"); - } - + public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol"); + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. /// /// The name of the new column which contains the bucket ID /// New object - public Bucketizer SetOutputCol(string value) - { - return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); - } + public Bucketizer SetOutputCol(string value) => + WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); /// /// The list of columns that the will create in the DataFrame. /// This is set by SetOutputCols /// /// IEnumerable<string>, list of output columns - public IEnumerable GetOutputCols() - { - return ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); - } - + public IEnumerable GetOutputCols() => + ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); + /// /// The list of columns that the will create in the DataFrame. 
/// /// List of column names which will contain the bucket ID /// New object - public Bucketizer SetOutputCols(List value) - { - return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); - } - + public Bucketizer SetOutputCols(List value) => + WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); + /// /// Loads the that was previously saved using Save /// /// The path the previous was saved to /// New object - public static Bucketizer Load(string path) - { - return WrapAsBucketizer( + public static Bucketizer Load(string path) => + WrapAsBucketizer( SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_bucketizerClassName,"load", path)); - } - - /// - /// Saves the so that it can be loaded later using Load - /// - /// The path to save the to - /// New object - public Bucketizer Save(string path) - { - return WrapAsBucketizer(_jvmObject.Invoke("save", path)); - } /// /// Executes the and transforms the DataFrame to include the new @@ -218,31 +173,15 @@ public Bucketizer Save(string path) /// /// containing the original data and the new bucketed columns /// - public DataFrame Transform(DataFrame source) - { - return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); - } - - /// - /// The uid that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. - /// - /// string UID identifying the - public string Uid() - { - return (string)_jvmObject.Invoke("uid"); - } + public DataFrame Transform(DataFrame source) => + new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); /// /// How should the handle invalid data, choices are "skip", /// "error" or "keep" /// /// string showing the way Spark will handle invalid data - public string GetHandleInvalid() - { - return (string)_jvmObject.Invoke("getHandleInvalid"); - } + public string GetHandleInvalid() => (string)_jvmObject.Invoke("getHandleInvalid"); /// /// Tells the what to do with invalid data. @@ -251,11 +190,9 @@ public string GetHandleInvalid() /// /// "skip", "error" or "keep" /// New object - public Bucketizer SetHandleInvalid(string value) - { - return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); - } - + public Bucketizer SetHandleInvalid(string value) => + WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); + private static Bucketizer WrapAsBucketizer(object obj) => new Bucketizer((JvmObjectReference)obj); } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs new file mode 100644 index 000000000..d47339178 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -0,0 +1,73 @@ +using System; +using System.Linq; +using System.Reflection; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// FeatureBase is to share code amongst all of the ML.Feature objects, there are a few + /// interfaces that the Scala code implements across all of the objects. This should help to + /// write the extra objects faster. + /// + /// + /// The class that implements FeatureBase, this is needed so we can create new objects where + /// spark returns new objects rather than update existing objects. 
+ /// + public class FeatureBase : Identifiable + { + internal readonly JvmObjectReference _jvmObject; + + internal FeatureBase(string className) + : this(SparkEnvironment.JvmBridge.CallConstructor(className)) + { + } + + internal FeatureBase(string className, string uid) + : this(SparkEnvironment.JvmBridge.CallConstructor(className, uid)) + { + } + + internal FeatureBase(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + /// + /// Returns the JVM toString value rather than the .NET ToString default + /// + /// JVM toString() value + public override string ToString() => (string)_jvmObject.Invoke("toString"); + + /// + /// The UID that was used to create the object. If no UID is passed in when creating the + /// object then a random UID is created when the object is created. + /// + /// string UID identifying the object + public string Uid() => (string)_jvmObject.Invoke("uid"); + + /// + /// Saves the object so that it can be loaded later using Load. Note that these objects + /// can be shared with Scala by Loading or Saving in Scala. + /// + /// The path to save the object to + /// New object + public T Save(string path) => + WrapAsType((JvmObjectReference)_jvmObject.Invoke("save", path)); + + private T WrapAsType(JvmObjectReference reference) + { + ConstructorInfo constructor = typeof(T) + .GetConstructors(BindingFlags.NonPublic | BindingFlags.Instance) + .Single(c => + { + ParameterInfo[] parameters = c.GetParameters(); + return (parameters.Length == 1) && + (parameters[0].ParameterType == typeof(JvmObjectReference)); + }); + + return (T)constructor.Invoke(new object[] {reference}); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index 50b4fe04a..d4e815d66 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -19,34 +19,29 @@ namespace Microsoft.Spark.ML.Feature /// power of two as the numFeatures parameter; otherwise the features will not be mapped evenly /// to the columns. /// - public class HashingTF : IJvmObjectReferenceProvider + public class HashingTF : FeatureBase, IJvmObjectReferenceProvider { private static readonly string s_hashingTfClassName = "org.apache.spark.ml.feature.HashingTF"; - - private readonly JvmObjectReference _jvmObject; - + /// /// Create a without any parameters /// - public HashingTF() + public HashingTF() : base(s_hashingTfClassName) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName); } /// /// Create a with a UID that is used to give the /// a unique ID - /// unique identifier /// - public HashingTF(string uid) + /// An immutable unique ID for the object and its derivatives. 
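To show what the new base class buys, here is a hypothetical wrapper written against it. The class name, its JVM counterpart, and the setter are invented for illustration, and the sketch assumes `FeatureBase` is generic over the concrete wrapper type (as its type-parameter documentation above describes) and lives in the same assembly, so `_jvmObject` is accessible.

```csharp
using Microsoft.Spark.Interop.Ipc;

namespace Microsoft.Spark.ML.Feature
{
    // Hypothetical wrapper: only the shape matters. ToString, Uid and Save are
    // inherited from FeatureBase; only the typed members remain to be written.
    public class CountVectorizer : FeatureBase<CountVectorizer>, IJvmObjectReferenceProvider
    {
        private static readonly string s_className =
            "org.apache.spark.ml.feature.CountVectorizer";

        public CountVectorizer() : base(s_className) { }

        public CountVectorizer(string uid) : base(s_className, uid) { }

        internal CountVectorizer(JvmObjectReference jvmObject) : base(jvmObject) { }

        JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

        // Typed setter following the same wrap-the-returned-JVM-object pattern
        // used by the classes in this patch.
        public CountVectorizer SetInputCol(string value) =>
            new CountVectorizer((JvmObjectReference)_jvmObject.Invoke("setInputCol", value));
    }
}
```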
+ public HashingTF(string uid) : base(s_hashingTfClassName, uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName, uid); } - internal HashingTF(JvmObjectReference jvmObject) + internal HashingTF(JvmObjectReference jvmObject) : base(jvmObject) { - _jvmObject = jvmObject; } JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -56,31 +51,16 @@ internal HashingTF(JvmObjectReference jvmObject) /// /// The path the previous was saved to /// New object - public static HashingTF Load(string path) - { - return WrapAsHashingTF( + public static HashingTF Load(string path) => + WrapAsHashingTF( SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_hashingTfClassName, "load", path)); - } - - /// - /// Saves the so that it can be loaded later using Load - /// - /// The path to save the to - /// New object - public HashingTF Save(string path) - { - return WrapAsHashingTF(_jvmObject.Invoke("save", path)); - } - + /// /// Gets the binary toggle that controls term frequency counts /// /// Flag showing whether the binary toggle is on or off - public bool GetBinary() - { - return (bool)_jvmObject.Invoke("getBinary"); - } + public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); /// /// Binary toggle to control term frequency counts. @@ -88,50 +68,38 @@ public bool GetBinary() /// models that model binary events rather than integer counts /// /// binary toggle, default is false - public HashingTF SetBinary(bool value) - { - return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); - } - + public HashingTF SetBinary(bool value) => + WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); + /// /// Gets the column that the should read from /// /// string, the name of the input column - public string GetInputCol() - { - return (string)_jvmObject.Invoke("getInputCol"); - } - + public string GetInputCol() => (string)_jvmObject.Invoke("getInputCol"); + /// /// Sets the column that the should read from /// /// The name of the column to as the source /// New object - public HashingTF SetInputCol(string value) - { - return WrapAsHashingTF(_jvmObject.Invoke("setInputCol", value)); - } + public HashingTF SetInputCol(string value) => + WrapAsHashingTF(_jvmObject.Invoke("setInputCol", value)); /// /// The will create a new column in the , /// this is the name of the new column. /// /// string, the name of the output col - public string GetOutputCol() - { - return (string)_jvmObject.Invoke("getOutputCol"); - } - + public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol"); + /// /// The will create a new column in the , /// this is the name of the new column. /// /// The name of the new column /// New object - public HashingTF SetOutputCol(string value) - { - return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value)); - } + public HashingTF SetOutputCol(string value) => + WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value)); /// /// Gets the number of features that should be used. Since a simple modulo is used to @@ -140,11 +108,8 @@ public HashingTF SetOutputCol(string value) /// columns. /// /// The number of features to be used - public int GetNumFeatures() - { - return (int)_jvmObject.Invoke("getNumFeatures"); - } - + public int GetNumFeatures() => (int)_jvmObject.Invoke("getNumFeatures"); + /// /// Sets the number of features that should be used. 
Since a simple modulo is used to /// transform the hash function to a column index, it is advisable to use a power of two as @@ -153,19 +118,8 @@ public int GetNumFeatures() /// /// int /// New object - public HashingTF SetNumFeatures(int value) - { - return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); - } - - /// - /// An immutable unique ID for the object and its derivatives. - /// - /// string, unique ID for the object - public string Uid() - { - return (string)_jvmObject.Invoke("uid"); - } + public HashingTF SetNumFeatures(int value) => + WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); /// /// Executes the and transforms the DataFrame to include the new @@ -173,11 +127,9 @@ public string Uid() /// /// The to add the tokens to /// containing the original data and the tokens - public DataFrame Transform(DataFrame source) - { - return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); - } - + public DataFrame Transform(DataFrame source) => + new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + private static HashingTF WrapAsHashingTF(object obj) => new HashingTF((JvmObjectReference)obj); } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index 5c2259aaf..56d2fa59f 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -17,18 +17,15 @@ namespace Microsoft.Spark.ML.Feature /// of documents (controlled by the variable minDocFreq). For terms that are not in at least /// minDocFreq documents, the IDF is found as 0, resulting in TF-IDFs of 0. /// - public class IDF : IJvmObjectReferenceProvider + public class IDF : FeatureBase, IJvmObjectReferenceProvider { private static readonly string s_IDFClassName = "org.apache.spark.ml.feature.IDF"; - private readonly JvmObjectReference _jvmObject; - /// /// Create a without any parameters /// - public IDF() + public IDF() : base(s_IDFClassName) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName); } /// @@ -36,14 +33,12 @@ public IDF() /// a unique ID /// /// An immutable unique ID for the object and its derivatives. - public IDF(string uid) + public IDF(string uid) : base(s_IDFClassName, uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName, uid); } - internal IDF(JvmObjectReference jvmObject) + internal IDF(JvmObjectReference jvmObject) : base(jvmObject) { - _jvmObject = jvmObject; } JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -52,82 +47,53 @@ internal IDF(JvmObjectReference jvmObject) /// Gets the column that the should read from /// /// string, input column - public string GetInputCol() - { - return (string)(_jvmObject.Invoke("getInputCol")); - } - + public string GetInputCol() => (string)(_jvmObject.Invoke("getInputCol")); + /// /// Sets the column that the should read from /// /// The name of the column to as the source /// New object - public IDF SetInputCol(string value) - { - return WrapAsIDF(_jvmObject.Invoke("setInputCol", value)); - } + public IDF SetInputCol(string value) => WrapAsIDF(_jvmObject.Invoke("setInputCol", value)); /// /// The will create a new column in the DataFrame, this is the /// name of the new column. 
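A short, hedged usage sketch of the HashingTF surface above; the column names, the feature count, and the `tokenized` DataFrame are assumptions.

```csharp
// Sketch only: turn a column of tokens into hashed term-frequency vectors.
HashingTF hashingTF = new HashingTF("myHashingTF")
    .SetInputCol("words")
    .SetOutputCol("rawFeatures")
    .SetNumFeatures(1 << 18);   // a power of two, as the remarks above advise

DataFrame featurized = hashingTF.Transform(tokenized);
featurized.Show();
```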
/// /// string, the output column - public string GetOutputCol() - { - return (string)(_jvmObject.Invoke("getOutputCol")); - } - + public string GetOutputCol() => (string)(_jvmObject.Invoke("getOutputCol")); + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. /// /// The name of the new column /// New object - public IDF SetOutputCol(string value) - { - return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value)); - } + public IDF SetOutputCol(string value) => + WrapAsIDF(_jvmObject.Invoke("setOutputCol", value)); /// /// Minimum of documents in which a term should appear for filtering /// /// int, minimum number of documents in which a term should appear - public int GetMinDocFreq() - { - return (int)_jvmObject.Invoke("getMinDocFreq"); - } - + public int GetMinDocFreq() => (int)_jvmObject.Invoke("getMinDocFreq"); + /// /// Minimum of documents in which a term should appear for filtering /// /// int, the minimum of documents a term should appear in /// New object - public IDF SetMinDocFreq(int value) - { - return WrapAsIDF(_jvmObject.Invoke("setMinDocFreq", value)); - } - + public IDF SetMinDocFreq(int value) => + WrapAsIDF(_jvmObject.Invoke("setMinDocFreq", value)); + /// /// Fits a model to the input data. /// /// The to fit the model to /// New object - public IDFModel Fit(DataFrame source) - { - return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source)); - } + public IDFModel Fit(DataFrame source) => + new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source)); - /// - /// The uid that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. - /// - /// string UID identifying the - public string Uid() - { - return (string)_jvmObject.Invoke("uid"); - } - /// /// Loads the that was previously saved using Save /// @@ -138,17 +104,7 @@ public static IDF Load(string path) return WrapAsIDF( SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFClassName, "load", path)); } - - /// - /// Saves the so that it can be loaded later using Load - /// - /// The path to save the to - /// New object - public IDF Save(string path) - { - return WrapAsIDF(_jvmObject.Invoke("save", path)); - } - + private static IDF WrapAsIDF(object obj) => new IDF((JvmObjectReference)obj); } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs index 4fc8a4f30..31da6e153 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -12,19 +12,16 @@ namespace Microsoft.Spark.ML.Feature /// A that converts the input string to lowercase and then splits it by /// white spaces. /// - public class IDFModel : IJvmObjectReferenceProvider + public class IDFModel : FeatureBase, IJvmObjectReferenceProvider { private static readonly string s_IDFModelClassName = "org.apache.spark.ml.feature.IDFModel"; - private readonly JvmObjectReference _jvmObject; - /// /// Create a without any parameters /// - public IDFModel() + public IDFModel() : base(s_IDFModelClassName) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName); } /// @@ -32,14 +29,12 @@ public IDFModel() /// a unique ID /// /// An immutable unique ID for the object and its derivatives. 
- public IDFModel(string uid) + public IDFModel(string uid) : base(s_IDFModelClassName, uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName, uid); } - internal IDFModel(JvmObjectReference jvmObject) + internal IDFModel(JvmObjectReference jvmObject) : base(jvmObject) { - _jvmObject = jvmObject; } JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -48,32 +43,24 @@ internal IDFModel(JvmObjectReference jvmObject) /// Gets the column that the should read from /// /// string, input column - public string GetInputCol() - { - return (string)(_jvmObject.Invoke("getInputCol")); - } - + public string GetInputCol() => (string)(_jvmObject.Invoke("getInputCol")); + /// /// Sets the column that the should read from and convert into /// buckets /// /// The name of the column to as the source /// New object - public IDFModel SetInputCol(string value) - { - return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value)); - } + public IDFModel SetInputCol(string value) => + WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value)); /// /// The will create a new column in the , /// this is the name of the new column. /// /// string, the output column - public string GetOutputCol() - { - return (string)(_jvmObject.Invoke("getOutputCol")); - } - + public string GetOutputCol() => (string)(_jvmObject.Invoke("getOutputCol")); + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. @@ -81,42 +68,24 @@ public string GetOutputCol() /// The name of the new column which contains the tokens /// /// New object - public IDFModel SetOutputCol(string value) - { - return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value)); - } - + public IDFModel SetOutputCol(string value) => + WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value)); + /// /// Minimum of documents in which a term should appear for filtering /// /// Minimum number of documents a term should appear - public int GetMinDocFreq() - { - return (int)_jvmObject.Invoke("getMinDocFreq"); - } - + public int GetMinDocFreq() => (int)_jvmObject.Invoke("getMinDocFreq"); + /// /// Executes the and transforms the to /// include the new column or columns with the tokens. /// /// The to add the tokens to /// containing the original data and the tokens - public DataFrame Transform(DataFrame source) - { - return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); - } + public DataFrame Transform(DataFrame source) => + new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); - /// - /// The uid that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. 
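Building on the HashingTF sketch, a hedged example of the estimator/model split: `IDF.Fit` produces an `IDFModel`, which then rescales the hashed counts. The column names and the `featurized` DataFrame carry over from the previous sketch and remain assumptions.

```csharp
// Sketch only: rescale raw term frequencies by inverse document frequency.
IDF idf = new IDF()
    .SetInputCol("rawFeatures")
    .SetOutputCol("features")
    .SetMinDocFreq(2);                       // ignore terms seen in < 2 documents

IDFModel idfModel = idf.Fit(featurized);     // estimator -> fitted model
DataFrame rescaled = idfModel.Transform(featurized);
rescaled.Show();
```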
- /// - /// string UID identifying the - public string Uid() - { - return (string)_jvmObject.Invoke("uid"); - } - /// /// Loads the that was previously saved using Save /// @@ -128,17 +97,7 @@ public static IDFModel Load(string path) SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_IDFModelClassName, "load", path)); } - - /// - /// Saves the so that it can be loaded later using Load - /// - /// The path to save the to - /// New object - public IDFModel Save(string path) - { - return WrapAsIDFModel(_jvmObject.Invoke("save", path)); - } - + private static IDFModel WrapAsIDFModel(object obj) => new IDFModel((JvmObjectReference)obj); } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index c411309dc..cf5ad84f7 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -12,19 +12,16 @@ namespace Microsoft.Spark.ML.Feature /// A that converts the input string to lowercase and then splits it by /// white spaces. /// - public class Tokenizer : IJvmObjectReferenceProvider + public class Tokenizer : FeatureBase, IJvmObjectReferenceProvider { private static readonly string s_tokenizerClassName = "org.apache.spark.ml.feature.Tokenizer"; - private readonly JvmObjectReference _jvmObject; - /// /// Create a without any parameters /// - public Tokenizer() + public Tokenizer() : base(s_tokenizerClassName) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName); } /// @@ -32,14 +29,12 @@ public Tokenizer() /// a unique ID /// /// An immutable unique ID for the object and its derivatives. - public Tokenizer(string uid) + public Tokenizer(string uid) : base(s_tokenizerClassName, uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName, uid); } - internal Tokenizer(JvmObjectReference jvmObject) + internal Tokenizer(JvmObjectReference jvmObject) : base(jvmObject) { - _jvmObject = jvmObject; } JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -48,42 +43,32 @@ internal Tokenizer(JvmObjectReference jvmObject) /// Gets the column that the should read from /// /// string, input column - public string GetInputCol() - { - return (string)(_jvmObject.Invoke("getInputCol")); - } - + public string GetInputCol() => (string)(_jvmObject.Invoke("getInputCol")); + /// /// Sets the column that the should read from /// /// The name of the column to as the source /// New object - public Tokenizer SetInputCol(string value) - { - return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value)); - } + public Tokenizer SetInputCol(string value) => + WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value)); /// /// The will create a new column in the DataFrame, this is the /// name of the new column. /// /// string, the output column - public string GetOutputCol() - { - return (string)(_jvmObject.Invoke("getOutputCol")); - } - + public string GetOutputCol() => (string)(_jvmObject.Invoke("getOutputCol")); + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. 
/// /// The name of the new column /// New object - public Tokenizer SetOutputCol(string value) - { - return WrapAsTokenizer(_jvmObject.Invoke("setOutputCol", value)); - } - + public Tokenizer SetOutputCol(string value) => + WrapAsTokenizer(_jvmObject.Invoke("setOutputCol", value)); + /// /// Executes the and transforms the DataFrame to include the new /// column @@ -92,22 +77,9 @@ public Tokenizer SetOutputCol(string value) /// /// New object with the source transformed /// - public DataFrame Transform(DataFrame source) - { - return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); - } + public DataFrame Transform(DataFrame source) => + new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); - /// - /// The uid that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. - /// - /// string UID identifying the - public string Uid() - { - return (string)_jvmObject.Invoke("uid"); - } - /// /// Loads the that was previously saved using Save /// @@ -120,16 +92,6 @@ public static Tokenizer Load(string path) s_tokenizerClassName, "load", path)); } - /// - /// Saves the so that it can be loaded later using Load - /// - /// The path to save the to - /// New object - public Tokenizer Save(string path) - { - return WrapAsTokenizer(_jvmObject.Invoke("save", path)); - } - private static Tokenizer WrapAsTokenizer(object obj) => new Tokenizer((JvmObjectReference)obj); } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Word2Vec.cs b/src/csharp/Microsoft.Spark/ML/Feature/Word2Vec.cs index 977194c8a..d272b1921 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Word2Vec.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Word2Vec.cs @@ -8,36 +8,29 @@ namespace Microsoft.Spark.ML.Feature { - public class Word2Vec : IJvmObjectReferenceProvider + public class Word2Vec : FeatureBase, IJvmObjectReferenceProvider { private static readonly string s_word2VecClassName = "org.apache.spark.ml.feature.Word2Vec"; - - private readonly JvmObjectReference _jvmObject; - + /// - /// Create a without any parameters. Once you have created a - /// you must call , - /// , and . + /// Create a without any parameters /// - public Word2Vec() + public Word2Vec() : base(s_word2VecClassName) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_word2VecClassName); } /// /// Create a with a UID that is used to give the - /// a unique ID. + /// a unique ID /// /// An immutable unique ID for the object and its derivatives. - public Word2Vec(string uid) + public Word2Vec(string uid) : base(s_word2VecClassName, uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_word2VecClassName, uid); } - internal Word2Vec(JvmObjectReference jvmObject) + internal Word2Vec(JvmObjectReference jvmObject) : base(jvmObject) { - _jvmObject = jvmObject; } JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -190,30 +183,13 @@ public Word2VecModel Fit(DataFrame dataFrame) => new Word2VecModel((JvmObjectReference)_jvmObject.Invoke("fit", dataFrame)); /// - /// The uid that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. - /// - /// string UID identifying the . - public string Uid() => (string)_jvmObject.Invoke("uid"); - - /// - /// Loads the that was previously saved using - /// . + /// Loads the that was previously saved using Save(string). /// /// The path the previous was saved to /// New object, loaded from path. 
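For completeness, the `tokenized` DataFrame assumed in the earlier sketches could be produced with the Tokenizer above; the column names and `df` are illustrative.

```csharp
// Sketch only: lowercase a text column and split it on whitespace.
Tokenizer tokenizer = new Tokenizer()
    .SetInputCol("sentence")
    .SetOutputCol("words");

DataFrame tokenized = tokenizer.Transform(df);
tokenized.Show();
```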
public static Word2Vec Load(string path) => WrapAsWord2Vec( SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_word2VecClassName, "load", path)); - - /// - /// Saves the so that it can be loaded later using - /// . - /// - /// The path to save the to. - /// New object. - public Word2Vec Save(string path) => WrapAsWord2Vec(_jvmObject.Invoke("save", path)); - + private static Word2Vec WrapAsWord2Vec(object obj) => new Word2Vec((JvmObjectReference)obj); } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Word2VecModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/Word2VecModel.cs index 2d3ca704a..b49223619 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Word2VecModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Word2VecModel.cs @@ -8,19 +8,16 @@ namespace Microsoft.Spark.ML.Feature { - public class Word2VecModel : IJvmObjectReferenceProvider + public class Word2VecModel : FeatureBase, IJvmObjectReferenceProvider { private static readonly string s_word2VecModelClassName = "org.apache.spark.ml.feature.Word2VecModel"; - - private readonly JvmObjectReference _jvmObject; /// /// Create a without any parameters /// - public Word2VecModel() + public Word2VecModel() : base(s_word2VecModelClassName) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_word2VecModelClassName); } /// @@ -28,14 +25,12 @@ public Word2VecModel() /// a unique ID /// /// An immutable unique ID for the object and its derivatives. - public Word2VecModel(string uid) + public Word2VecModel(string uid) : base(s_word2VecModelClassName, uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_word2VecModelClassName, uid); } - internal Word2VecModel(JvmObjectReference jvmObject) + internal Word2VecModel(JvmObjectReference jvmObject) : base(jvmObject) { - _jvmObject = jvmObject; } JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -60,8 +55,7 @@ public DataFrame FindSynonyms(string word, int num) => new DataFrame((JvmObjectReference)_jvmObject.Invoke("findSynonyms", word, num)); /// - /// Loads the that was previously saved using - /// . + /// Loads the that was previously saved using Save(string). /// /// /// The path the previous was saved to @@ -71,23 +65,6 @@ public static Word2VecModel Load(string path) => WrapAsWord2VecModel( SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_word2VecModelClassName, "load", path)); - /// - /// Saves the so that it can be loaded later using - /// . - /// - /// The path to save the to. - /// New object. - public Word2VecModel Save(string path) => - WrapAsWord2VecModel(_jvmObject.Invoke("save", path)); - - /// - /// The UID that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. - /// - /// string UID identifying the . - public string Uid() => (string)_jvmObject.Invoke("uid"); - private static Word2VecModel WrapAsWord2VecModel(object obj) => new Word2VecModel((JvmObjectReference)obj); } diff --git a/src/csharp/Microsoft.Spark/ML/Util/Identifiable.cs b/src/csharp/Microsoft.Spark/ML/Util/Identifiable.cs new file mode 100644 index 000000000..565b8d63b --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Util/Identifiable.cs @@ -0,0 +1,15 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.Spark.ML.Feature +{ + public interface Identifiable + { + /// + /// The UID of the object. 
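A hedged sketch of the Word2Vec pair: only `Fit` and `FindSynonyms` are visible in the hunks above, so the setters used here (`SetInputCol`, `SetOutputCol`, `SetVectorSize`, `SetMinCount`) are assumed to follow the same pattern as the other wrappers, and the values are illustrative.

```csharp
// Sketch only: fit word embeddings over a token column, then query synonyms.
Word2Vec word2Vec = new Word2Vec()
    .SetInputCol("words")
    .SetOutputCol("embedding")
    .SetVectorSize(100)
    .SetMinCount(1);

Word2VecModel model = word2Vec.Fit(tokenized);
DataFrame synonyms = model.FindSynonyms("spark", 5);   // top 5 nearest words
synonyms.Show();
```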
+ /// + /// string UID identifying the object + string Uid(); + } +} From f803aa8c890ec6f530d63a366056e8de0a9e18ac Mon Sep 17 00:00:00 2001 From: Andrew Fogarty Date: Fri, 10 Jul 2020 14:21:42 -0700 Subject: [PATCH 22/27] Run Delta Lake tests against 0.6.1 (#588) --- .../Microsoft.Spark.Extensions.Delta.E2ETest/DeltaFixture.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaFixture.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaFixture.cs index 9c0472485..9ca3851f0 100644 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaFixture.cs +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.Delta.E2ETest/DeltaFixture.cs @@ -16,7 +16,7 @@ public DeltaFixture() { Environment.SetEnvironmentVariable( SparkFixture.EnvironmentVariableNames.ExtraSparkSubmitArgs, - "--packages io.delta:delta-core_2.11:0.6.0 " + + "--packages io.delta:delta-core_2.11:0.6.1 " + "--conf spark.databricks.delta.snapshotPartitions=2 " + "--conf spark.sql.sources.parallelPartitionDiscovery.parallelism=5"); SparkFixture = new SparkFixture(); From 01433ca28e28c1a9f113e0bc97104f4c52bdffa1 Mon Sep 17 00:00:00 2001 From: Usman Mohammed <38691403+usmanmohammed@users.noreply.github.com> Date: Fri, 24 Jul 2020 20:46:22 +0100 Subject: [PATCH 23/27] Add more DataFrame operations examples (#599) --- .../Sql/Batch/Basic.cs | 19 +++++++++++++++++++ .../Sql/Basic.fs | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs index fe57f7d1b..e09c79e20 100644 --- a/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs +++ b/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs @@ -108,6 +108,25 @@ public void Run(string[] args) DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer"); joinedDf3.Show(); + + // Union of two data frames + DataFrame unionDf = df.Union(df); + unionDf.Show(); + + // Add new column to data frame + df.WithColumn("location", Lit("Seattle")).Show(); + + // Rename existing column + df.WithColumnRenamed("name", "fullname").Show(); + + // Filter rows with null age + df.Filter(Col("age").IsNull()).Show(); + + // Fill null values in age column with -1 + df.Na().Fill(-1, new[] { "age" }).Show(); + + // Drop age column + df.Drop(new[] { "age" }).Show(); spark.Stop(); } diff --git a/examples/Microsoft.Spark.FSharp.Examples/Sql/Basic.fs b/examples/Microsoft.Spark.FSharp.Examples/Sql/Basic.fs index 4e503fac9..6af1f81f7 100644 --- a/examples/Microsoft.Spark.FSharp.Examples/Sql/Basic.fs +++ b/examples/Microsoft.Spark.FSharp.Examples/Sql/Basic.fs @@ -78,6 +78,25 @@ type Basic() = let joinedDf3 = df.Join(df, df.["name"].EqualTo(df.["name"]), "outer") joinedDf3.Show() + + // Union of two data frames + let unionDf = df.Union(df) + unionDf.Show() + + // Add new column to data frame + df.WithColumn("location", Functions.Lit("Seattle")).Show() + + // Rename existing column + df.WithColumnRenamed("name", "fullname").Show() + + // Filter rows with null age + df.Filter(df.["age"].IsNull()).Show() + + // Fill null values in age column with -1 + df.Na().Fill(-1L, ["age"]).Show() + + // Drop age column + df.Drop(df.["age"]).Show() spark.Stop() 0 From a75d6531a4bf4757aaef7d5e669f114fb4d75bc1 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Fri, 24 Jul 2020 20:47:18 +0100 Subject: [PATCH 24/27] Add Param and methods to Spark.ML (#586) --- 
.ionide/symbolCache.db | Bin 0 -> 28672 bytes .../IpcTests/ML/Feature/BucketizerTests.cs | 14 +++ .../IpcTests/ML/Param/ParamTests.cs | 35 ++++++++ .../Microsoft.Spark/ML/Feature/FeatureBase.cs | 44 +++++++++- src/csharp/Microsoft.Spark/ML/Param/Param.cs | 83 ++++++++++++++++++ 5 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 .ionide/symbolCache.db create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Param/ParamTests.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Param/Param.cs diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db new file mode 100644 index 0000000000000000000000000000000000000000..43e567d6d682d85dd32b3baebb0fdf61f67c1643 GIT binary patch literal 28672 zcmeHPYiuJ|6}A(f3n*{Ao>?UI&RXX3biC7$u_ zru*Uw_|aboAq4!aPz(G7Dp3^)2^9$msX|362qA<70{*l}EvWneB<{TIImtK)+7qdu zu{?L~_%Yvi&pr3fz2}^JGgp@L0vB1UR7NW^3^wa~*(5A|iH8H;*B z&*Jr7uND(C74gzvnY~{#(YNt3Bw$FukbofpLjr~b3<($#FeG3|z>t6;0j&hSxNf$G zdV0*S4h!s^BA3}J-Ki9L<b^F{6=TjCuvK9>U*;l97n^=RUn$l~UNgYJCMRsfNA1?9Bl`LCS%K#Ngzf z;{QO@KD+;){!jcL`9JXd#`A*vLwDKru4~cx6X&wy2aZ|$+xDBbpW9ZfKeZmWylYu) z{ax!q%P(3^n}21#VfwY{Wv%+^=L?ZG(qpDCr$c=8^*P6`^IVl5<5tIVd0~tSzgigM z?z5uk`LPT6Y_-By)&wRae!(ne*4gR?lUBdKT&?7)Y>8Rp+k4w%>fhA!qkaWU!g8j& z9avyP?TH38h17hd$}v3E&T>vpABO?_g&-RIX#2Phe6h%7MQ!I9p4+7FTpy5icQ=}> z549iWIkuWzms8^H1xPCcSS58?UH(Q%WgSo}pSi&1%ghFqw{V?jb6g|G_h<$0h{v%C z-gd4n!}2^=x>Jxh}w>8SGqHC9v|D(^V*GlIoc1w|iaru~I2##}Q_|T1nBeK3-|27DujYN}U*Qnog;yWG zZr^}f59*n?f&gs=trhybzAhapG&7CzWFvl0k6CDOn7FsU92`wI{g3@Pu)FM&(n0byiegJk8tp$;dZ(^ zv=Y$fNsAXqBh!KP@Nsv!7c*PDz?GP*+?q1NVCFOrK}H;XiU+ZH0EsJTZI6;T-JAq- zjuS+DU_#0}R28`M+eFlgwc9Ox&3NKZQZlZ_NMqF!>tue`EenJqeO= z-(k%EWxt9n4P*W<`j}){81sMGVPVYwoyPqCY2B>G{GWCz|I_n-YoKm$sAt63{IKZ@ zCjVW()Axq2+xxcnw&!C{$^DLd+4ZLDu=DedKRby1-}XDUcWqa#zqMYm{LFH`_1mrC zmLIi@o8L1(L;w7AFA|##XrEvtfaXaX7#xVzi>ihYNYj*Mww$X`*YV|QzC^=M?s7b{ zR2E&Ad_Jr7vJDN1UN$i$u~P>{b1*gdEEdg`lZyRFw(CoE4YZL{_BddI=VPNxf)tFtiS1c{kwLRP_Bs2KJFK{M=Zfgp z5MzaS^-RLY01>>A4J9(PJCPk;3|-Gg3h=}8Y*2oI=KR!=4YAJvd^~4-*c;Zwp=d)e zwB3Zp8E>CHASQ8d{J&ySn^K6#J;CrWR!`-^; zqM`|6+mM`(61?aElrp4!0${VlSjO{EDzvau3op=cAg;PpUaK$*T(-!H0bn9Ea6!6~ zfK+Z2jZ}ANN{^JVURgcM@|@U<%-5<_8pe2m6F=O3P0ZtfS{ltsMe8cM8#S4aNHRO7 zP>{8>qXSDzJ6)YVwwmL`gM=7R(3d8$>Y$^w!>jYuqOlTbI-J)^}tn~k4R`# z%gp&{VwO;uNmcVHV%BjK=nOZ4Rh#YB_F$tnC4XE!;#3Ygq;Hbn}0)EjsT`7(7Hm(^n4Shvcww9bHjiGUTkb|JUFBEjjap;AiR+{ zRfi)Sw-Q%wk3G;24hEwfM_e&LA1|CP7zp+@8d-fr1qb>{%Ti8k6mY>C>QgR<x&4FUUSXg<^HJJmg z+yu~gPaO(^pg4n>Feuf`;H>7b0t(b?!3eIh&iXMNlE!-ryH6Vpc*fwWGc*$D;%gTqk&q-Cl6 z&AhPQ$ki?Yc)V6Ocw}&wR=2ebVDZ#{FgVz|M=$;yfW}jQD)4qec<>o;`)ASWbD(2j zKwGjUe(ny9$7ZuaBe${y!12mP!8_>_%6uN&A8%dEKw#jIev4{6q2zTjkIn%KC!$yb zu1H5@5XzU1b*Mvx8ey0CdUjSW00%Hm3Vam6}SWN^JQ4~XCiyMMBOimJ2yEVw`3RFM``6MP+8rh)j(l45I9&m?x{pdOk1GyTW(8~A=7nOb-8>4TyfiALgBG{yP< zfN9y}f7L(ad(C&&`$zA~ooi^`*Tu02K?}xl$2k0 zbAGCma`TY_FI()QQ7l(SGLY<5OiOdr+o8!4+e1*~+h_)dnql67B-dwfHZZUiz zzabvn1HdDMHf^Ll5)zb@WM2U=c;2LBe==J4CfT-n1+TU#`8q*c)H!K{K&_voHgaIsS6#Dmy)4GV_@uWLz6 zCQj|6Ygy;mRq^1efCpaJLU|n1bS;|z2Cr*L98HWhx|R)qg4eYq#uF1dLT%c$kd6vq zg6CvS^dv?aO!Sley-p$ur@`DqgVL}*9Lx0e7KVs&&S^rGFGR!5--h~O1_78cOIj<(HBye+O7eATP3F@vFj0ZfVApl|kbfCFCq0p*~C8rNO| z5O|G}`oPsFL8V8~HcD~+AGF&{{x#oUeXn>w_CEI7Jzw?M-QRS_UEg(`cE0b-IsWD- z*gvut>CcRpApt`Ih6D@=7!vqDl0cTI`}N6#m<}|fmUX}y7k4Wa4`L?jmVGkJ6*QPm zYp1g@6xEwkmnZ@8xJSUAea_QpR|;ZeGEFy$x=)|HN*vD)pBhe0C^NlncK8B;yY%7BE>k_JC|$KEu;nbLo^u3l z$A=f=Oa&b*Noz^uG)L4O_hgO6Lz)T9gtSOo1n!-O7mYD7>XTNG)BbMWD7*yKTnn{p2kcErn^;5QO|^~@dfjb=g`(vO!rQ!^N%?io_SZ? 
zsF<0ra;!kxqpY{^imti@RPfs8^z`%@jfgc_8rKLgN6`#2wD@X-UA_4A%qCGc(UU6~ zuc#A&3DvB(O|fV<3^&xn+pKdkFx9uS8&=()>Y`TKtVBwi1a7T|SN5q>Q|Fbs-R8Wi r5i_V}JR8*1*wjg2b^;fqlb8T7-kvp6;i)FU08O@N<|Vu8nsWLNN`{i9 literal 0 HcmV?d00001 diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index a075334de..e9193fd0b 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.IO; using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.ML.Feature.Param; using Microsoft.Spark.Sql; using Microsoft.Spark.UnitTest.TestUtils; using Xunit; @@ -58,6 +59,19 @@ public void TestBucketizer() Bucketizer loadedBucketizer = Bucketizer.Load(savePath); Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid()); } + + Assert.NotEmpty(bucketizer.ExplainParams()); + + Param handleInvalidParam = bucketizer.GetParam("handleInvalid"); + Assert.NotEmpty(handleInvalidParam.Doc); + Assert.NotEmpty(handleInvalidParam.Name); + Assert.Equal(handleInvalidParam.Parent, bucketizer.Uid()); + + Assert.NotEmpty(bucketizer.ExplainParam(handleInvalidParam)); + bucketizer.Set(handleInvalidParam, "keep"); + Assert.Equal("keep", bucketizer.GetHandleInvalid()); + + Assert.Equal("error", bucketizer.Clear(handleInvalidParam).GetHandleInvalid()); } [Fact] diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Param/ParamTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Param/ParamTests.cs new file mode 100644 index 000000000..ecb9166e1 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Param/ParamTests.cs @@ -0,0 +1,35 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Spark.ML.Feature.Param; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.ParamTests +{ + [Collection("Spark E2E Tests")] + public class ParamTests + { + private readonly SparkSession _spark; + + public ParamTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void Test() + { + const string expectedParent = "parent"; + const string expectedName = "name"; + const string expectedDoc = "doc"; + + var param = new Param(expectedParent, expectedName, expectedDoc); + + Assert.Equal(expectedParent, param.Parent); + Assert.Equal(expectedDoc, param.Doc); + Assert.Equal(expectedName, param.Name); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index d47339178..fcc90b43d 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -56,7 +56,49 @@ internal FeatureBase(JvmObjectReference jvmObject) public T Save(string path) => WrapAsType((JvmObjectReference)_jvmObject.Invoke("save", path)); - private T WrapAsType(JvmObjectReference reference) + /// + /// Clears any value that was previously set for this . The value is + /// reset to the default value. 
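The test above already exercises the new surface; as a plain usage sketch, this is how the generic parameter methods being added to `FeatureBase` compose, mirroring the `handleInvalid` parameter from the test (values illustrative).

```csharp
// Sketch only: drive a Bucketizer through the generic Param API instead of
// its typed setters.
Bucketizer bucketizer = new Bucketizer();

Param handleInvalid = bucketizer.GetParam("handleInvalid");
Console.WriteLine(bucketizer.ExplainParam(handleInvalid)); // doc + current value
Console.WriteLine(bucketizer.ExplainParams());             // all applicable params

bucketizer.Set(handleInvalid, "keep");
Console.WriteLine(bucketizer.GetHandleInvalid());          // "keep"

bucketizer.Clear(handleInvalid);                           // back to the default
Console.WriteLine(bucketizer.GetHandleInvalid());          // "error"
```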
+ /// + /// The to set back to its original value + /// Object reference that was used to clear the + public T Clear(Param.Param param) => + WrapAsType((JvmObjectReference)_jvmObject.Invoke("clear", param)); + + /// + /// Returns a description of how a specific works and is currently set. + /// + /// The to explain + /// Description of the + public string ExplainParam(Param.Param param) => + (string)_jvmObject.Invoke("explainParam", param); + + /// + /// Returns a description of how all of the 's that apply to this object + /// work and how they are currently set. + /// + /// Description of all the applicable 's + public string ExplainParams() => (string)_jvmObject.Invoke("explainParams"); + + /// + /// Retrieves a so that it can be used to set the value of the + /// on the object. + /// + /// The name of the to get. + /// that can be used to set the actual value + public Param.Param GetParam(string paramName) => + new Param.Param((JvmObjectReference)_jvmObject.Invoke("getParam", paramName)); + + /// + /// Sets the value of a specific . + /// + /// to set the value of + /// The value to use + /// The object that contains the newly set + public T Set(Param.Param param, object value) => + WrapAsType((JvmObjectReference)_jvmObject.Invoke("set", param, value)); + + private static T WrapAsType(JvmObjectReference reference) { ConstructorInfo constructor = typeof(T) .GetConstructors(BindingFlags.NonPublic | BindingFlags.Instance) diff --git a/src/csharp/Microsoft.Spark/ML/Param/Param.cs b/src/csharp/Microsoft.Spark/ML/Param/Param.cs new file mode 100644 index 000000000..f524ea012 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Param/Param.cs @@ -0,0 +1,83 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark.ML.Feature.Param +{ + /// + /// A with self-contained documentation and optionally default value. + /// + /// A references an individual parameter that includes documentation, the + /// name of the parameter and optionally a default value. Params can either be set using the + /// generic methods or by using explicit methods. For example + /// has SetHandleInvalid or you can call + /// GetParam("handleInvalid")and then . Set using the + /// and the value you want to use. + /// + public class Param : IJvmObjectReferenceProvider + { + private static readonly string s_ParamClassName = + "org.apache.spark.ml.param.Param"; + + private readonly JvmObjectReference _jvmObject; + + /// + /// Creates a new instance of a which will be attached to the parent + /// specified. The most likely use case for a is being read from a + /// parent object such as rather than independently + /// The parent object to assign the to + /// The name of this + /// The documentation for this + /// + public Param(Identifiable parent, string name, string doc) + : this(SparkEnvironment.JvmBridge.CallConstructor( + s_ParamClassName, parent.Uid(), name, doc)) + { + } + + /// + /// Creates a new instance of a which will be attached to the parent + /// with the UID specified. 
The most likely use case for a is being + /// read from a parent object such as rather than independently + /// + /// The UID of the parent object to assign the to + /// + /// The name of this + /// The documentation for this + /// + public Param(string parent, string name, string doc) + : this(SparkEnvironment.JvmBridge.CallConstructor(s_ParamClassName, parent, name, doc)) + { + } + + internal Param(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// The description of what the does and how it works including any + /// defaults and the current value + /// + /// A description of how the works + public string Doc => (string)_jvmObject.Invoke("doc"); + + /// + /// The name of the + /// + /// The name of the + public string Name => (string)_jvmObject.Invoke("name"); + + /// + /// The object that contains the + /// + /// The UID of the parent oject that this belongs to + public string Parent => (string)_jvmObject.Invoke("parent"); + } +} From 7a6dd0c5700988e3f634b27fe38380d72e2f93fd Mon Sep 17 00:00:00 2001 From: Usman Mohammed <38691403+usmanmohammed@users.noreply.github.com> Date: Sat, 25 Jul 2020 04:25:07 +0100 Subject: [PATCH 25/27] Update ubuntu instructions for running the example app (#603) --- docs/building/ubuntu-instructions.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/building/ubuntu-instructions.md b/docs/building/ubuntu-instructions.md index 0e3dbdf40..b259768e5 100644 --- a/docs/building/ubuntu-instructions.md +++ b/docs/building/ubuntu-instructions.md @@ -185,7 +185,7 @@ Once you build the samples, you can use `spark-submit` to submit your .NET Core --class org.apache.spark.deploy.dotnet.DotnetRunner \ --master local \ ~/dotnet.spark/src/scala/microsoft-spark-/target/microsoft-spark-.jar \ - Microsoft.Spark.CSharp.Examples Sql.Batch.Basic $SPARK_HOME/examples/src/main/resources/people.json + ./Microsoft.Spark.CSharp.Examples Sql.Batch.Basic $SPARK_HOME/examples/src/main/resources/people.json ``` - **[Microsoft.Spark.Examples.Sql.Streaming.StructuredNetworkWordCount](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredNetworkWordCount.cs)** ```bash @@ -193,7 +193,7 @@ Once you build the samples, you can use `spark-submit` to submit your .NET Core --class org.apache.spark.deploy.dotnet.DotnetRunner \ --master local \ ~/dotnet.spark/src/scala/microsoft-spark-/target/microsoft-spark-.jar \ - Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredNetworkWordCount localhost 9999 + ./Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredNetworkWordCount localhost 9999 ``` - **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (maven accessible)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)** ```bash @@ -202,7 +202,7 @@ Once you build the samples, you can use `spark-submit` to submit your .NET Core --class org.apache.spark.deploy.dotnet.DotnetRunner \ --master local \ ~/dotnet.spark/src/scala/microsoft-spark-/target/microsoft-spark-.jar \ - Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test + ./Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test ``` - **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (jars provided)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)** ```bash @@ -211,7 +211,7 @@ Once you build the 
samples, you can use `spark-submit` to submit your .NET Core --class org.apache.spark.deploy.dotnet.DotnetRunner \ --master local \ ~/dotnet.spark/src/scala/microsoft-spark-/target/microsoft-spark-.jar \ - Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test + ./Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test ``` Feel this experience is complicated? Help us by taking up [Simplify User Experience for Running an App](https://github.com/dotnet/spark/issues/6) From 606235beda67657bf14529326e29f983f7683de7 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Sat, 1 Aug 2020 08:39:01 +0100 Subject: [PATCH 26/27] Update .gitignore to include .ionide folder (#609) --- .gitignore | 3 +++ .ionide/symbolCache.db | Bin 28672 -> 0 bytes 2 files changed, 3 insertions(+) delete mode 100644 .ionide/symbolCache.db diff --git a/.gitignore b/.gitignore index 251cfa7e2..faada9c8a 100644 --- a/.gitignore +++ b/.gitignore @@ -367,3 +367,6 @@ hs_err_pid* # The target folder contains the output of building **/target/** + +# F# vs code +.ionide/ diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db deleted file mode 100644 index 43e567d6d682d85dd32b3baebb0fdf61f67c1643..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28672 zcmeHPYiuJ|6}A(f3n*{Ao>?UI&RXX3biC7$u_ zru*Uw_|aboAq4!aPz(G7Dp3^)2^9$msX|362qA<70{*l}EvWneB<{TIImtK)+7qdu zu{?L~_%Yvi&pr3fz2}^JGgp@L0vB1UR7NW^3^wa~*(5A|iH8H;*B z&*Jr7uND(C74gzvnY~{#(YNt3Bw$FukbofpLjr~b3<($#FeG3|z>t6;0j&hSxNf$G zdV0*S4h!s^BA3}J-Ki9L<b^F{6=TjCuvK9>U*;l97n^=RUn$l~UNgYJCMRsfNA1?9Bl`LCS%K#Ngzf z;{QO@KD+;){!jcL`9JXd#`A*vLwDKru4~cx6X&wy2aZ|$+xDBbpW9ZfKeZmWylYu) z{ax!q%P(3^n}21#VfwY{Wv%+^=L?ZG(qpDCr$c=8^*P6`^IVl5<5tIVd0~tSzgigM z?z5uk`LPT6Y_-By)&wRae!(ne*4gR?lUBdKT&?7)Y>8Rp+k4w%>fhA!qkaWU!g8j& z9avyP?TH38h17hd$}v3E&T>vpABO?_g&-RIX#2Phe6h%7MQ!I9p4+7FTpy5icQ=}> z549iWIkuWzms8^H1xPCcSS58?UH(Q%WgSo}pSi&1%ghFqw{V?jb6g|G_h<$0h{v%C z-gd4n!}2^=x>Jxh}w>8SGqHC9v|D(^V*GlIoc1w|iaru~I2##}Q_|T1nBeK3-|27DujYN}U*Qnog;yWG zZr^}f59*n?f&gs=trhybzAhapG&7CzWFvl0k6CDOn7FsU92`wI{g3@Pu)FM&(n0byiegJk8tp$;dZ(^ zv=Y$fNsAXqBh!KP@Nsv!7c*PDz?GP*+?q1NVCFOrK}H;XiU+ZH0EsJTZI6;T-JAq- zjuS+DU_#0}R28`M+eFlgwc9Ox&3NKZQZlZ_NMqF!>tue`EenJqeO= z-(k%EWxt9n4P*W<`j}){81sMGVPVYwoyPqCY2B>G{GWCz|I_n-YoKm$sAt63{IKZ@ zCjVW()Axq2+xxcnw&!C{$^DLd+4ZLDu=DedKRby1-}XDUcWqa#zqMYm{LFH`_1mrC zmLIi@o8L1(L;w7AFA|##XrEvtfaXaX7#xVzi>ihYNYj*Mww$X`*YV|QzC^=M?s7b{ zR2E&Ad_Jr7vJDN1UN$i$u~P>{b1*gdEEdg`lZyRFw(CoE4YZL{_BddI=VPNxf)tFtiS1c{kwLRP_Bs2KJFK{M=Zfgp z5MzaS^-RLY01>>A4J9(PJCPk;3|-Gg3h=}8Y*2oI=KR!=4YAJvd^~4-*c;Zwp=d)e zwB3Zp8E>CHASQ8d{J&ySn^K6#J;CrWR!`-^; zqM`|6+mM`(61?aElrp4!0${VlSjO{EDzvau3op=cAg;PpUaK$*T(-!H0bn9Ea6!6~ zfK+Z2jZ}ANN{^JVURgcM@|@U<%-5<_8pe2m6F=O3P0ZtfS{ltsMe8cM8#S4aNHRO7 zP>{8>qXSDzJ6)YVwwmL`gM=7R(3d8$>Y$^w!>jYuqOlTbI-J)^}tn~k4R`# z%gp&{VwO;uNmcVHV%BjK=nOZ4Rh#YB_F$tnC4XE!;#3Ygq;Hbn}0)EjsT`7(7Hm(^n4Shvcww9bHjiGUTkb|JUFBEjjap;AiR+{ zRfi)Sw-Q%wk3G;24hEwfM_e&LA1|CP7zp+@8d-fr1qb>{%Ti8k6mY>C>QgR<x&4FUUSXg<^HJJmg z+yu~gPaO(^pg4n>Feuf`;H>7b0t(b?!3eIh&iXMNlE!-ryH6Vpc*fwWGc*$D;%gTqk&q-Cl6 z&AhPQ$ki?Yc)V6Ocw}&wR=2ebVDZ#{FgVz|M=$;yfW}jQD)4qec<>o;`)ASWbD(2j zKwGjUe(ny9$7ZuaBe${y!12mP!8_>_%6uN&A8%dEKw#jIev4{6q2zTjkIn%KC!$yb zu1H5@5XzU1b*Mvx8ey0CdUjSW00%Hm3Vam6}SWN^JQ4~XCiyMMBOimJ2yEVw`3RFM``6MP+8rh)j(l45I9&m?x{pdOk1GyTW(8~A=7nOb-8>4TyfiALgBG{yP< zfN9y}f7L(ad(C&&`$zA~ooi^`*Tu02K?}xl$2k0 zbAGCma`TY_FI()QQ7l(SGLY<5OiOdr+o8!4+e1*~+h_)dnql67B-dwfHZZUiz 
zzabvn1HdDMHf^Ll5)zb@WM2U=c;2LBe==J4CfT-n1+TU#`8q*c)H!K{K&_voHgaIsS6#Dmy)4GV_@uWLz6 zCQj|6Ygy;mRq^1efCpaJLU|n1bS;|z2Cr*L98HWhx|R)qg4eYq#uF1dLT%c$kd6vq zg6CvS^dv?aO!Sley-p$ur@`DqgVL}*9Lx0e7KVs&&S^rGFGR!5--h~O1_78cOIj<(HBye+O7eATP3F@vFj0ZfVApl|kbfCFCq0p*~C8rNO| z5O|G}`oPsFL8V8~HcD~+AGF&{{x#oUeXn>w_CEI7Jzw?M-QRS_UEg(`cE0b-IsWD- z*gvut>CcRpApt`Ih6D@=7!vqDl0cTI`}N6#m<}|fmUX}y7k4Wa4`L?jmVGkJ6*QPm zYp1g@6xEwkmnZ@8xJSUAea_QpR|;ZeGEFy$x=)|HN*vD)pBhe0C^NlncK8B;yY%7BE>k_JC|$KEu;nbLo^u3l z$A=f=Oa&b*Noz^uG)L4O_hgO6Lz)T9gtSOo1n!-O7mYD7>XTNG)BbMWD7*yKTnn{p2kcErn^;5QO|^~@dfjb=g`(vO!rQ!^N%?io_SZ? zsF<0ra;!kxqpY{^imti@RPfs8^z`%@jfgc_8rKLgN6`#2wD@X-UA_4A%qCGc(UU6~ zuc#A&3DvB(O|fV<3^&xn+pKdkFx9uS8&=()>Y`TKtVBwi1a7T|SN5q>Q|Fbs-R8Wi r5i_V}JR8*1*wjg2b^;fqlb8T7-kvp6;i)FU08O@N<|Vu8nsWLNN`{i9 From 61cfeb574627bd5d8dc4e9615a9af74937f0fdda Mon Sep 17 00:00:00 2001 From: bolcman Date: Tue, 4 Aug 2020 23:42:24 -0400 Subject: [PATCH 27/27] udf-broadcast-variable-concurrency-fix (#612) --- .../Processor/BroadcastVariableProcessor.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index 41c817d02..bf8f48ed8 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -54,7 +54,8 @@ internal BroadcastVariables Process(Stream stream) else { string path = SerDe.ReadString(stream); - using FileStream fStream = File.Open(path, FileMode.Open, FileAccess.Read); + using FileStream fStream = + File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read); object value = formatter.Deserialize(fStream); BroadcastRegistry.Add(bid, value); }
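A self-contained illustration (outside the Spark codebase) of why the `FileShare.Read` argument in the fix above matters: the three-argument `File.Open` overload defaults to `FileShare.None`, so a second thread deserializing the same broadcast file would fail while the first handle was still open. Paths and payload here are made up for the demo.

```csharp
using System;
using System.IO;

class FileShareDemo
{
    static void Main()
    {
        string path = Path.GetTempFileName();
        File.WriteAllText(path, "broadcast payload");

        // Opened the way the fixed worker does: read access, other readers allowed.
        using FileStream first =
            File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read);

        // Succeeds only because the first handle granted FileShare.Read.
        // With File.Open(path, FileMode.Open, FileAccess.Read) — which uses
        // FileShare.None — this second open would throw an IOException.
        using FileStream second =
            File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read);

        Console.WriteLine("Both handles are open concurrently.");
    }
}
```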