-
Notifications
You must be signed in to change notification settings - Fork 315
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FileSystem extension skeleton #787
Changes from 5 commits
7c25fcb
cc8237f
f4e141b
bcde071
06d8b24
dfad910
50bc611
00d68a6
ec52232
4adbb64
16abe0a
3310897
d79e702
bd8579b
a3bf508
4fc9718
1bacdb2
53351a6
ac57039
8a85d60
9936e9f
c896d63
e1f33e8
a856829
6433b3c
4b906e0
dcf07d5
e41e1ef
f798618
8cc2378
74ce69d
257ce6c
9f92c7d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
namespace Microsoft.Spark.Extensions.FileSystem.E2ETest | ||
{ | ||
/// <summary> | ||
/// Constants shared by the FileSystem extension E2E test suite. | ||
/// </summary> | ||
internal static class Constants | ||
{ | ||
    /// <summary> | ||
    /// Name of the xUnit test collection used by the FileSystem E2E tests. | ||
    /// </summary> | ||
    public const string FileSystemTestContainerName = "FileSystem Tests"; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.Spark.E2ETest; | ||
using Xunit; | ||
|
||
namespace Microsoft.Spark.Extensions.FileSystem.E2ETest | ||
{ | ||
/// <summary> | ||
/// xUnit collection definition for the FileSystem E2E tests. Implementing | ||
/// <see cref="ICollectionFixture{TFixture}"/> for <see cref="SparkFixture"/> attaches that | ||
/// fixture to the collection named by <see cref="Constants.FileSystemTestContainerName"/>. | ||
/// </summary> | ||
[CollectionDefinition(Constants.FileSystemTestContainerName)] | ||
public class FileSystemTestCollection : ICollectionFixture<SparkFixture> | ||
{ | ||
// This class has no code, and is never created. Its purpose is simply | ||
// to be the place to apply [CollectionDefinition] and all the | ||
// ICollectionFixture<> interfaces. | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.IO; | ||
using Microsoft.Spark.E2ETest; | ||
using Microsoft.Spark.Sql; | ||
using Microsoft.Spark.UnitTest.TestUtils; | ||
using Xunit; | ||
|
||
namespace Microsoft.Spark.Extensions.FileSystem.E2ETest | ||
{ | ||
/// <summary> | ||
/// End-to-end tests for <see cref="FileSystem"/>, run against the Spark session supplied by | ||
/// <see cref="SparkFixture"/>. | ||
/// </summary> | ||
[Collection(Constants.FileSystemTestContainerName)] | ||
public class FileSystemTests | ||
{ | ||
    private readonly SparkSession _spark; | ||
|
||
    public FileSystemTests(SparkFixture fixture) => _spark = fixture.Spark; | ||
|
||
    /// <summary> | ||
    /// Checks that Delete returns true for a path that exists and false when the same path | ||
    /// is deleted a second time. | ||
    /// </summary> | ||
    [Fact] | ||
    public void TestSignatures() | ||
    { | ||
        using FileSystem fileSystem = FileSystem.Get(_spark.SparkContext); | ||
|
||
        using var scratchDirectory = new TemporaryDirectory(); | ||
        string tablePath = Path.Combine(scratchDirectory.Path, "temp-table"); | ||
|
||
        // Write a small parquet table so there is something on disk to delete. | ||
        var writer = _spark.Range(25).Write().Format("parquet"); | ||
        writer.Save(tablePath); | ||
|
||
        // The first call removes the table; the second has nothing left to remove. | ||
        Assert.True(fileSystem.Delete(tablePath, recursive: true)); | ||
        Assert.False(fileSystem.Delete(tablePath, recursive: true)); | ||
    } | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add another test that validates the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed |
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<TargetFramework>netcoreapp3.1</TargetFramework> | ||
<IsPackable>false</IsPackable> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are we setting this to false? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmm, I just copied this from the other extension |
||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\Microsoft.Spark.E2ETest\Microsoft.Spark.E2ETest.csproj" /> | ||
<ProjectReference Include="..\..\Microsoft.Spark\Microsoft.Spark.csproj" /> | ||
<ProjectReference Include="..\Microsoft.Spark.Extensions.FileSystem\Microsoft.Spark.Extensions.FileSystem.csproj" /> | ||
</ItemGroup> | ||
AFFogarty marked this conversation as resolved.
Show resolved
Hide resolved
|
||
</Project> |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System; | ||
using Microsoft.Spark.Interop; | ||
using Microsoft.Spark.Interop.Ipc; | ||
|
||
namespace Microsoft.Spark.Extensions.FileSystem | ||
{ | ||
/// <summary> | ||
/// An abstract base class for a fairly generic filesystem. It may be implemented as a distributed filesystem, or | ||
/// as a "local" one that reflects the locally-connected disk. The local version exists for small Hadoop instances | ||
/// and for testing. | ||
/// | ||
/// All user code that may potentially use the Hadoop Distributed File System should be written to use a FileSystem | ||
AFFogarty marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// object. The Hadoop DFS is a multi-machine system that appears as a single disk. It's useful because of its fault | ||
AFFogarty marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// tolerance and potentially very large capacity. | ||
/// </summary> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please reformat to keep each line within the 110 character limit There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops, I thought we increased it to 120 for some reason. Fixed. |
||
public abstract class FileSystem : IDisposable | ||
{ | ||
/// <summary> | ||
/// Returns the configured FileSystem implementation. | ||
/// </summary> | ||
/// <param name="sparkContext">The SparkContext whose configuration will be used.</param> | ||
/// <returns>The FileSystem.</returns> | ||
public static FileSystem Get(SparkContext sparkContext) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is weird. Is this like a factory? How can I create a new type of FileSystem? Why not just expose the Hadoop FileSystem directly? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the pattern from the Hadoop
For my .NET implementation, I've added an override of If we expose the class There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you are mimicking the hadoop There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added For this PR, I just wanted to provide an MVP skeleton so that it would be easy for community members to contribute APIs in additional PRs. I'm thinking that we can invite others to contribute There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like we cannot define What if we add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thoughts @rapoth? I know you wanted to put There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
{ | ||
// TODO: Expose hadoopConfiguration as a .NET class and add an override for Get() that takes it. | ||
JvmObjectReference hadoopConfiguration = (JvmObjectReference) | ||
((IJvmObjectReferenceProvider)sparkContext).Reference.Invoke("hadoopConfiguration"); | ||
|
||
return new JvmReferenceFileSystem( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the JVM implementation, |
||
(JvmObjectReference)SparkEnvironment.JvmBridge.CallStaticJavaMethod( | ||
"org.apache.hadoop.fs.FileSystem", | ||
"get", | ||
hadoopConfiguration)); | ||
} | ||
|
||
/// <summary> | ||
/// Delete a file. | ||
/// </summary> | ||
/// <param name="path">The path to delete.</param> | ||
/// <param name="recursive">If path is a directory and set to true, the directory is deleted else throws an | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exceeding 110 character limit. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
/// exception. In case of a file the recursive can be set to either true or false.</param> | ||
/// <returns>True if delete is successful else false.</returns> | ||
public abstract bool Delete(string path, bool recursive = true); | ||
|
||
/// <summary> | ||
/// Releases this FileSystem. Each concrete subclass defines what is released. | ||
/// </summary> | ||
public abstract void Dispose(); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.Spark.Interop; | ||
using Microsoft.Spark.Interop.Ipc; | ||
|
||
namespace Microsoft.Spark.Extensions.FileSystem | ||
{ | ||
/// <summary> | ||
/// <see cref="FileSystem"/> implementation that wraps a corresponding FileSystem object in the JVM. | ||
/// </summary> | ||
public class JvmReferenceFileSystem : FileSystem, IJvmObjectReferenceProvider | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am thinking that we should just put APIs into There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And just name the package as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I'll just make There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Made There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed implementation to use |
||
{ | ||
// Handle to the wrapped JVM-side object; Delete and Dispose forward their calls to it. | ||
private readonly JvmObjectReference _jvmObject; | ||
|
||
/// <summary> | ||
/// Wraps an existing JVM object reference. | ||
/// </summary> | ||
/// <param name="jvmObject">Reference to the JVM object to wrap.</param> | ||
internal JvmReferenceFileSystem(JvmObjectReference jvmObject) => _jvmObject = jvmObject; | ||
|
||
/// <summary> | ||
/// The JVM object reference backing this instance. | ||
/// </summary> | ||
JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; | ||
|
||
/// <summary> | ||
/// Delete a file. | ||
/// </summary> | ||
/// <param name="path">The path to delete.</param> | ||
/// <param name="recursive">If path is a directory and set to true, the directory is deleted else throws an | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exceeding 110 character limit |
||
/// exception. In case of a file the recursive can be set to either true or false.</param> | ||
/// <returns>True if delete is successful else false.</returns> | ||
public override bool Delete(string path, bool recursive = true) | ||
{ | ||
    // Build a JVM-side org.apache.hadoop.fs.Path from the string; that object is what gets | ||
    // passed to the wrapped object's "delete" method. | ||
    JvmObjectReference hadoopPath = | ||
        SparkEnvironment.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); | ||
|
||
    object deleted = _jvmObject.Invoke("delete", hadoopPath, recursive); | ||
    return (bool)deleted; | ||
} | ||
|
||
/// <summary> | ||
/// Invokes <c>close</c> on the wrapped JVM object. | ||
/// </summary> | ||
/// <remarks> | ||
/// NOTE(review): Hadoop's <c>FileSystem.get</c> may hand out a cached instance shared with | ||
/// other callers; confirm that closing it here cannot affect them. | ||
/// </remarks> | ||
public override void Dispose() => _jvmObject.Invoke("close"); | ||
AFFogarty marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<TargetFramework>netstandard2.0</TargetFramework> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\Microsoft.Spark\Microsoft.Spark.csproj" /> | ||
</ItemGroup> | ||
|
||
</Project> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this file needed? What else are we expecting to be added here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the pattern we're using in all the test projects.