Skip to content

Commit

Permalink
remove RX as the primary pipe for gRPC communications (#8281)
Browse files Browse the repository at this point in the history
### Issue describing the changes in this PR

resolves #8280; performance locally is a 5x+ speedup in throughput

the main crux of the new flow is described in src/WebJobs.Script.Grpc/Eventing/GrpcEventExtensions.cs, but in short: we establish a per-worker `Channel<InboundGrpcEvent>` and `Channel<OutboundGrpcEvent>` to handle the backlog; instead of publishing to RX, we publish to the writer of the relevant queue. Separately, we have an async worker deque data from the queues, and process accordingly. This is much more direct, avoids a *lot* of RX machinery, and creates isolation between workers.

### Pull request checklist

* [ ] My changes **do not** require documentation changes
    * [ ] Otherwise: Documentation issue linked to PR
* [ ] My changes **should not** be added to the release notes for the next release
    * [ ] Otherwise: I've added my notes to `release_notes.md`
* [ ] My changes **do not** need to be backported to a previous version
    * [ ] Otherwise: Backport tracked by issue/PR #issue_or_pr
* [ ] I have added all required tests (Unit tests, E2E tests)

<!-- Optional: delete if not applicable  -->
### Additional information

Additional PR information


Co-authored-by: Lilian Kasem <likasem@microsoft.com>
Co-authored-by: Marc Gravell <marcgravell@microsoft.com>
Co-authored-by: Nick Craver <nickcraver@microsoft.com>
Co-authored-by: Nick Craver <nrcraver@gmail.com>
  • Loading branch information
5 people authored Sep 2, 2022
1 parent 5d4e651 commit c112509
Show file tree
Hide file tree
Showing 18 changed files with 967 additions and 411 deletions.
340 changes: 278 additions & 62 deletions src/WebJobs.Script.Grpc/Channel/GrpcWorkerChannel.cs

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/WebJobs.Script.Grpc/Channel/GrpcWorkerChannelFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Reactive.Linq;
using Microsoft.Azure.WebJobs.Script.Diagnostics;
using Microsoft.Azure.WebJobs.Script.Eventing;
using Microsoft.Azure.WebJobs.Script.Grpc.Eventing;
using Microsoft.Azure.WebJobs.Script.Workers;
using Microsoft.Azure.WebJobs.Script.Workers.Rpc;
using Microsoft.Azure.WebJobs.Script.Workers.SharedMemoryDataTransfer;
Expand Down Expand Up @@ -47,6 +48,7 @@ public IRpcWorkerChannel Create(string scriptRootPath, string runtime, IMetricsL
throw new InvalidOperationException($"WorkerCofig for runtime: {runtime} not found");
}
string workerId = Guid.NewGuid().ToString();
_eventManager.AddGrpcChannels(workerId); // prepare the inbound/outbound dedicated channels
ILogger workerLogger = _loggerFactory.CreateLogger($"Worker.LanguageWorkerChannel.{runtime}.{workerId}");
IWorkerProcess rpcWorkerProcess = _rpcWorkerProcessFactory.Create(workerId, runtime, scriptRootPath, languageWorkerConfig);
return new GrpcWorkerChannel(
Expand Down
67 changes: 67 additions & 0 deletions src/WebJobs.Script.Grpc/Eventing/GrpcEventExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for license information.

using System;
using System.Threading.Channels;
using Microsoft.Azure.WebJobs.Script.Eventing;

namespace Microsoft.Azure.WebJobs.Script.Grpc.Eventing;

internal static class GrpcEventExtensions
{
// flow here is:
// 1) external request is proxied to the the GrpcWorkerChannel via one of the many Send* APIs, which writes
// to outbound-writer; this means we can have concurrent writes to outbound
// 2) if an out-of-process function is connected, a FunctionRpcService-EventStream will consume
// from outbound-reader (we'll allow for the multi-stream possibility, hence concurrent), and push it via gRPC
// 3) when the out-of-process function provides a response to FunctionRpcService-EventStream, it is written to
// inbound-writer (note we will allow for multi-stream possibility)
// 4) the GrpcWorkerChannel has a single dedicated consumer of inbound-reader, which it then marries to
// in-flight operations
internal static readonly UnboundedChannelOptions InboundOptions = new UnboundedChannelOptions
{
SingleReader = true, // see 4
SingleWriter = false, // see 3
AllowSynchronousContinuations = false,
};

internal static readonly UnboundedChannelOptions OutboundOptions = new UnboundedChannelOptions
{
SingleReader = false, // see 2
SingleWriter = false, // see 1
AllowSynchronousContinuations = false,
};

public static void AddGrpcChannels(this IScriptEventManager manager, string workerId)
{
var inbound = Channel.CreateUnbounded<InboundGrpcEvent>(InboundOptions);
if (manager.TryAddWorkerState(workerId, inbound))
{
var outbound = Channel.CreateUnbounded<OutboundGrpcEvent>(OutboundOptions);
if (manager.TryAddWorkerState(workerId, outbound))
{
return; // successfully added both
}
// we added the inbound but not the outbound; revert
manager.TryRemoveWorkerState(workerId, out inbound);
}
// this is not anticipated, so don't panic abount the allocs above
throw new ArgumentException("Duplicate worker id: " + workerId, nameof(workerId));
}

public static bool TryGetGrpcChannels(this IScriptEventManager manager, string workerId, out Channel<InboundGrpcEvent> inbound, out Channel<OutboundGrpcEvent> outbound)
=> manager.TryGetWorkerState(workerId, out inbound) & manager.TryGetWorkerState(workerId, out outbound);

public static void RemoveGrpcChannels(this IScriptEventManager manager, string workerId)
{
// remove any channels, and shut them down
if (manager.TryGetWorkerState<Channel<InboundGrpcEvent>>(workerId, out var inbound))
{
inbound.Writer.TryComplete();
}
if (manager.TryGetWorkerState<Channel<OutboundGrpcEvent>>(workerId, out var outbound))
{
outbound.Writer.TryComplete();
}
}
}
22 changes: 0 additions & 22 deletions src/WebJobs.Script.Grpc/Extensions/InboundGrpcEventExtensions.cs

This file was deleted.

113 changes: 64 additions & 49 deletions src/WebJobs.Script.Grpc/Server/FunctionRpcService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
// Licensed under the MIT License. See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Reactive.Concurrency;
using System.Reactive.Linq;
using System.Threading;
using System.Threading.Channels;
using System.Threading.Tasks;
using Grpc.Core;
using Microsoft.Azure.WebJobs.Script.Eventing;
Expand All @@ -21,7 +20,6 @@ namespace Microsoft.Azure.WebJobs.Script.Grpc
// TODO: move to WebJobs.Script.Grpc package and provide event stream abstraction
internal class FunctionRpcService : FunctionRpc.FunctionRpcBase
{
private readonly SemaphoreSlim _writeLock = new SemaphoreSlim(1, 1);
private readonly IScriptEventManager _eventManager;
private readonly ILogger _logger;

Expand All @@ -33,64 +31,48 @@ public FunctionRpcService(IScriptEventManager eventManager, ILogger<FunctionRpcS

public override async Task EventStream(IAsyncStreamReader<StreamingMessage> requestStream, IServerStreamWriter<StreamingMessage> responseStream, ServerCallContext context)
{
var cancelSource = new TaskCompletionSource<bool>();
IDictionary<string, IDisposable> outboundEventSubscriptions = new Dictionary<string, IDisposable>();

var cancelSource = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
var cts = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken);
CancellationTokenRegistration ctr = cts.Token.Register(static state => ((TaskCompletionSource<bool>)state).TrySetResult(false), cancelSource);
try
{
context.CancellationToken.Register(() => cancelSource.TrySetResult(false));
Func<Task<bool>> messageAvailable = async () =>
static Task<Task<bool>> MoveNextAsync(IAsyncStreamReader<StreamingMessage> requestStream, TaskCompletionSource<bool> cancelSource)
{
// GRPC does not accept cancellation tokens for individual reads, hence wrapper
var requestTask = requestStream.MoveNext(CancellationToken.None);
var completed = await Task.WhenAny(cancelSource.Task, requestTask);
return completed.Result;
};
return Task.WhenAny(cancelSource.Task, requestTask);
}

if (await messageAvailable())
if (await await MoveNextAsync(requestStream, cancelSource))
{
string workerId = requestStream.Current.StartStream.WorkerId;
_logger.LogDebug("Established RPC channel. WorkerId: {workerId}", workerId);
outboundEventSubscriptions.Add(workerId, _eventManager.OfType<OutboundGrpcEvent>()
.Where(evt => evt.WorkerId == workerId)
.ObserveOn(NewThreadScheduler.Default)
.Subscribe(async evt =>
var currentMessage = requestStream.Current;
// expect first operation (and only the first; we don't support re-registration) to be StartStream
if (currentMessage.ContentCase == MsgType.StartStream)
{
var workerId = currentMessage.StartStream?.WorkerId;
if (!string.IsNullOrEmpty(workerId) && _eventManager.TryGetGrpcChannels(workerId, out var inbound, out var outbound))
{
try
{
if (evt.MessageType == MsgType.InvocationRequest)
{
_logger.LogTrace("Writing invocation request invocationId: {invocationId} to workerId: {workerId}", evt.Message.InvocationRequest.InvocationId, workerId);
}
// send work
_ = PushFromOutboundToGrpc(workerId, responseStream, outbound.Reader, cts.Token);

try
// this loop is "pull from gRPC and push to inbound"
do
{
currentMessage = requestStream.Current;
if (currentMessage.ContentCase == MsgType.InvocationResponse && !string.IsNullOrEmpty(currentMessage.InvocationResponse?.InvocationId))
{
// WriteAsync only allows one pending write at a time, so we
// serialize access to the stream for each subscription
await _writeLock.WaitAsync();
await responseStream.WriteAsync(evt.Message);
_logger.LogTrace("Received invocation response for invocationId: {invocationId} from workerId: {workerId}", currentMessage.InvocationResponse.InvocationId, workerId);
}
finally
var newInbound = new InboundGrpcEvent(workerId, currentMessage);
if (!inbound.Writer.TryWrite(newInbound))
{
_writeLock.Release();
await inbound.Writer.WriteAsync(newInbound);
}
currentMessage = null; // allow old messages to be collected while we wait
}
catch (Exception subscribeEventEx)
{
_logger.LogError(subscribeEventEx, "Error writing message type {messageType} to workerId: {workerId}", evt.MessageType, workerId);
}
}));

do
{
var currentMessage = requestStream.Current;
if (currentMessage.InvocationResponse != null && !string.IsNullOrEmpty(currentMessage.InvocationResponse.InvocationId))
{
_logger.LogTrace("Received invocation response for invocationId: {invocationId} from workerId: {workerId}", currentMessage.InvocationResponse.InvocationId, workerId);
while (await await MoveNextAsync(requestStream, cancelSource));
}
_eventManager.Publish(new InboundGrpcEvent(workerId, currentMessage));
}
while (await messageAvailable());
}
}
catch (Exception rpcException)
Expand All @@ -101,14 +83,47 @@ public override async Task EventStream(IAsyncStreamReader<StreamingMessage> requ
}
finally
{
foreach (var sub in outboundEventSubscriptions)
{
sub.Value?.Dispose();
}
cts.Cancel();
ctr.Dispose();

// ensure cancellationSource task completes
cancelSource.TrySetResult(false);
}
}

private async Task PushFromOutboundToGrpc(string workerId, IServerStreamWriter<StreamingMessage> responseStream, ChannelReader<OutboundGrpcEvent> source, CancellationToken cancellationToken)
{
try
{
_logger.LogDebug("Established RPC channel. WorkerId: {workerId}", workerId);
await Task.Yield(); // free up the caller
while (await source.WaitToReadAsync(cancellationToken))
{
while (source.TryRead(out var evt))
{
if (evt.MessageType == MsgType.InvocationRequest)
{
_logger.LogTrace("Writing invocation request invocationId: {invocationId} to workerId: {workerId}", evt.Message.InvocationRequest.InvocationId, workerId);
}
try
{
await responseStream.WriteAsync(evt.Message);
}
catch (Exception subscribeEventEx)
{
_logger.LogError(subscribeEventEx, "Error writing message type {messageType} to workerId: {workerId}", evt.MessageType, workerId);
}
}
}
}
catch (OperationCanceledException oce) when (oce.CancellationToken == cancellationToken)
{
// that's fine; normaly exit through cancellation
}
catch (Exception ex)
{
_logger.LogError(ex, "Error pushing from outbound to gRPC");
}
}
}
}
1 change: 1 addition & 0 deletions src/WebJobs.Script.Grpc/WebJobs.Script.Grpc.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
<PackageReference Include="StyleCop.Analyzers" Version="1.1.0-beta004">
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="System.Threading.Channels" Version="6.0.0" />
</ItemGroup>

<ItemGroup>
Expand Down
6 changes: 6 additions & 0 deletions src/WebJobs.Script/Eventing/IScriptEventManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,11 @@ namespace Microsoft.Azure.WebJobs.Script.Eventing
public interface IScriptEventManager : IObservable<ScriptEvent>
{
void Publish(ScriptEvent scriptEvent);

bool TryAddWorkerState<T>(string workerId, T state);

bool TryGetWorkerState<T>(string workerId, out T state);

bool TryRemoveWorkerState<T>(string workerId, out T state);
}
}
35 changes: 34 additions & 1 deletion src/WebJobs.Script/Eventing/ScriptEventManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
// Licensed under the MIT License. See License.txt in the project root for license information.

using System;
using System.Collections.Concurrent;
using System.Reactive.Subjects;

namespace Microsoft.Azure.WebJobs.Script.Eventing
{
public sealed class ScriptEventManager : IScriptEventManager, IDisposable
public class ScriptEventManager : IScriptEventManager, IDisposable
{
private readonly Subject<ScriptEvent> _subject = new Subject<ScriptEvent>();
private readonly ConcurrentDictionary<(string, Type), object> _workerState = new ();

private bool _disposed = false;

public void Publish(ScriptEvent scriptEvent)
Expand Down Expand Up @@ -47,5 +50,35 @@ private void Dispose(bool disposing)
}

public void Dispose() => Dispose(true);

bool IScriptEventManager.TryAddWorkerState<T>(string workerId, T state)
{
var key = (workerId, typeof(T));
return _workerState.TryAdd(key, state);
}

bool IScriptEventManager.TryGetWorkerState<T>(string workerId, out T state)
{
var key = (workerId, typeof(T));
if (_workerState.TryGetValue(key, out var tmp) && tmp is T typed)
{
state = typed;
return true;
}
state = default;
return false;
}

bool IScriptEventManager.TryRemoveWorkerState<T>(string workerId, out T state)
{
var key = (workerId, typeof(T));
if (_workerState.TryRemove(key, out var tmp) && tmp is T typed)
{
state = typed;
return true;
}
state = default;
return false;
}
}
}
1 change: 1 addition & 0 deletions src/WebJobs.Script/Workers/Rpc/RpcWorkerConstants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ public static class RpcWorkerConstants

// Host Capabilities
public const string V2Compatable = "V2Compatable";
public const string MultiStream = nameof(MultiStream);

// dotnet executable file path components
public const string DotNetExecutableName = "dotnet";
Expand Down
Loading

0 comments on commit c112509

Please sign in to comment.