[src] CUDA Online/Offline pipelines + light batched nnet3 driver #3568

Merged · 14 commits · May 1, 2020
8 changes: 6 additions & 2 deletions src/cudadecoder/Makefile
@@ -13,8 +13,12 @@ endif

TESTFILES =

OBJFILES = batched-threaded-nnet3-cuda-pipeline.o decodable-cumatrix.o \
cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o
OBJFILES = cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o \
batched-threaded-nnet3-cuda-online-pipeline.o \
batched-threaded-nnet3-cuda-pipeline.o \
batched-threaded-nnet3-cuda-pipeline2.o \
batched-static-nnet3.o batched-static-nnet3-kernels.o \
decodable-cumatrix.o

LIBNAME = kaldi-cudadecoder

208 changes: 208 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.cu
@@ -0,0 +1,208 @@
// cudadecoder/batched-static-nnet3-kernels.cu
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cudadecoder/batched-static-nnet3-kernels.h"

#include <stdio.h>
namespace kaldi {
namespace cuda_decoder {

__global__ void build_batch_with_context_kernel(
BatchedStaticNnet3KernelParams params) {
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
batch_slot += gridDim.z) {
BatchSlotAssignment batch_assign =
params.d_batch_slot_assignement[batch_slot];
const BaseFloat *d_batch_slot_features = batch_assign.d_features;
BaseFloat *d_channel_context =
&params
.d_all_context_frames[batch_assign.ichannel *
params.d_all_context_frames_channel_stride];
BaseFloat *d_batch_slot_with_context =
&params.d_batch_with_context[params.d_batch_with_context_batch_stride *
batch_slot];

int n_frames_available =
batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
int n_frames_to_set = n_frames_available;
int n_left_context_frames_from_frame0 = 0;
if (batch_assign.n_frames_already_in_context == 0) {
// First chunk for that utterance. Generating left context by duplicating
// frame0
n_frames_to_set += params.total_nnet_left_context;
n_left_context_frames_from_frame0 = params.total_nnet_left_context;
}

for (int iframe = blockIdx.y; iframe < n_frames_to_set;
iframe += gridDim.y) {
for (int idim = threadIdx.x; idim < params.input_dim;
idim += blockDim.x) {
if (iframe < n_left_context_frames_from_frame0) {
d_batch_slot_with_context
[iframe * params.d_batch_with_context_frame_stride + idim] =
d_batch_slot_features[0 + idim]; // frame 0
} else if (iframe < (n_left_context_frames_from_frame0 +
batch_assign.n_frames_already_in_context)) {
// These frames come from the saved context
int src_iframe_in_saved_context =
iframe - n_left_context_frames_from_frame0;
d_batch_slot_with_context[iframe *
params
.d_batch_with_context_frame_stride +
idim] =
d_channel_context[src_iframe_in_saved_context *
params.d_all_context_frames_frame_stride +
idim];
} else {
// Now we are moving the frames coming from the new chunk
int src_iframe_in_new_chunk =
iframe - n_left_context_frames_from_frame0 -
batch_assign.n_frames_already_in_context;
d_batch_slot_with_context
[iframe * params.d_batch_with_context_frame_stride + idim] =
d_batch_slot_features[src_iframe_in_new_chunk *
params.d_features_frame_stride +
idim];
}
}

if (iframe == 0 &&
params.d_batch_ivectors) { // one CTA moves the ivectors
for (int idim = threadIdx.x; idim < params.ivector_dim;
idim += blockDim.x) {
params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride +
idim] = batch_assign.d_ivectors[idim];
}
}
}
}
}

void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
build_batch_with_context_kernel<<<grid, block, 0, stream>>>(params);
}

__global__ void build_batch_with_context_context_flush_kernel(
BatchedStaticNnet3KernelParams params) {
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
batch_slot += gridDim.z) {
BatchSlotAssignment batch_assign =
params.d_batch_slot_assignement[batch_slot];
BaseFloat *d_channel_context =
&params
.d_all_context_frames[batch_assign.ichannel *
params.d_all_context_frames_channel_stride];
BaseFloat *d_batch_slot_with_context =
&params.d_batch_with_context[params.d_batch_with_context_batch_stride *
batch_slot];

int n_frames_in_context = batch_assign.n_frames_already_in_context;
int n_frames_to_set = n_frames_in_context + params.total_nnet_right_context;

for (int iframe = blockIdx.y; iframe < n_frames_to_set;
iframe += gridDim.y) {
for (int idim = threadIdx.x; idim < params.input_dim;
idim += blockDim.x) {
if (iframe < n_frames_in_context) {
d_batch_slot_with_context
[iframe * params.d_batch_with_context_frame_stride +
idim] = d_channel_context
[iframe * params.d_all_context_frames_frame_stride + idim];
} else if (iframe < n_frames_to_set) {
// Generating right context from last frame
int src_iframe_in_saved_context = n_frames_in_context - 1;
d_batch_slot_with_context[iframe *
params
.d_batch_with_context_frame_stride +
idim] =
d_channel_context[src_iframe_in_saved_context *
params.d_all_context_frames_frame_stride +
idim];
}
}

if (iframe == 0 &&
params.d_batch_ivectors) { // one CTA moves the ivectors
for (int idim = threadIdx.x; idim < params.ivector_dim;
idim += blockDim.x) {
params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride +
idim] = batch_assign.d_ivectors[idim];
}
}
}
}
}

void BuildBatchWithContextKernelContextFlush(
const dim3 &grid, const dim3 &block, const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
build_batch_with_context_context_flush_kernel<<<grid, block, 0, stream>>>(
params);
}

__global__ void save_context_from_batch_kernel(
BatchedStaticNnet3KernelParams params) {
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
batch_slot += gridDim.z) {
BatchSlotAssignment batch_assign =
params.d_batch_slot_assignement[batch_slot];

// Real frames : does not include frame0 copies for left context
int n_real_frames_available =
batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
// total frames : includes frame0 copies
int total_frames_in_batch_slot = n_real_frames_available;
if (batch_assign.n_frames_already_in_context == 0) {
// First chunk for that utterance. We generated left context by
// duplicating frame0
total_frames_in_batch_slot += params.total_nnet_left_context;
}
// We only need to save up to total_nnet_context frames as context
int n_to_copy = min(total_frames_in_batch_slot, params.total_nnet_context);
int copy_from_frame = total_frames_in_batch_slot - n_to_copy;
BaseFloat *d_batch_slot_with_context =
&params.d_batch_with_context[params.d_batch_with_context_batch_stride *
batch_slot];
BaseFloat *d_channel_context =
&params
.d_all_context_frames[batch_assign.ichannel *
params.d_all_context_frames_channel_stride];

for (int dst_iframe = blockIdx.y; dst_iframe < n_to_copy;
dst_iframe += gridDim.y) {
int src_iframe = copy_from_frame + dst_iframe;
for (int idim = threadIdx.x; idim < params.input_dim;
idim += blockDim.x) {
d_channel_context[dst_iframe *
params.d_all_context_frames_frame_stride +
idim] = d_batch_slot_with_context
[src_iframe * params.d_batch_with_context_frame_stride + idim];
}
}
}
}

void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
save_context_from_batch_kernel<<<grid, block, 0, stream>>>(params);
}

} // namespace cuda_decoder
} // namespace kaldi
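A minimal host-side sketch of how a launch for these kernels could be sized, mirroring the indexing above (threadIdx.x strides over input_dim, blockIdx.y over frames, blockIdx.z over batch slots, with the grid-stride loops absorbing any excess). The helper name, the 64-thread block and the max_frames_per_slot argument are illustrative assumptions, not code from this PR; it assumes batched-static-nnet3-kernels.h is included and that it lives in the kaldi::cuda_decoder namespace.

// Illustrative sketch only (not part of this PR). Sizes the launch so that
// threadIdx.x covers input_dim, blockIdx.y covers frames in a batch slot and
// blockIdx.z covers batch slots; smaller grids are still correct because the
// kernels use grid-stride loops over all three dimensions.
void LaunchBuildBatchWithContextSketch(
    const BatchedStaticNnet3KernelParams &params, cudaStream_t stream,
    int max_frames_per_slot) {
  dim3 block(64, 1, 1);  // 64 threads stride over the feature dimension
  dim3 grid(1, max_frames_per_slot, params.batch_size);
  BuildBatchWithContextKernel(grid, block, stream, params);
}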
87 changes: 87 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.h
@@ -0,0 +1,87 @@
// cudadecoder/batched-static-nnet3-kernels.h
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1

#include <cuda_runtime_api.h>
#include "base/kaldi-types.h"

#ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
#define KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_

namespace kaldi {
namespace cuda_decoder {

// Describes what each batch slot is made of. Used by the context switch kernels
struct BatchSlotAssignment {
BaseFloat *d_features;
BaseFloat *d_ivectors;
int ichannel;
int n_frames_already_in_context;
int n_new_frames;
};

struct BatchedStaticNnet3KernelParams {
const BaseFloat *d_all_new_features;
const BatchSlotAssignment *d_batch_slot_assignement;
BaseFloat *d_all_context_frames;
BaseFloat *d_batch_with_context;
BaseFloat *d_batch_ivectors;
int d_batch_ivectors_stride;
int batch_size;
int d_features_frame_stride;
int d_ivectors_frame_stride;
int d_all_context_frames_frame_stride;
int d_batch_with_context_frame_stride;
int d_all_context_frames_channel_stride;
int d_batch_with_context_batch_stride;
int input_dim;
int ivector_dim;
int total_nnet_context;
int total_nnet_left_context;
int total_nnet_right_context;
int input_frames_per_chunk_with_context;
};

// Takes as input strided pointers to new chunks [chk0, chk1, chk2...]
// associated with channels [ch0, ch1, ch2...]
// and builds a contiguous batch such as:
// Batch with context:
// row0: [left_context(ch0), chk0]
// row1: [left_context(ch1), chk1]
// row2: [left_context(ch2), chk2]
// The left context is either the end of a previous chunk for that channel, or
// duplications of frame0 if this is the first chunk for that channel. The end
// of each chunk for each row will then be used as the right context.
void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params);

// Same as BuildBatchWithContextKernel, except that the last saved context
// frame is replicated to create the right context (used to flush the context
// at the end of an utterance)
void BuildBatchWithContextKernelContextFlush(
const dim3 &grid, const dim3 &block, const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params);
void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params);

} // namespace cuda_decoder
} // namespace kaldi

#endif // KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
#endif // HAVE_CUDA
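A hypothetical index helper, shown only to illustrate the layout described above: it computes the flat offset of a (batch_slot, frame, dim) coordinate inside d_batch_with_context from the strides in BatchedStaticNnet3KernelParams, mirroring the inline indexing done by the kernels in batched-static-nnet3-kernels.cu. The helper itself is not part of this PR.

// Hypothetical helper (not in this PR): flat offset of (batch_slot, iframe,
// idim) within params.d_batch_with_context, matching the kernels' indexing.
__host__ __device__ inline int BatchWithContextIndex(
    const BatchedStaticNnet3KernelParams &params, int batch_slot, int iframe,
    int idim) {
  return batch_slot * params.d_batch_with_context_batch_stride +
         iframe * params.d_batch_with_context_frame_stride + idim;
}

For example, the feature value for dimension d of the first left-context frame in batch slot b would sit at params.d_batch_with_context[BatchWithContextIndex(params, b, 0, d)].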