Skip to content

Commit

Permalink
[src] CUDA Online/Offline pipelines + light batched nnet3 driver (#3568)
Browse files Browse the repository at this point in the history
  • Loading branch information
hugovbraun authored May 1, 2020
1 parent 7a0bf8a commit 0bca93e
Show file tree
Hide file tree
Showing 22 changed files with 4,122 additions and 929 deletions.
8 changes: 6 additions & 2 deletions src/cudadecoder/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,12 @@ endif

TESTFILES =

OBJFILES = batched-threaded-nnet3-cuda-pipeline.o decodable-cumatrix.o \
cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o
OBJFILES = cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o \
batched-threaded-nnet3-cuda-online-pipeline.o \
batched-threaded-nnet3-cuda-pipeline.o \
batched-threaded-nnet3-cuda-pipeline2.o \
batched-static-nnet3.o batched-static-nnet3-kernels.o \
decodable-cumatrix.o

LIBNAME = kaldi-cudadecoder

Expand Down
208 changes: 208 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
// cudadecoder/batched-static-nnet3-kernels.cu
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cudadecoder/batched-static-nnet3-kernels.h"

#include <stdio.h>
namespace kaldi {
namespace cuda_decoder {

__global__ void build_batch_with_context_kernel(
BatchedStaticNnet3KernelParams params) {
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
batch_slot += gridDim.z) {
BatchSlotAssignment batch_assign =
params.d_batch_slot_assignement[batch_slot];
const BaseFloat *d_batch_slot_features = batch_assign.d_features;
BaseFloat *d_channel_context =
&params
.d_all_context_frames[batch_assign.ichannel *
params.d_all_context_frames_channel_stride];
BaseFloat *d_batch_slot_with_context =
&params.d_batch_with_context[params.d_batch_with_context_batch_stride *
batch_slot];

int n_frames_available =
batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
int n_frames_to_set = n_frames_available;
int n_left_context_frames_from_frame0 = 0;
if (batch_assign.n_frames_already_in_context == 0) {
// First chunk for that utterance. Generating left context by duplicating
// frame0
n_frames_to_set += params.total_nnet_left_context;
n_left_context_frames_from_frame0 = params.total_nnet_left_context;
}

for (int iframe = blockIdx.y; iframe < n_frames_to_set;
iframe += gridDim.y) {
for (int idim = threadIdx.x; idim < params.input_dim;
idim += blockDim.x) {
if (iframe < n_left_context_frames_from_frame0) {
d_batch_slot_with_context
[iframe * params.d_batch_with_context_frame_stride + idim] =
d_batch_slot_features[0 + idim]; // frame 0
} else if (iframe < (n_left_context_frames_from_frame0 +
batch_assign.n_frames_already_in_context)) {
// Those are the frames coming from context
int src_iframe_in_saved_context =
iframe - n_left_context_frames_from_frame0;
d_batch_slot_with_context[iframe *
params
.d_batch_with_context_frame_stride +
idim] =
d_channel_context[src_iframe_in_saved_context *
params.d_all_context_frames_frame_stride +
idim];
} else {
// Now we are moving the frames coming from the new chunk
int src_iframe_in_new_chunk =
iframe - n_left_context_frames_from_frame0 -
batch_assign.n_frames_already_in_context;
d_batch_slot_with_context
[iframe * params.d_batch_with_context_frame_stride + idim] =
d_batch_slot_features[src_iframe_in_new_chunk *
params.d_features_frame_stride +
idim];
}
}

if (iframe == 0 &&
params.d_batch_ivectors) { // one CTA moves the ivectors
for (int idim = threadIdx.x; idim < params.ivector_dim;
idim += blockDim.x) {
params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride +
idim] = batch_assign.d_ivectors[idim];
}
}
}
}
}

void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
build_batch_with_context_kernel<<<grid, block, 0, stream>>>(params);
}

__global__ void build_batch_with_context_context_flush_kernel(
BatchedStaticNnet3KernelParams params) {
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
batch_slot += gridDim.z) {
BatchSlotAssignment batch_assign =
params.d_batch_slot_assignement[batch_slot];
BaseFloat *d_channel_context =
&params
.d_all_context_frames[batch_assign.ichannel *
params.d_all_context_frames_channel_stride];
BaseFloat *d_batch_slot_with_context =
&params.d_batch_with_context[params.d_batch_with_context_batch_stride *
batch_slot];

int n_frames_in_context = batch_assign.n_frames_already_in_context;
int n_frames_to_set = n_frames_in_context + params.total_nnet_right_context;

for (int iframe = blockIdx.y; iframe < n_frames_to_set;
iframe += gridDim.y) {
for (int idim = threadIdx.x; idim < params.input_dim;
idim += blockDim.x) {
if (iframe < n_frames_in_context) {
d_batch_slot_with_context
[iframe * params.d_batch_with_context_frame_stride +
idim] = d_channel_context
[iframe * params.d_all_context_frames_frame_stride + idim];
} else if (iframe < n_frames_to_set) {
// Generating right context from last frame
int src_iframe_in_saved_context = n_frames_in_context - 1;
d_batch_slot_with_context[iframe *
params
.d_batch_with_context_frame_stride +
idim] =
d_channel_context[src_iframe_in_saved_context *
params.d_all_context_frames_frame_stride +
idim];
}
}

if (iframe == 0 &&
params.d_batch_ivectors) { // one CTA moves the ivectors
for (int idim = threadIdx.x; idim < params.ivector_dim;
idim += blockDim.x) {
params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride +
idim] = batch_assign.d_ivectors[idim];
}
}
}
}
}

void BuildBatchWithContextKernelContextFlush(
const dim3 &grid, const dim3 &block, const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
build_batch_with_context_context_flush_kernel<<<grid, block, 0, stream>>>(
params);
}

__global__ void save_context_from_batch_kernel(
BatchedStaticNnet3KernelParams params) {
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
batch_slot += gridDim.z) {
BatchSlotAssignment batch_assign =
params.d_batch_slot_assignement[batch_slot];

// Real frames : does not include frame0 copies for left context
int n_real_frames_available =
batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
// total frames : includes frame0 copies
int total_frames_in_batch_slot = n_real_frames_available;
if (batch_assign.n_frames_already_in_context == 0) {
// First chunk for that utterance. We generated left context by
// duplicating frame0
total_frames_in_batch_slot += params.total_nnet_left_context;
}
// total frames : includes frame0 copies
int n_to_copy = min(total_frames_in_batch_slot, params.total_nnet_context);
int copy_from_frame = total_frames_in_batch_slot - n_to_copy;
BaseFloat *d_batch_slot_with_context =
&params.d_batch_with_context[params.d_batch_with_context_batch_stride *
batch_slot];
BaseFloat *d_channel_context =
&params
.d_all_context_frames[batch_assign.ichannel *
params.d_all_context_frames_channel_stride];

for (int dst_iframe = blockIdx.y; dst_iframe < n_to_copy;
dst_iframe += gridDim.y) {
int src_iframe = copy_from_frame + dst_iframe;
for (int idim = threadIdx.x; idim < params.input_dim;
idim += blockDim.x) {
d_channel_context[dst_iframe *
params.d_all_context_frames_frame_stride +
idim] = d_batch_slot_with_context
[src_iframe * params.d_batch_with_context_frame_stride + idim];
}
}
}
}

void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
save_context_from_batch_kernel<<<grid, block, 0, stream>>>(params);
}

} // namespace cuda_decoder
} // namespace kaldi
87 changes: 87 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// cudadecoder/batched-static-nnet3-kernels.h
//
// Copyright (c) 2019; NVIDIA CORPORATION. All rights reserved.
// Hugo Braun
//
// Licensed under the Apache License; Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing; software
// distributed under the License is distributed on an "AS IS" BASIS;
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND; either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1

#include <cuda_runtime_api.h>
#include "base/kaldi-types.h"

#ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
#define KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_

namespace kaldi {
namespace cuda_decoder {

// Describe what each batch slot is made of. Used by the context switch kernels
struct BatchSlotAssignment {
BaseFloat *d_features;
BaseFloat *d_ivectors;
int ichannel;
int n_frames_already_in_context;
int n_new_frames;
};

struct BatchedStaticNnet3KernelParams {
const BaseFloat *d_all_new_features;
const BatchSlotAssignment *d_batch_slot_assignement;
BaseFloat *d_all_context_frames;
BaseFloat *d_batch_with_context;
BaseFloat *d_batch_ivectors;
int d_batch_ivectors_stride;
int batch_size;
int d_features_frame_stride;
int d_ivectors_frame_stride;
int d_all_context_frames_frame_stride;
int d_batch_with_context_frame_stride;
int d_all_context_frames_channel_stride;
int d_batch_with_context_batch_stride;
int input_dim;
int ivector_dim;
int total_nnet_context;
int total_nnet_left_context;
int total_nnet_right_context;
int input_frames_per_chunk_with_context;
};

// Takes as a input strided new chunks ptrs [chk0, chk1, chk2..]
// associated to channels [ch0, ch1, ch2...]
// And build a continuous batch such as:
// Batch with context:
// row0: [left_context(ch0), chk0]
// row0: [left_context(ch1), chk1]
// row0: [left_context(ch2), chk2]
// With left context being either part of a previous chunk for that channel, or
// just duplications of frame0 if this is the first chunk for that channel The
// end of each chunk for each row will then be used as a right context
void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params);

// Same thing than BuildBatchWithContextKernelContextFlush, except that the
// final frame is replicated to create the right context
void BuildBatchWithContextKernelContextFlush(
const dim3 &grid, const dim3 &block, const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params);
void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params);

} // namespace cuda_decoder
} // namespace kaldi

#endif // KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
#endif // HAVE_CUDA
Loading

4 comments on commit 0bca93e

@nalbion
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are some of the files .cu? The Windows build is expecting to find .cc files:

ERROR?: file D:\a\vosk-api\vosk-api\kaldi\windows..\src\cudadecoder\cuda-decoder-kernels.cc not found - project kaldi-cudadecoder
ERROR?: file D:\a\vosk-api\vosk-api\kaldi\windows..\src\cudadecoder\batched-static-nnet3-kernels.cc not found - project kaldi-cudadecoder

https://github.com/nalbion/vosk-api/runs/677387581?check_suite_focus=true#step:10:53

@danpovey
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA code. I don't think we've ever tested building with CUDA on Windows.

@jtrmal
Copy link
Contributor

@jtrmal jtrmal commented on 0bca93e May 15, 2020 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Hap-Zhang
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, all. I noticed that in the file "src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc" which mentioned "IMPORTANT: This pipeline is deprecated. Please switch to cudadecoderbin/batch-wav-nnet3-cuda2 (binary) or cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h (class)". Is any problem in "src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc"? We are using this version now, I want to know if it is necessary to switch to the latest version, thanks.

Please sign in to comment.