-
Notifications
You must be signed in to change notification settings - Fork 5.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[src] CUDA Online/Offline pipelines + light batched nnet3 driver (#3568)
- Loading branch information
1 parent
7a0bf8a
commit 0bca93e
Showing
22 changed files
with
4,122 additions
and
929 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
// cudadecoder/batched-static-nnet3-kernels.cu | ||
// | ||
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. | ||
// Hugo Braun | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "cudadecoder/batched-static-nnet3-kernels.h" | ||
|
||
#include <stdio.h> | ||
namespace kaldi { | ||
namespace cuda_decoder { | ||
|
||
__global__ void build_batch_with_context_kernel( | ||
BatchedStaticNnet3KernelParams params) { | ||
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size; | ||
batch_slot += gridDim.z) { | ||
BatchSlotAssignment batch_assign = | ||
params.d_batch_slot_assignement[batch_slot]; | ||
const BaseFloat *d_batch_slot_features = batch_assign.d_features; | ||
BaseFloat *d_channel_context = | ||
¶ms | ||
.d_all_context_frames[batch_assign.ichannel * | ||
params.d_all_context_frames_channel_stride]; | ||
BaseFloat *d_batch_slot_with_context = | ||
¶ms.d_batch_with_context[params.d_batch_with_context_batch_stride * | ||
batch_slot]; | ||
|
||
int n_frames_available = | ||
batch_assign.n_frames_already_in_context + batch_assign.n_new_frames; | ||
int n_frames_to_set = n_frames_available; | ||
int n_left_context_frames_from_frame0 = 0; | ||
if (batch_assign.n_frames_already_in_context == 0) { | ||
// First chunk for that utterance. Generating left context by duplicating | ||
// frame0 | ||
n_frames_to_set += params.total_nnet_left_context; | ||
n_left_context_frames_from_frame0 = params.total_nnet_left_context; | ||
} | ||
|
||
for (int iframe = blockIdx.y; iframe < n_frames_to_set; | ||
iframe += gridDim.y) { | ||
for (int idim = threadIdx.x; idim < params.input_dim; | ||
idim += blockDim.x) { | ||
if (iframe < n_left_context_frames_from_frame0) { | ||
d_batch_slot_with_context | ||
[iframe * params.d_batch_with_context_frame_stride + idim] = | ||
d_batch_slot_features[0 + idim]; // frame 0 | ||
} else if (iframe < (n_left_context_frames_from_frame0 + | ||
batch_assign.n_frames_already_in_context)) { | ||
// Those are the frames coming from context | ||
int src_iframe_in_saved_context = | ||
iframe - n_left_context_frames_from_frame0; | ||
d_batch_slot_with_context[iframe * | ||
params | ||
.d_batch_with_context_frame_stride + | ||
idim] = | ||
d_channel_context[src_iframe_in_saved_context * | ||
params.d_all_context_frames_frame_stride + | ||
idim]; | ||
} else { | ||
// Now we are moving the frames coming from the new chunk | ||
int src_iframe_in_new_chunk = | ||
iframe - n_left_context_frames_from_frame0 - | ||
batch_assign.n_frames_already_in_context; | ||
d_batch_slot_with_context | ||
[iframe * params.d_batch_with_context_frame_stride + idim] = | ||
d_batch_slot_features[src_iframe_in_new_chunk * | ||
params.d_features_frame_stride + | ||
idim]; | ||
} | ||
} | ||
|
||
if (iframe == 0 && | ||
params.d_batch_ivectors) { // one CTA moves the ivectors | ||
for (int idim = threadIdx.x; idim < params.ivector_dim; | ||
idim += blockDim.x) { | ||
params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride + | ||
idim] = batch_assign.d_ivectors[idim]; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block, | ||
const cudaStream_t &stream, | ||
const BatchedStaticNnet3KernelParams ¶ms) { | ||
build_batch_with_context_kernel<<<grid, block, 0, stream>>>(params); | ||
} | ||
|
||
__global__ void build_batch_with_context_context_flush_kernel( | ||
BatchedStaticNnet3KernelParams params) { | ||
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size; | ||
batch_slot += gridDim.z) { | ||
BatchSlotAssignment batch_assign = | ||
params.d_batch_slot_assignement[batch_slot]; | ||
BaseFloat *d_channel_context = | ||
¶ms | ||
.d_all_context_frames[batch_assign.ichannel * | ||
params.d_all_context_frames_channel_stride]; | ||
BaseFloat *d_batch_slot_with_context = | ||
¶ms.d_batch_with_context[params.d_batch_with_context_batch_stride * | ||
batch_slot]; | ||
|
||
int n_frames_in_context = batch_assign.n_frames_already_in_context; | ||
int n_frames_to_set = n_frames_in_context + params.total_nnet_right_context; | ||
|
||
for (int iframe = blockIdx.y; iframe < n_frames_to_set; | ||
iframe += gridDim.y) { | ||
for (int idim = threadIdx.x; idim < params.input_dim; | ||
idim += blockDim.x) { | ||
if (iframe < n_frames_in_context) { | ||
d_batch_slot_with_context | ||
[iframe * params.d_batch_with_context_frame_stride + | ||
idim] = d_channel_context | ||
[iframe * params.d_all_context_frames_frame_stride + idim]; | ||
} else if (iframe < n_frames_to_set) { | ||
// Generating right context from last frame | ||
int src_iframe_in_saved_context = n_frames_in_context - 1; | ||
d_batch_slot_with_context[iframe * | ||
params | ||
.d_batch_with_context_frame_stride + | ||
idim] = | ||
d_channel_context[src_iframe_in_saved_context * | ||
params.d_all_context_frames_frame_stride + | ||
idim]; | ||
} | ||
} | ||
|
||
if (iframe == 0 && | ||
params.d_batch_ivectors) { // one CTA moves the ivectors | ||
for (int idim = threadIdx.x; idim < params.ivector_dim; | ||
idim += blockDim.x) { | ||
params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride + | ||
idim] = batch_assign.d_ivectors[idim]; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
void BuildBatchWithContextKernelContextFlush( | ||
const dim3 &grid, const dim3 &block, const cudaStream_t &stream, | ||
const BatchedStaticNnet3KernelParams ¶ms) { | ||
build_batch_with_context_context_flush_kernel<<<grid, block, 0, stream>>>( | ||
params); | ||
} | ||
|
||
__global__ void save_context_from_batch_kernel( | ||
BatchedStaticNnet3KernelParams params) { | ||
for (int batch_slot = blockIdx.z; batch_slot < params.batch_size; | ||
batch_slot += gridDim.z) { | ||
BatchSlotAssignment batch_assign = | ||
params.d_batch_slot_assignement[batch_slot]; | ||
|
||
// Real frames : does not include frame0 copies for left context | ||
int n_real_frames_available = | ||
batch_assign.n_frames_already_in_context + batch_assign.n_new_frames; | ||
// total frames : includes frame0 copies | ||
int total_frames_in_batch_slot = n_real_frames_available; | ||
if (batch_assign.n_frames_already_in_context == 0) { | ||
// First chunk for that utterance. We generated left context by | ||
// duplicating frame0 | ||
total_frames_in_batch_slot += params.total_nnet_left_context; | ||
} | ||
// total frames : includes frame0 copies | ||
int n_to_copy = min(total_frames_in_batch_slot, params.total_nnet_context); | ||
int copy_from_frame = total_frames_in_batch_slot - n_to_copy; | ||
BaseFloat *d_batch_slot_with_context = | ||
¶ms.d_batch_with_context[params.d_batch_with_context_batch_stride * | ||
batch_slot]; | ||
BaseFloat *d_channel_context = | ||
¶ms | ||
.d_all_context_frames[batch_assign.ichannel * | ||
params.d_all_context_frames_channel_stride]; | ||
|
||
for (int dst_iframe = blockIdx.y; dst_iframe < n_to_copy; | ||
dst_iframe += gridDim.y) { | ||
int src_iframe = copy_from_frame + dst_iframe; | ||
for (int idim = threadIdx.x; idim < params.input_dim; | ||
idim += blockDim.x) { | ||
d_channel_context[dst_iframe * | ||
params.d_all_context_frames_frame_stride + | ||
idim] = d_batch_slot_with_context | ||
[src_iframe * params.d_batch_with_context_frame_stride + idim]; | ||
} | ||
} | ||
} | ||
} | ||
|
||
void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block, | ||
const cudaStream_t &stream, | ||
const BatchedStaticNnet3KernelParams ¶ms) { | ||
save_context_from_batch_kernel<<<grid, block, 0, stream>>>(params); | ||
} | ||
|
||
} // namespace cuda_decoder | ||
} // namespace kaldi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
// cudadecoder/batched-static-nnet3-kernels.h | ||
// | ||
// Copyright (c) 2019; NVIDIA CORPORATION. All rights reserved. | ||
// Hugo Braun | ||
// | ||
// Licensed under the Apache License; Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing; software | ||
// distributed under the License is distributed on an "AS IS" BASIS; | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND; either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#if HAVE_CUDA == 1 | ||
|
||
#include <cuda_runtime_api.h> | ||
#include "base/kaldi-types.h" | ||
|
||
#ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ | ||
#define KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ | ||
|
||
namespace kaldi { | ||
namespace cuda_decoder { | ||
|
||
// Describe what each batch slot is made of. Used by the context switch kernels | ||
struct BatchSlotAssignment { | ||
BaseFloat *d_features; | ||
BaseFloat *d_ivectors; | ||
int ichannel; | ||
int n_frames_already_in_context; | ||
int n_new_frames; | ||
}; | ||
|
||
struct BatchedStaticNnet3KernelParams { | ||
const BaseFloat *d_all_new_features; | ||
const BatchSlotAssignment *d_batch_slot_assignement; | ||
BaseFloat *d_all_context_frames; | ||
BaseFloat *d_batch_with_context; | ||
BaseFloat *d_batch_ivectors; | ||
int d_batch_ivectors_stride; | ||
int batch_size; | ||
int d_features_frame_stride; | ||
int d_ivectors_frame_stride; | ||
int d_all_context_frames_frame_stride; | ||
int d_batch_with_context_frame_stride; | ||
int d_all_context_frames_channel_stride; | ||
int d_batch_with_context_batch_stride; | ||
int input_dim; | ||
int ivector_dim; | ||
int total_nnet_context; | ||
int total_nnet_left_context; | ||
int total_nnet_right_context; | ||
int input_frames_per_chunk_with_context; | ||
}; | ||
|
||
// Takes as a input strided new chunks ptrs [chk0, chk1, chk2..] | ||
// associated to channels [ch0, ch1, ch2...] | ||
// And build a continuous batch such as: | ||
// Batch with context: | ||
// row0: [left_context(ch0), chk0] | ||
// row0: [left_context(ch1), chk1] | ||
// row0: [left_context(ch2), chk2] | ||
// With left context being either part of a previous chunk for that channel, or | ||
// just duplications of frame0 if this is the first chunk for that channel The | ||
// end of each chunk for each row will then be used as a right context | ||
void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block, | ||
const cudaStream_t &stream, | ||
const BatchedStaticNnet3KernelParams ¶ms); | ||
|
||
// Same thing than BuildBatchWithContextKernelContextFlush, except that the | ||
// final frame is replicated to create the right context | ||
void BuildBatchWithContextKernelContextFlush( | ||
const dim3 &grid, const dim3 &block, const cudaStream_t &stream, | ||
const BatchedStaticNnet3KernelParams ¶ms); | ||
void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block, | ||
const cudaStream_t &stream, | ||
const BatchedStaticNnet3KernelParams ¶ms); | ||
|
||
} // namespace cuda_decoder | ||
} // namespace kaldi | ||
|
||
#endif // KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_ | ||
#endif // HAVE_CUDA |
Oops, something went wrong.
0bca93e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why are some of the files
.cu
? The Windows build is expecting to find.cc
files:https://github.com/nalbion/vosk-api/runs/677387581?check_suite_focus=true#step:10:53
0bca93e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CUDA code. I don't think we've ever tested building with CUDA on Windows.
0bca93e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
0bca93e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi, all. I noticed that in the file "src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc" which mentioned "IMPORTANT: This pipeline is deprecated. Please switch to cudadecoderbin/batch-wav-nnet3-cuda2 (binary) or cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h (class)". Is any problem in "src/cudadecoder/batched-threaded-nnet3-cuda-pipeline.cc"? We are using this version now, I want to know if it is necessary to switch to the latest version, thanks.