
Unified pipelines
hugovbraun committed Feb 14, 2020
1 parent 4fdccff commit 19241b8
Showing 23 changed files with 3,586 additions and 160 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@
!/src/doc/*
!/src/*/Makefile
!/src/*/README
+!/src/cudadecoder/deprecated

# Compiled Object files and python files
*.slo
8 changes: 6 additions & 2 deletions src/cudadecoder/Makefile
@@ -13,8 +13,12 @@ endif

TESTFILES =

-OBJFILES = batched-threaded-nnet3-cuda-pipeline.o decodable-cumatrix.o \
-           cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o
+OBJFILES = cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o \
+           batched-threaded-nnet3-cuda-online-pipeline.o \
+           batched-threaded-nnet3-cuda-pipeline.o \
+           batched-threaded-nnet3-cuda-pipeline2.o \
+           batched-static-nnet3.o batched-static-nnet3-kernels.o \
+           deprecated/decodable-cumatrix.o

LIBNAME = kaldi-cudadecoder

168 changes: 168 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.cu
@@ -0,0 +1,168 @@
// cudadecoder/batched-static-nnet3-kernels.cu
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "cudadecoder/batched-static-nnet3-kernels.h"

#include <stdio.h>
namespace kaldi {
namespace cuda_decoder {

__global__ void build_batch_with_context_kernel(
    BatchedStaticNnet3KernelParams params) {
  for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
       batch_slot += gridDim.z) {
    BatchSlotAssignment batch_assign =
        params.d_batch_slot_assignement[batch_slot];
    const BaseFloat *d_batch_slot_features = batch_assign.d_features;
    BaseFloat *d_channel_context =
        &params.d_all_context_frames[batch_assign.ichannel *
                                     params.d_all_context_frames_channel_stride];
    BaseFloat *d_batch_slot_with_context =
        &params.d_batch_with_context[params.d_batch_with_context_batch_stride *
                                     batch_slot];

    int n_frames_available =
        batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
    int n_frames_to_set = n_frames_available;
    int n_left_context_frames_from_frame0 = 0;
    if (batch_assign.n_frames_already_in_context == 0) {
      // First chunk for that utterance. Generate the left context by
      // duplicating frame0.
      n_frames_to_set += params.total_nnet_left_context;
      n_left_context_frames_from_frame0 = params.total_nnet_left_context;
    }

    for (int iframe = blockIdx.y; iframe < n_frames_to_set;
         iframe += gridDim.y) {
      for (int idim = threadIdx.x; idim < params.input_dim;
           idim += blockDim.x) {
        if (iframe < n_left_context_frames_from_frame0) {
          // Left context built out of copies of frame0
          d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride +
                                    idim] = d_batch_slot_features[idim];
        } else if (iframe < (n_left_context_frames_from_frame0 +
                             batch_assign.n_frames_already_in_context)) {
          // Frames coming from the saved context
          int src_iframe_in_saved_context =
              iframe - n_left_context_frames_from_frame0;
          d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride +
                                    idim] =
              d_channel_context[src_iframe_in_saved_context *
                                    params.d_all_context_frames_frame_stride +
                                idim];
        } else {
          // Frames coming from the new chunk
          int src_iframe_in_new_chunk =
              iframe - n_left_context_frames_from_frame0 -
              batch_assign.n_frames_already_in_context;
          d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride +
                                    idim] =
              d_batch_slot_features[src_iframe_in_new_chunk *
                                        params.d_features_frame_stride +
                                    idim];
        }
      }

      if (iframe == 0) {  // one CTA moves the ivectors
        for (int idim = threadIdx.x; idim < params.ivector_dim;
             idim += blockDim.x) {
          params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride +
                                  idim] = batch_assign.d_ivectors[idim];
        }
      }
    }
  }
}

void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
build_batch_with_context_kernel<<<grid, block, 0, stream>>>(params);
}

__global__ void build_batch_with_context_context_flush_kernel(
    BatchedStaticNnet3KernelParams params) {
  for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
       batch_slot += gridDim.z) {
    BatchSlotAssignment batch_assign =
        params.d_batch_slot_assignement[batch_slot];
    BaseFloat *d_channel_context =
        &params.d_all_context_frames[batch_assign.ichannel *
                                     params.d_all_context_frames_channel_stride];
    BaseFloat *d_batch_slot_with_context =
        &params.d_batch_with_context[params.d_batch_with_context_batch_stride *
                                     batch_slot];

    int n_frames_in_context = batch_assign.n_frames_already_in_context;
    int n_frames_to_set =
        n_frames_in_context + params.total_nnet_right_context;

    for (int iframe = blockIdx.y; iframe < n_frames_to_set;
         iframe += gridDim.y) {
      for (int idim = threadIdx.x; idim < params.input_dim;
           idim += blockDim.x) {
        if (iframe < n_frames_in_context) {
          // Frames coming from the saved context
          d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride +
                                    idim] =
              d_channel_context[iframe *
                                    params.d_all_context_frames_frame_stride +
                                idim];
        } else {
          // Generating the right context from the last saved frame
          int src_iframe_in_saved_context = n_frames_in_context - 1;
          d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride +
                                    idim] =
              d_channel_context[src_iframe_in_saved_context *
                                    params.d_all_context_frames_frame_stride +
                                idim];
        }
      }

      if (iframe == 0) {  // one CTA moves the ivectors
        for (int idim = threadIdx.x; idim < params.ivector_dim;
             idim += blockDim.x) {
          params.d_batch_ivectors[batch_slot * params.d_batch_ivectors_stride +
                                  idim] = batch_assign.d_ivectors[idim];
        }
      }
    }
  }
}

void BuildBatchWithContextKernelContextFlush(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
build_batch_with_context_context_flush_kernel<<<grid, block, 0, stream>>>(params);
}

__global__ void save_context_from_batch_kernel(
    BatchedStaticNnet3KernelParams params) {
  for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
       batch_slot += gridDim.z) {
    BatchSlotAssignment batch_assign =
        params.d_batch_slot_assignement[batch_slot];

    // Real frames: does not include the frame0 copies used as left context
    int n_real_frames_available =
        batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
    // Total frames: includes the frame0 copies
    int total_frames_in_batch_slot = n_real_frames_available;
    if (batch_assign.n_frames_already_in_context == 0) {
      // First chunk for that utterance. We generated the left context by
      // duplicating frame0.
      total_frames_in_batch_slot += params.total_nnet_left_context;
    }
    // Save the last total_nnet_context frames (or fewer if not available)
    int n_to_copy = min(total_frames_in_batch_slot, params.total_nnet_context);
    int copy_from_frame = total_frames_in_batch_slot - n_to_copy;
    BaseFloat *d_batch_slot_with_context =
        &params.d_batch_with_context[params.d_batch_with_context_batch_stride *
                                     batch_slot];
    BaseFloat *d_channel_context =
        &params.d_all_context_frames[batch_assign.ichannel *
                                     params.d_all_context_frames_channel_stride];

    for (int dst_iframe = blockIdx.y; dst_iframe < n_to_copy;
         dst_iframe += gridDim.y) {
      int src_iframe = copy_from_frame + dst_iframe;
      for (int idim = threadIdx.x; idim < params.input_dim;
           idim += blockDim.x) {
        d_channel_context[dst_iframe * params.d_all_context_frames_frame_stride +
                          idim] =
            d_batch_slot_with_context[src_iframe *
                                          params.d_batch_with_context_frame_stride +
                                      idim];
      }
    }
  }
}

void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params) {
save_context_from_batch_kernel<<<grid, block, 0, stream>>>(params);
}

}  // namespace cuda_decoder
}  // namespace kaldi
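
For orientation, here is a hedged host-side sketch of how these three kernels fit together for one chunk. This is an editor's illustration, not code from this commit: the driver name ProcessChunkSketch, the last_chunk_of_utterances flag, and the grid/block sizes are assumptions, and the real driver presumably lives in batched-static-nnet3.cc (not shown on this page).

// Editor's sketch: one decoding step for a batch of chunks. Assumes params
// and the device-side BatchSlotAssignment array were filled beforehand.
void ProcessChunkSketch(
    const kaldi::cuda_decoder::BatchedStaticNnet3KernelParams &params,
    cudaStream_t stream, bool last_chunk_of_utterances) {
  // threadIdx.x spans input_dim, blockIdx.y spans frames, blockIdx.z spans
  // batch slots, matching the grid-stride loops in the kernels above.
  dim3 block(64, 1, 1);
  dim3 grid(1, params.input_frames_per_chunk_with_context, params.batch_size);
  if (!last_chunk_of_utterances) {
    // Assemble [left context, new chunk] rows, then save the tail of each
    // row as saved context for the next chunk.
    kaldi::cuda_decoder::BuildBatchWithContextKernel(grid, block, stream,
                                                     params);
    // ... run the nnet3 forward pass on params.d_batch_with_context ...
    kaldi::cuda_decoder::SaveContextFromBatchKernel(grid, block, stream,
                                                    params);
  } else {
    // No new frames left: flush by replicating the last saved frame to
    // build the right context.
    kaldi::cuda_decoder::BuildBatchWithContextKernelContextFlush(grid, block,
                                                                 stream,
                                                                 params);
    // ... final nnet3 forward pass ...
  }
}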
87 changes: 87 additions & 0 deletions src/cudadecoder/batched-static-nnet3-kernels.h
@@ -0,0 +1,87 @@
// cudadecoder/batched-static-nnet3-kernels.h
//
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Hugo Braun
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if HAVE_CUDA == 1

#include <cuda_runtime_api.h>
#include "base/kaldi-types.h"

#ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
#define KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_

namespace kaldi {
namespace cuda_decoder {

// Describes what each batch slot is made of. Used by the context-switch
// kernels.
struct BatchSlotAssignment {
BaseFloat *d_features;
BaseFloat *d_ivectors;
int ichannel;
int n_frames_already_in_context;
int n_new_frames;
};

struct BatchedStaticNnet3KernelParams {
const BaseFloat *d_all_new_features;
const BatchSlotAssignment *d_batch_slot_assignement;
BaseFloat *d_all_context_frames;
BaseFloat *d_batch_with_context;
BaseFloat *d_batch_ivectors;
int d_batch_ivectors_stride;
int batch_size;
int d_features_frame_stride;
int d_ivectors_frame_stride;
int d_all_context_frames_frame_stride;
int d_batch_with_context_frame_stride;
int d_all_context_frames_channel_stride;
int d_batch_with_context_batch_stride;
int input_dim;
int ivector_dim;
int total_nnet_context;
int total_nnet_left_context;
int total_nnet_right_context;
int input_frames_per_chunk_with_context;
};
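
// Note (editor's illustration, derived from the kernels in
// batched-static-nnet3-kernels.cu, not part of the original header): each
// matrix above is row-major with an explicit stride. Element
// (slot, frame, dim) of the batch with context lives at
//   d_batch_with_context[slot * d_batch_with_context_batch_stride +
//                        frame * d_batch_with_context_frame_stride + dim]
// and channel ch's saved context frames start at
//   d_all_context_frames[ch * d_all_context_frames_channel_stride].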

// Takes as input strided pointers to the new chunks [chk0, chk1, chk2, ...]
// associated with channels [ch0, ch1, ch2, ...] and builds a contiguous
// batch such as:
// Batch with context:
// row0: [left_context(ch0), chk0]
// row1: [left_context(ch1), chk1]
// row2: [left_context(ch2), chk2]
// The left context is either the end of a previous chunk for that channel,
// or just duplications of frame0 if this is the first chunk for that
// channel. The end of each chunk for each row will then be used as a right
// context.
void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
const cudaStream_t &stream,
const BatchedStaticNnet3KernelParams &params);

// Same as BuildBatchWithContextKernel, except that no new chunk is consumed:
// the last saved frame is replicated to create the right context. Used to
// flush the remaining context at the end of an utterance.
void BuildBatchWithContextKernelContextFlush(
    const dim3 &grid, const dim3 &block, const cudaStream_t &stream,
    const BatchedStaticNnet3KernelParams &params);
// Saves the last total_nnet_context frames of each batch slot back into that
// channel's context buffer, to be used as left context for the next chunk.
void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
                                const cudaStream_t &stream,
                                const BatchedStaticNnet3KernelParams &params);

} // namespace cuda_decoder
} // namespace kaldi

#endif // KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
#endif // HAVE_CUDA
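
To make the stride fields concrete, here is a companion sketch for sizing the device buffers that BatchedStaticNnet3KernelParams describes. Again an editor's illustration under stated assumptions: a packed layout where each stride equals the row width (the real pipeline may pad), a hypothetical helper name AllocateBatchBuffersSketch and nchannels parameter, and compilation with -DHAVE_CUDA=1 so the header contents are visible. The commit's own allocation code is not shown on this page.

// Editor's sketch (assumptions, not from the commit): sizing the buffers
// whose strides the kernels above index with.
#include <cuda_runtime.h>
#include "cudadecoder/batched-static-nnet3-kernels.h"

using kaldi::BaseFloat;
using kaldi::cuda_decoder::BatchedStaticNnet3KernelParams;

void AllocateBatchBuffersSketch(BatchedStaticNnet3KernelParams *params,
                                int nchannels) {
  // One row of saved context per channel: total_nnet_context frames,
  // input_dim wide. Packed layout assumed.
  params->d_all_context_frames_frame_stride = params->input_dim;
  params->d_all_context_frames_channel_stride =
      params->total_nnet_context * params->input_dim;
  cudaMalloc(&params->d_all_context_frames,
             sizeof(BaseFloat) * nchannels *
                 params->d_all_context_frames_channel_stride);

  // One row per batch slot: chunk frames plus context, input_dim wide.
  params->d_batch_with_context_frame_stride = params->input_dim;
  params->d_batch_with_context_batch_stride =
      params->input_frames_per_chunk_with_context * params->input_dim;
  cudaMalloc(&params->d_batch_with_context,
             sizeof(BaseFloat) * params->batch_size *
                 params->d_batch_with_context_batch_stride);

  // One ivector per batch slot.
  params->d_batch_ivectors_stride = params->ivector_dim;
  cudaMalloc(&params->d_batch_ivectors,
             sizeof(BaseFloat) * params->batch_size * params->ivector_dim);
}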