Unified pipelines

kaldi-asr · Feb 14, 2020 · 19241b8 · 19241b8
1 parent 4fdccff
commit 19241b8
Show file tree

Hide file tree

Showing 23 changed files with 3,586 additions and 160 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@
 !/src/doc/*
 !/src/*/Makefile
 !/src/*/README
+!/src/cudadecoder/deprecated
 
 # Compiled Object files and python ciles
 *.slo

diff --git a/src/cudadecoder/Makefile b/src/cudadecoder/Makefile
@@ -13,8 +13,12 @@ endif
 
 TESTFILES =
 
-OBJFILES = batched-threaded-nnet3-cuda-pipeline.o decodable-cumatrix.o \
-           cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o
+OBJFILES = cuda-decoder.o cuda-decoder-kernels.o cuda-fst.o \
+	   batched-threaded-nnet3-cuda-online-pipeline.o \
+	   batched-threaded-nnet3-cuda-pipeline.o \
+	   batched-threaded-nnet3-cuda-pipeline2.o \
+	   batched-static-nnet3.o batched-static-nnet3-kernels.o \
+	   deprecated/decodable-cumatrix.o
 
 LIBNAME = kaldi-cudadecoder
 

diff --git a/src/cudadecoder/batched-static-nnet3-kernels.cu b/src/cudadecoder/batched-static-nnet3-kernels.cu
@@ -0,0 +1,168 @@
+// cudadecoder/batched-static-nnet3-kernels.cu
+//
+// Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cudadecoder/batched-static-nnet3-kernels.h"
+
+#include <stdio.h>
+namespace kaldi {
+namespace cuda_decoder {
+
+	__global__ void build_batch_with_context_kernel(
+			BatchedStaticNnet3KernelParams params) {
+		for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
+				batch_slot += gridDim.z) {
+
+			BatchSlotAssignment batch_assign =
+				params.d_batch_slot_assignement[batch_slot];
+			const BaseFloat *d_batch_slot_features = batch_assign.d_features;
+			BaseFloat *d_channel_context = &params.d_all_context_frames[batch_assign.ichannel *
+				params.d_all_context_frames_channel_stride];
+			BaseFloat *d_batch_slot_with_context = &params.d_batch_with_context[params.d_batch_with_context_batch_stride * batch_slot];
+
+
+			int n_frames_available =
+				batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
+			int n_frames_to_set = n_frames_available; 
+			int n_left_context_frames_from_frame0 = 0; 
+			if(batch_assign.n_frames_already_in_context == 0) {
+				// First chunk for that utterance. Generating left context by duplicating frame0
+				n_frames_to_set += params.total_nnet_left_context; 
+				n_left_context_frames_from_frame0 = params.total_nnet_left_context;
+			}	
+
+			for (int iframe = blockIdx.y; iframe < n_frames_to_set;
+					iframe += gridDim.y) {
+				for (int idim = threadIdx.x; idim < params.input_dim; idim += blockDim.x) {
+					if(iframe < n_left_context_frames_from_frame0) {
+						d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride+ idim] = d_batch_slot_features[0 + idim]; // frame 0
+					}
+					else if (iframe < (n_left_context_frames_from_frame0+batch_assign.n_frames_already_in_context)) {
+						// Those are the frames coming from context
+						int src_iframe_in_saved_context = iframe-n_left_context_frames_from_frame0;
+						d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride+ idim] = d_channel_context[src_iframe_in_saved_context*params.d_all_context_frames_frame_stride+ idim];
+					} else {
+						// Now we are moving the frames coming from the new chunk
+						int src_iframe_in_new_chunk = iframe-n_left_context_frames_from_frame0-batch_assign.n_frames_already_in_context;
+						d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride+ idim] = d_batch_slot_features[src_iframe_in_new_chunk*params.d_features_frame_stride + idim];
+					}
+				}
+
+				if(iframe == 0) { // one CTA moves the ivectors
+					for (int idim = threadIdx.x; idim < params.ivector_dim; idim += blockDim.x) {
+						//printf("%i ivector[%i] = %f \n", batch_slot, idim, batch_assign.d_ivectors[idim]);
+						params.d_batch_ivectors[batch_slot*params.d_batch_ivectors_stride +idim] = batch_assign.d_ivectors[idim];
+					}
+				}
+			}
+		} 
+	}
+
+void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
+                                 const cudaStream_t &stream,
+                                 const BatchedStaticNnet3KernelParams &params) {
+  build_batch_with_context_kernel<<<grid, block, 0, stream>>>(params);
+}
+
+	__global__ void build_batch_with_context_context_flush_kernel(
+			BatchedStaticNnet3KernelParams params) {
+		for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
+				batch_slot += gridDim.z) {
+
+			BatchSlotAssignment batch_assign =
+				params.d_batch_slot_assignement[batch_slot];
+			BaseFloat *d_channel_context = &params.d_all_context_frames[batch_assign.ichannel *
+				params.d_all_context_frames_channel_stride];
+			BaseFloat *d_batch_slot_with_context = &params.d_batch_with_context[params.d_batch_with_context_batch_stride * batch_slot];
+
+
+			int n_frames_in_context =
+				batch_assign.n_frames_already_in_context;
+			int n_frames_to_set = n_frames_in_context + params.total_nnet_right_context;  
+//			printf("will set %i frames, %i in context \n", n_frames_to_set, n_frames_in_context);
+
+			for (int iframe = blockIdx.y; iframe < n_frames_to_set;
+					iframe += gridDim.y) {
+				for (int idim = threadIdx.x; idim < params.input_dim; idim += blockDim.x) {
+					if(iframe < n_frames_in_context) {
+float f=						d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride+ idim] = d_channel_context[iframe*params.d_all_context_frames_frame_stride + idim];
+//						if(batch_slot==0) printf("A f%02dd%02d=%f\n", iframe, idim, f);
+					}
+					else if (iframe < n_frames_to_set) {
+						// Generating right context from last frame
+						int src_iframe_in_saved_context = n_frames_in_context-1;
+float f =						d_batch_slot_with_context[iframe * params.d_batch_with_context_frame_stride+ idim] = d_channel_context[src_iframe_in_saved_context*params.d_all_context_frames_frame_stride+ idim];
+//						if(batch_slot==0) printf("B f%02dd%02d=%f\n", iframe, idim, f);
+					} 
+				}
+
+				if(iframe == 0) { // one CTA moves the ivectors
+					for (int idim = threadIdx.x; idim < params.ivector_dim; idim += blockDim.x) {
+						params.d_batch_ivectors[batch_slot*params.d_batch_ivectors_stride +idim] = batch_assign.d_ivectors[idim];
+					}
+				}
+			}
+		} 
+	}
+
+void BuildBatchWithContextKernelContextFlush(const dim3 &grid, const dim3 &block,
+                                 const cudaStream_t &stream,
+                                 const BatchedStaticNnet3KernelParams &params) {
+  build_batch_with_context_context_flush_kernel<<<grid, block, 0, stream>>>(params);
+}
+
+__global__ void save_context_from_batch_kernel(BatchedStaticNnet3KernelParams params) {
+	for (int batch_slot = blockIdx.z; batch_slot < params.batch_size;
+			batch_slot += gridDim.z) {
+		BatchSlotAssignment batch_assign =
+			params.d_batch_slot_assignement[batch_slot];
+
+
+		// Real frames : does not include frame0 copies for left context
+		int n_real_frames_available =
+			batch_assign.n_frames_already_in_context + batch_assign.n_new_frames;
+		// total frames : includes frame0 copies
+		int total_frames_in_batch_slot = n_real_frames_available; 
+		if(batch_assign.n_frames_already_in_context == 0) {
+			// First chunk for that utterance. We generated left context by duplicating frame0
+			total_frames_in_batch_slot += params.total_nnet_left_context; 
+		}
+		// total frames : includes frame0 copies
+		int n_to_copy = min(total_frames_in_batch_slot, params.total_nnet_context);
+		int copy_from_frame = total_frames_in_batch_slot - n_to_copy;
+		BaseFloat *d_batch_slot_with_context = &params.d_batch_with_context[params.d_batch_with_context_batch_stride * batch_slot];
+		BaseFloat *d_channel_context = &params.d_all_context_frames[batch_assign.ichannel *
+			params.d_all_context_frames_channel_stride];
+
+		for (int dst_iframe= blockIdx.y;
+				dst_iframe < n_to_copy; dst_iframe += gridDim.y) {
+			int src_iframe = copy_from_frame + dst_iframe;
+			for (int idim = threadIdx.x; idim < params.input_dim; idim += blockDim.x)  {
+				d_channel_context[dst_iframe*params.d_all_context_frames_frame_stride+ idim] = d_batch_slot_with_context[src_iframe * params.d_batch_with_context_frame_stride + idim];
+			}
+
+		}
+	}
+}
+
+void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &stream,
+                                const BatchedStaticNnet3KernelParams &params) {
+  save_context_from_batch_kernel<<<grid, block, 0, stream>>>(params);
+}
+
+}  // cuda_decoder
+}  // kaldi
diff --git a/src/cudadecoder/batched-static-nnet3-kernels.h b/src/cudadecoder/batched-static-nnet3-kernels.h
@@ -0,0 +1,87 @@
+// cudadecoder/batched-static-nnet3-kernels.h
+//
+// Copyright (c) 2019; NVIDIA CORPORATION.  All rights reserved.
+// Hugo Braun
+//
+// Licensed under the Apache License; Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing; software
+// distributed under the License is distributed on an "AS IS" BASIS;
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND; either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if HAVE_CUDA == 1
+
+#include <cuda_runtime_api.h>
+#include "base/kaldi-types.h"
+
+#ifndef KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
+#define KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
+
+namespace kaldi {
+namespace cuda_decoder {
+
+// Describe what each batch slot is made of. Used by the context switch kernels
+struct BatchSlotAssignment {
+  BaseFloat *d_features;
+  BaseFloat *d_ivectors;
+  int ichannel;
+  int n_frames_already_in_context;
+  int n_new_frames;
+};
+
+struct BatchedStaticNnet3KernelParams {
+  const BaseFloat *d_all_new_features;
+  const BatchSlotAssignment *d_batch_slot_assignement;
+  BaseFloat *d_all_context_frames;
+  BaseFloat *d_batch_with_context;
+  BaseFloat *d_batch_ivectors;
+  int d_batch_ivectors_stride;
+  int batch_size;
+  int d_features_frame_stride;
+  int d_ivectors_frame_stride;
+  int d_all_context_frames_frame_stride;
+  int d_batch_with_context_frame_stride;
+  int d_all_context_frames_channel_stride;
+  int d_batch_with_context_batch_stride;
+  int input_dim;
+  int ivector_dim;
+  int total_nnet_context;
+  int total_nnet_left_context;
+  int total_nnet_right_context;
+  int input_frames_per_chunk_with_context;
+};
+
+// Takes as a input strided new chunks ptrs [chk0, chk1, chk2..]
+// associated to channels [ch0, ch1, ch2...]
+// And build a continuous batch such as:
+// Batch with context:
+// row0: [left_context(ch0), chk0]
+// row0: [left_context(ch1), chk1]
+// row0: [left_context(ch2), chk2]
+// With left context being either part of a previous chunk for that channel, or
+// just duplications of frame0 if this is the first chunk for that channel The
+// end of each chunk for each row will then be used as a right context
+void BuildBatchWithContextKernel(const dim3 &grid, const dim3 &block,
+                                 const cudaStream_t &stream,
+                                 const BatchedStaticNnet3KernelParams &params);
+
+// Same thing than BuildBatchWithContextKernelContextFlush, except that the
+// final frame is replicated to create the right context
+void BuildBatchWithContextKernelContextFlush(
+    const dim3 &grid, const dim3 &block, const cudaStream_t &stream,
+    const BatchedStaticNnet3KernelParams &params);
+void SaveContextFromBatchKernel(const dim3 &grid, const dim3 &block,
+                                const cudaStream_t &stream,
+                                const BatchedStaticNnet3KernelParams &params);
+
+}  // namespace cuda_decoder
+}  // namespace kaldi
+
+#endif  // KALDI_CUDA_DECODER_BATCHED_STATIC_NNET3_KERNELS_H_
+#endif  // HAVE_CUDA