Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[src] Partial hypothesis for cuda decoder #4101

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
const std::vector<CorrelationID> &corr_ids,
const std::vector<SubVector<BaseFloat>> &wave_samples,
const std::vector<bool> &is_first_chunk,
const std::vector<bool> &is_last_chunk) {
const std::vector<bool> &is_last_chunk,
std::vector<std::string *> *partial_hypotheses) {
nvtxRangePushA("DecodeBatch");
KALDI_ASSERT(corr_ids.size() > 0);
KALDI_ASSERT(corr_ids.size() == wave_samples.size());
Expand All @@ -242,9 +243,25 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
}
}
int features_frame_stride = d_all_features_.Stride();
if (partial_hypotheses) {
// We're going to have to generate the partial hypotheses
KALDI_ASSERT(
word_syms_ &&
"You need to set --word-symbol-table to use partial hypotheses");
hugovbraun marked this conversation as resolved.
Show resolved Hide resolved
cuda_decoder_->AllowPartialHypotheses();
}
DecodeBatch(corr_ids, d_features_ptrs_, features_frame_stride,
n_input_frames_valid_, d_ivectors_ptrs_, is_first_chunk,
is_last_chunk, &channels_);

if (partial_hypotheses) {
partial_hypotheses->resize(channels_.size());
for (size_t i = 0; i < channels_.size(); ++i) {
PartialHypothesis *partial_hypothesis;
cuda_decoder_->GetPartialHypothesis(channels_[i], &partial_hypothesis);
(*partial_hypotheses)[i] = &partial_hypothesis->out_str;
}
}
}

void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
Expand Down Expand Up @@ -433,7 +450,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecoding(
}

if (dlat.NumStates() > 0) {
if (word_syms_) {
// Used for debugging
if (false && word_syms_) {
CompactLattice best_path_clat;
CompactLatticeShortestPath(dlat, &best_path_clat);

Expand Down
16 changes: 13 additions & 3 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,16 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
// If it contains some last chunks for given utterances, it will call
// FinalizeDecoding (building the final lattice, determinize it, etc.)
// asynchronously. The callback for that utterance will then be called
//
// If partial_hypotheses is not null, generate and set the current partial
// hypotheses in partial_hypotheses. The pointers in partial_hypotheses are
// only valid until the next DecodeBatch call - perform a deep copy if
// necessary.
void DecodeBatch(const std::vector<CorrelationID> &corr_ids,
const std::vector<SubVector<BaseFloat>> &wave_samples,
const std::vector<bool> &is_first_chunk,
const std::vector<bool> &is_last_chunk);
const std::vector<bool> &is_last_chunk,
std::vector<std::string *> *partial_hypotheses = NULL);
hugovbraun marked this conversation as resolved.
Show resolved Hide resolved

// Version providing directly the features. Only runs nnet3 & decoder
// Used when we want to provide the final ivectors (offline case)
Expand Down Expand Up @@ -207,8 +213,12 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
// Maximum number of seconds per chunk
BaseFloat GetSecondsPerChunk() { return seconds_per_chunk_; }

// Used when debugging. Used to Print the text when a decoding is done
void SetSymbolTable(fst::SymbolTable *word_syms) { word_syms_ = word_syms; }
// Used for partial hypotheses
void SetSymbolTable(fst::SymbolTable *word_syms) {
hugovbraun marked this conversation as resolved.
Show resolved Hide resolved
word_syms_ = word_syms;
KALDI_ASSERT(cuda_decoder_);
cuda_decoder_->SetSymbolTable(word_syms);
}

// Wait for all lattice callbacks to complete
// Can be called after DecodeBatch
Expand Down
100 changes: 60 additions & 40 deletions src/cudadecoder/cuda-decoder-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,46 @@ class DeviceChannelMatrix : public DeviceMatrix<T> {
}
};

// InfoToken contains data that needs to be saved for the backtrack
// in GetBestPath/GetRawLattice.
// We don't need the token.cost or token.next_state.
struct __align__(8) InfoToken {
  int32 prev_token;  // predecessor token index, OR an offset (see below)
  int32 arc_idx;     // FST arc index taken, OR -count of same-state tokens (see below)

  // Returns true iff this token is the only token created for its
  // (FST state, frame) pair, in which case prev_token/arc_idx hold
  // their literal meanings and can be used directly.
  bool IsUniqueTokenForStateAndFrame() {
    // This is a trick used to save space and PCI-E bandwidth (cf
    // preprocess_in_place kernel)
    // This token is associated with a next_state s, created during the
    // processing of frame f.
    // If we have multiple tokens associated with the state s in the frame f,
    // arc_idx < 0 and -arc_idx is the
    // count of such tokens. We will then have to look at another list to read
    // the actual arc_idx and prev_token values
    // If the current token is the only one, prev_token and arc_idx are valid
    // and can be used directly
    return (arc_idx >= 0);
  }

  // Called if this token is linked to other tokens in the same frame (cf
  // comments for IsUniqueTokenForStateAndFrame)
  // return the {offset,size} pair necessary to list those tokens in the
  // extra_prev_tokens list
  // They are stored at offset "offset", and we have "size" of those
  std::pair<int32, int32> GetSameFSTStateTokensList() {
    // Precondition: the "packed count" encoding is in use (arc_idx < 0),
    // otherwise prev_token is a real token index, not an offset.
    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());

    return {prev_token, -arc_idx};
  }
};

// Device function, used to set in an InfoToken the [offset,size] pair
// read back by InfoToken.GetSameFSTStateTokensList, i.e. it marks that
// token as one of several sharing the same FST state in the same frame.
__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
                                                     InfoToken *info_token) {
  // We always have size > 0, so -size < 0 flags the "non-unique token"
  // encoding (cf InfoToken::IsUniqueTokenForStateAndFrame)
  *info_token = {offset, -size};
}

// LaneCounters/ChannelCounters
// The counters are all the singular values associated to a lane/channel
// For instance the main queue size. Or the min_cost of all tokens in that
Expand Down Expand Up @@ -431,6 +471,7 @@ struct LaneCounters {
int32 n_within_lattice_beam;
int32 has_reached_final; // if there's at least one final token in the queue
int32 prev_arg_min_int_cost;
InfoToken prev_arg_min_int_cost_token;
};

// Channel counters
Expand Down Expand Up @@ -471,46 +512,6 @@ class CudaDecoderException : public std::exception {
const bool recoverable;
};

// InfoToken contains data that needs to be saved for the backtrack
// in GetBestPath/GetRawLattice.
// We don't need the token.cost or token.next_state.
struct __align__(8) InfoToken {
  int32 prev_token;  // predecessor token index, OR an offset (see below)
  int32 arc_idx;     // FST arc index taken, OR -count of same-state tokens (see below)

  // Returns true iff this token is the only token created for its
  // (FST state, frame) pair, in which case prev_token/arc_idx hold
  // their literal meanings and can be used directly.
  bool IsUniqueTokenForStateAndFrame() {
    // This is a trick used to save space and PCI-E bandwidth (cf
    // preprocess_in_place kernel)
    // This token is associated with a next_state s, created during the
    // processing of frame f.
    // If we have multiple tokens associated with the state s in the frame f,
    // arc_idx < 0 and -arc_idx is the
    // count of such tokens. We will then have to look at another list to read
    // the actual arc_idx and prev_token values
    // If the current token is the only one, prev_token and arc_idx are valid
    // and can be used directly
    return (arc_idx >= 0);
  }

  // Called if this token is linked to other tokens in the same frame (cf
  // comments for IsUniqueTokenForStateAndFrame)
  // return the {offset,size} pair necessary to list those tokens in the
  // extra_prev_tokens list
  // They are stored at offset "offset", and we have "size" of those
  std::pair<int32, int32> GetSameFSTStateTokensList() {
    // Precondition: the "packed count" encoding is in use (arc_idx < 0),
    // otherwise prev_token is a real token index, not an offset.
    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());

    return {prev_token, -arc_idx};
  }
};

// Device function, used to set in an InfoToken the [offset,size] pair
// read back by InfoToken.GetSameFSTStateTokensList, i.e. it marks that
// token as one of several sharing the same FST state in the same frame.
__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
                                                     InfoToken *info_token) {
  // We always have size > 0, so -size < 0 flags the "non-unique token"
  // encoding (cf InfoToken::IsUniqueTokenForStateAndFrame)
  *info_token = {offset, -size};
}

// Used to store the index in the GPU hashmap of that FST state
// The hashmap is only generated with the final main queue (post max_active_) of
// each frame
Expand Down Expand Up @@ -558,6 +559,25 @@ enum OVERFLOW_TYPE {

enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 };

// Used internally to generate partial paths.
// One element of a backtracked partial path: which token was traversed
// and which FST arc it took.
struct PartialPathArc {
  int32 token_idx;  // token index — presumably into the per-frame token list; confirm against decoder usage
  int32 arc_idx;    // index of the FST arc taken by that token
};

// Partial hypothesis formatted and meant to be used by user.
// NOTE(review): out_str appears to be the text form of olabel (the pipeline
// requires a word symbol table for partial hypotheses) — confirm in decoder.
struct PartialHypothesis {
  std::vector<int> arc_idx;  // FST arc indices along the current best partial path
  std::vector<int> olabel;   // output labels along that path
  std::string out_str;       // human-readable hypothesis string exposed to the user

  // Resets the hypothesis to empty. Container capacity is retained,
  // so repeated reuse avoids reallocation.
  void clear() {
    arc_idx.clear();
    olabel.clear();
    out_str.clear();
  }
};

} // end namespace cuda_decoder
} // end namespace kaldi

Expand Down
3 changes: 2 additions & 1 deletion src/cudadecoder/cuda-decoder-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1401,7 +1401,7 @@ __global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params,
const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
int32 min_int_cost = lane_counters->min_int_cost;
CostType min_cost = orderedIntToFloat(min_int_cost);
const int32 global_offset = channel_counters->prev_main_q_global_offset;
const int32 global_offset = lane_counters->main_q_global_offset;
KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
// Position of considered token in the main_q
if (main_q_idx < main_q_end) {
Expand All @@ -1415,6 +1415,7 @@ __global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params,
channel_counters->min_int_cost_and_arg_without_final = {
token_int_cost, global_offset + main_q_idx};
lane_counters->prev_arg_min_int_cost = main_q_idx;
lane_counters->prev_arg_min_int_cost_token = cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx];
} else {
// remove offset = min_cost
CostType token_cost = orderedIntToFloat(token_int_cost) - min_cost;
Expand Down
Loading