diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
index 6fe87ee3dc7..b55dd8a2ec2 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
@@ -217,7 +217,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
     const std::vector<CorrelationID> &corr_ids,
     const std::vector<SubVector<BaseFloat>> &wave_samples,
     const std::vector<bool> &is_first_chunk,
-    const std::vector<bool> &is_last_chunk) {
+    const std::vector<bool> &is_last_chunk,
+    std::vector<const std::string *> *partial_hypotheses) {
   nvtxRangePushA("DecodeBatch");
   KALDI_ASSERT(corr_ids.size() > 0);
   KALDI_ASSERT(corr_ids.size() == wave_samples.size());
@@ -242,9 +243,26 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
     }
   }
   int features_frame_stride = d_all_features_.Stride();
+  if (partial_hypotheses) {
+    // We're going to have to generate the partial hypotheses
+    if (word_syms_ == nullptr) {
+      KALDI_ERR << "You need to set --word-symbol-table to use "
+                << "partial hypotheses";
+    }
+    cuda_decoder_->AllowPartialHypotheses();
+  }
   DecodeBatch(corr_ids, d_features_ptrs_, features_frame_stride,
               n_input_frames_valid_, d_ivectors_ptrs_, is_first_chunk,
               is_last_chunk, &channels_);
+
+  if (partial_hypotheses) {
+    partial_hypotheses->resize(channels_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      PartialHypothesis *partial_hypothesis;
+      cuda_decoder_->GetPartialHypothesis(channels_[i], &partial_hypothesis);
+      (*partial_hypotheses)[i] = &partial_hypothesis->out_str;
+    }
+  }
 }
 
 void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
@@ -433,7 +451,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecoding(
   }
 
   if (dlat.NumStates() > 0) {
-    if (word_syms_) {
+    // Used for debugging
+    if (false && word_syms_) {
       CompactLattice best_path_clat;
       CompactLatticeShortestPath(dlat, &best_path_clat);
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
index ccb91cb2fc9..75f1dc15c45 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
@@ -169,10 +169,17 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
   // If it contains some last chunks for given utterances, it will call
   // FinalizeDecoding (building the final lattice, determinize it, etc.)
   // asynchronously. The callback for that utterance will then be called
-  void DecodeBatch(const std::vector<CorrelationID> &corr_ids,
-                   const std::vector<SubVector<BaseFloat>> &wave_samples,
-                   const std::vector<bool> &is_first_chunk,
-                   const std::vector<bool> &is_last_chunk);
+  //
+  // If partial_hypotheses is not null, generate and set the current partial
+  // hypotheses in partial_hypotheses. The pointers in partial_hypotheses are
+  // only valid until the next DecodeBatch call - perform a deep copy if
+  // necessary
+  void DecodeBatch(
+      const std::vector<CorrelationID> &corr_ids,
+      const std::vector<SubVector<BaseFloat>> &wave_samples,
+      const std::vector<bool> &is_first_chunk,
+      const std::vector<bool> &is_last_chunk,
+      std::vector<const std::string *> *partial_hypotheses = nullptr);
 
   // Version providing directly the features. Only runs nnet3 & decoder
   // Used when we want to provide the final ivectors (offline case)
@@ -207,8 +214,12 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
   // Maximum number of seconds per chunk
   BaseFloat GetSecondsPerChunk() { return seconds_per_chunk_; }
 
-  // Used when debugging. Used to Print the text when a decoding is done
-  void SetSymbolTable(fst::SymbolTable *word_syms) { word_syms_ = word_syms; }
+  // Used for partial hypotheses
+  void SetSymbolTable(const fst::SymbolTable &word_syms) {
+    word_syms_ = &word_syms;
+    KALDI_ASSERT(cuda_decoder_);
+    cuda_decoder_->SetSymbolTable(word_syms);
+  }
 
   // Wait for all lattice callbacks to complete
   // Can be called after DecodeBatch
@@ -355,7 +366,7 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
   std::unique_ptr<ThreadPoolLight> thread_pool_;
 
   // Used for debugging
-  fst::SymbolTable *word_syms_;
+  const fst::SymbolTable *word_syms_;
   // Used when printing to stdout for debugging purposes
   std::mutex stdout_m_;
 };
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
index e7f00910222..9710ca7508e 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
@@ -245,7 +245,7 @@ class BatchedThreadedNnet3CudaPipeline2 {
   void WaitForAllTasks();
 
   // Used for debug
-  void SetSymbolTable(fst::SymbolTable *word_syms) {
+  void SetSymbolTable(const fst::SymbolTable &word_syms) {
     cuda_online_pipeline_.SetSymbolTable(word_syms);
   }
 
diff --git a/src/cudadecoder/cuda-decoder-common.h b/src/cudadecoder/cuda-decoder-common.h
index fc11ff894bb..0901ba902fc 100644
--- a/src/cudadecoder/cuda-decoder-common.h
+++ b/src/cudadecoder/cuda-decoder-common.h
@@ -345,6 +345,46 @@ class DeviceChannelMatrix : public DeviceMatrix<T> {
   }
 };
 
+// InfoToken contains data that needs to be saved for the backtrack
+// in GetBestPath/GetRawLattice
+// We don't need the token.cost or token.next_state.
+struct __align__(8) InfoToken {
+  int32 prev_token;
+  int32 arc_idx;
+  bool IsUniqueTokenForStateAndFrame() {
+    // This is a trick used to save space and PCI-E bandwidth (cf
+    // preprocess_in_place kernel)
+    // This token is associated with a next_state s, created during the
+    // processing of frame f.
+    // If we have multiple tokens associated with the state s in the frame f,
+    // arc_idx < 0 and -arc_idx is the
+    // count of such tokens. We will then have to look at another list to read
+    // the actual arc_idx and prev_token values
+    // If the current token is the only one, prev_token and arc_idx are valid
+    // and can be used directly
+    return (arc_idx >= 0);
+  }
+
+  // Called if this token is linked to other tokens in the same frame (cf
+  // comments for IsUniqueTokenForStateAndFrame)
+  // return the {offset,size} pair necessary to list those tokens in the
+  // extra_prev_tokens list
+  // They are stored at offset "offset", and we have "size" of those
+  std::pair<int32, int32> GetSameFSTStateTokensList() {
+    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());
+
+    return {prev_token, -arc_idx};
+  }
+};
+
+// Device function, used to set in an InfoToken the [offset,size] related to
+// InfoToken.GetSameFSTStateTokensList
+__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
+                                                     InfoToken *info_token) {
+  // We always have size > 0
+  *info_token = {offset, -size};
+}
+
 // LaneCounters/ChannelCounters
 // The counters are all the singular values associated to a lane/channel
 // For instance the main queue size. Or the min_cost of all tokens in that
@@ -431,6 +471,7 @@ struct LaneCounters {
   int32 n_within_lattice_beam;
   int32 has_reached_final;  // if there's at least one final token in the queue
   int32 prev_arg_min_int_cost;
+  InfoToken prev_arg_min_int_cost_token;
 };
 
 // Channel counters
@@ -471,46 +512,6 @@ class CudaDecoderException : public std::exception {
   const bool recoverable;
 };
 
-// InfoToken contains data that needs to be saved for the backtrack
-// in GetBestPath/GetRawLattice
-// We don't need the token.cost or token.next_state.
-struct __align__(8) InfoToken {
-  int32 prev_token;
-  int32 arc_idx;
-  bool IsUniqueTokenForStateAndFrame() {
-    // This is a trick used to save space and PCI-E bandwidth (cf
-    // preprocess_in_place kernel)
-    // This token is associated with a next_state s, created during the
-    // processing of frame f.
-    // If we have multiple tokens associated with the state s in the frame f,
-    // arc_idx < 0 and -arc_idx is the
-    // count of such tokens. We will then have to look at another list to read
-    // the actually arc_idx and prev_token values
-    // If the current token is the only one, prev_token and arc_idx are valid
-    // and can be used directly
-    return (arc_idx >= 0);
-  }
-
-  // Called if this token is linked to others tokens in the same frame (cf
-  // comments for IsUniqueTokenForStateAndFrame)
-  // return the {offset,size} pair necessary to list those tokens in the
-  // extra_prev_tokens list
-  // They are stored at offset "offset", and we have "size" of those
-  std::pair<int32, int32> GetSameFSTStateTokensList() {
-    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());
-
-    return {prev_token, -arc_idx};
-  }
-};
-
-// Device function, used to set a in an InfoToken the [offset,size] related to
-// InfoToken.GetSameFSTStateTokensList
-__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
-                                                     InfoToken *info_token) {
-  // We always have size > 0
-  *info_token = {offset, -size};
-}
-
 // Used to store the index in the GPU hashmap of that FST state
 // The hashmap is only generated with the final main queue (post max_active_) of
 // each frame
@@ -558,6 +559,25 @@ enum OVERFLOW_TYPE {
 
 enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 };
 
+// Used internally to generate partial paths
+struct PartialPathArc {
+  int32 token_idx;
+  int32 arc_idx;
+};
+
+// Partial hypothesis formatted and meant to be used by user
+struct PartialHypothesis {
+  std::vector<int32> arc_idx;
+  std::vector<int32> olabel;
+  std::string out_str;
+
+  void clear() {
+    arc_idx.clear();
+    olabel.clear();
+    out_str.clear();
+  }
+};
+
 }  // end namespace cuda_decoder
 }  // end namespace kaldi
 
diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu
index f2a0d16d317..ee436cc9ab8 100644
--- a/src/cudadecoder/cuda-decoder-kernels.cu
+++ b/src/cudadecoder/cuda-decoder-kernels.cu
@@ -1401,7 +1401,7 @@ __global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params,
   const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
   int32 min_int_cost = lane_counters->min_int_cost;
   CostType min_cost = orderedIntToFloat(min_int_cost);
-  const int32 global_offset = channel_counters->prev_main_q_global_offset;
+  const int32 global_offset = lane_counters->main_q_global_offset;
   KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
     // Position of considered token in the main_q
     if (main_q_idx < main_q_end) {
@@ -1415,6 +1415,7 @@
         channel_counters->min_int_cost_and_arg_without_final = {
             token_int_cost, global_offset + main_q_idx};
         lane_counters->prev_arg_min_int_cost = main_q_idx;
+        lane_counters->prev_arg_min_int_cost_token =
+            cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx];
       } else {
         // remove offset = min_cost
         CostType token_cost = orderedIntToFloat(token_int_cost) - min_cost;
diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc
index 39bb93a087e..35ab5fc6c42 100644
--- a/src/cudadecoder/cuda-decoder.cc
+++ b/src/cudadecoder/cuda-decoder.cc
@@ -15,10 +15,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include
 #if HAVE_CUDA == 1
 
-#include "cuda-decoder.h"
+#include "cuda-decoder-kernels.h"
+#include "cuda-decoder.h"
 
@@ -31,7 +32,9 @@ namespace kaldi {
 namespace cuda_decoder {
 CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
                          int32 nlanes, int32 nchannels)
-    : fst_(fst),
+    : word_syms_(NULL),
+      generate_partial_hypotheses_(false),
+      fst_(fst),
       nlanes_(nlanes),
       nchannels_(nchannels),
       channel_lock_(nchannels + 1),
@@ -88,15 +91,15 @@
 }
 
 void CudaDecoder::ReadConfig(const CudaDecoderConfig &cst_config) {
-  CudaDecoderConfig config = cst_config;  // deep copy
+  config_ = cst_config;  // deep copy
   // Sets the missing values using other values
-  config.ComputeConfig();
-  default_beam_ = config.default_beam;
-  lattice_beam_ = config.lattice_beam;
-  ntokens_pre_allocated_ = config.ntokens_pre_allocated;
-  max_active_ = config.max_active;
-  aux_q_capacity_ = config.aux_q_capacity;
-  main_q_capacity_ = config.main_q_capacity;
+  config_.ComputeConfig();
+  default_beam_ = config_.default_beam;
+  lattice_beam_ = config_.lattice_beam;
+  ntokens_pre_allocated_ = config_.ntokens_pre_allocated;
+  max_active_ = config_.max_active;
+  aux_q_capacity_ = config_.aux_q_capacity;
+  main_q_capacity_ = config_.main_q_capacity;
 
   KALDI_ASSERT(default_beam_ >= 0.0f);
   KALDI_ASSERT(lattice_beam_ >= 0.0f);
@@ -184,6 +187,8 @@ void CudaDecoder::AllocateHostData() {
   h_all_tokens_acoustic_cost_.resize(nchannels_);
   h_all_tokens_extra_prev_tokens_.resize(nchannels_);
   h_all_tokens_info_.resize(nchannels_);
+  h_all_channels_partial_hypotheses_.resize(nchannels_);
+  h_all_channels_partial_hypotheses_out_.resize(nchannels_);
   for (int32 ichannel = 0; ichannel < nchannels_; ++ichannel) {
     h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel].reserve(
         ntokens_pre_allocated_);
@@ -193,6 +198,8 @@ void CudaDecoder::AllocateHostData() {
   h_main_q_end_lane_offsets_.resize(nlanes_ + 1);
   h_emitting_main_q_end_lane_offsets_.resize(nlanes_ + 1);
   h_n_extra_prev_tokens_lane_offsets_.resize(nlanes_ + 1);
+  h_argmin_token_cost_.resize(nlanes_);
+  h_argmin_token_cost_token_.resize(nlanes_);
   frame_offsets_.resize(nchannels_);
   num_frames_decoded_.resize(nchannels_, -1);
   lanes2channels_todo_.reserve(nlanes_);
@@ -334,6 +341,7 @@ void CudaDecoder::ComputeInitialChannel() {
   KALDI_ASSERT(ilane == 0);
   // Following kernels working channel_id
   std::vector<ChannelId> channels = {init_channel_id_};
+  num_frames_decoded_[init_channel_id_] = 0;
   SetChannelsInKernelParams(channels);  // not calling LoadChannelsStateToLanes,
                                         // init_channel_id_ is a special case
   h_lanes_counters_.lane(ilane)->channel_to_compute = init_channel_id_;
@@ -419,6 +427,8 @@ void CudaDecoder::InitDecoding(const std::vector<ChannelId> &channels) {
     h_all_argmin_cost_[ichannel] = {-1, 0.0f};
     frame_offsets_[ichannel].clear();
     frame_offsets_[ichannel].push_back(n_initial_tokens);
+    h_all_channels_partial_hypotheses_[ichannel].clear();
+    h_all_channels_partial_hypotheses_out_[ichannel].clear();
     // TODO put it back
     // if (thread_pool_) {
     //   thread_pool_->post([ichannel, this] {
@@ -655,7 +665,16 @@ void CudaDecoder::CopyMainQueueDataToHost() {
         h_lanes_counters_.lane(ilane)->main_q_end_lane_offset;
     h_n_extra_prev_tokens_lane_offsets_[ilane] =
         h_lanes_counters_.lane(ilane)->main_q_n_extra_prev_tokens_lane_offset;
-    lanes2channels_todo_.push_back(channel_to_compute_[ilane]);
+    if (ilane < nlanes_used_) {
+      lanes2channels_todo_.push_back(channel_to_compute_[ilane]);
+      int32 global_offset = h_lanes_counters_.lane(ilane)->main_q_global_offset;
+      h_argmin_token_cost_[ilane] =
+          global_offset + h_lanes_counters_.lane(ilane)->prev_arg_min_int_cost;
+      h_argmin_token_cost_token_[ilane] =
+          h_lanes_counters_.lane(ilane)->prev_arg_min_int_cost_token;
+      const ChannelId ichannel = channel_to_compute_[ilane];
+      ++num_frames_decoded_[ichannel];
+    }
   }
 
   LaunchH2HCopies();
@@ -848,8 +867,6 @@ void CudaDecoder::AdvanceDecoding(
 
   for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) {
     const ChannelId ichannel = channel_to_compute_[ilane];
-    // We're done processing that frame
-    ++num_frames_decoded_[ichannel];
     const int32 main_q_end =
         h_lanes_counters_.lane(ilane)->main_q_narcs_and_end.y;
     // Saving frame offsets for GetRawLattice
@@ -857,6 +874,17 @@ void CudaDecoder::AdvanceDecoding(
                                          main_q_end);
   }
   SaveChannelsStateFromLanes();
+
+  // Waiting for partial paths to be ready (if enabled)
+  // They are computed asynchronously
+  WaitForPartialHypotheses();
+}
+
+void CudaDecoder::WaitForPartialHypotheses() {
+  if (!generate_partial_hypotheses_) return;
+  while (n_partial_hypotheses_threads_not_done_.load(
+             std::memory_order_acquire) > 0)
+    usleep(200);
 }
 
 void CudaDecoder::CheckOverflow() {
@@ -991,6 +1019,57 @@ void CudaDecoder::GetBestCost(const std::vector<ChannelId> &channels,
   }
 }
 
+void CudaDecoder::GetBestPredecessor(int32 ichannel, int32 curr_token_idx,
+                                     int32 *prev_token_idx_out,
+                                     int32 *arc_idx_out) {
+  KALDI_ASSERT(curr_token_idx > 0);
+  KALDI_ASSERT(curr_token_idx < h_all_tokens_info_[ichannel].size());
+  InfoToken token = h_all_tokens_info_[ichannel][curr_token_idx];
+  // We want an arc with extra_cost == 0
+  int32 arc_idx;
+  TokenId prev_token_idx;
+  if (token.IsUniqueTokenForStateAndFrame()) {
+    // If we have only one, it is an arc with
+    // extra_cost == 0
+    arc_idx = token.arc_idx;
+    prev_token_idx = token.prev_token;
+  } else {
+    // Using the first arc with extra_cost == 0
+    int32 offset, size;
+    std::tie(offset, size) = token.GetSameFSTStateTokensList();
+    bool found_best = false;
+    for (auto i = 0; i < size; ++i) {
+      KALDI_ASSERT(
+          (offset + i) <
+          h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+              .size());  // TODO IF paranoid
+      CostType arc_extra_cost =
+          h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+              [offset + i]
+                  .x;
+      // Picking one arc on the best path
+      // (extra_cost == 0)
+      if (arc_extra_cost == 0.0f) {
+        KALDI_ASSERT(
+            h_all_tokens_extra_prev_tokens_[ichannel].size() ==
+            h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+                .size());
+        KALDI_ASSERT((offset + i) <
+                     h_all_tokens_extra_prev_tokens_[ichannel].size());
+        InfoToken list_token =
+            h_all_tokens_extra_prev_tokens_[ichannel][offset + i];
+        arc_idx = list_token.arc_idx;
+        prev_token_idx = list_token.prev_token;
+        found_best = true;
+        break;
+      }
+    }
+    KALDI_ASSERT(found_best);
+  }
+  *prev_token_idx_out = prev_token_idx;
+  *arc_idx_out = arc_idx;
+}
+
 void CudaDecoder::GetBestPath(const std::vector<ChannelId> &channels,
                               std::vector<Lattice *> &fst_out_vec,
                               bool use_final_probs) {
@@ -1019,39 +1098,8 @@ void CudaDecoder::GetBestPath(const std::vector<ChannelId> &channels,
     // it always has index 0
     // We backtrack until that first token
     while (token_idx != 0) {
-      InfoToken token = h_all_tokens_info_[ichannel][token_idx];
-      // We want an arc with extra_cost == 0
-      int32 arc_idx;
-      TokenId prev_token_idx;
-      if (token.IsUniqueTokenForStateAndFrame()) {
-        // If we have only one, it is an arc with
-        // extra_cost == 0
-        arc_idx = token.arc_idx;
-        prev_token_idx = token.prev_token;
-      } else {
-        // Using the first arc with extra_cost == 0
-        int32 offset, size;
-        std::tie(offset, size) = token.GetSameFSTStateTokensList();
-        bool found_best = false;
-        for (auto i = 0; i < size; ++i) {
-          CostType arc_extra_cost =
-              h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
-                  [offset +
-                   i]
-                      .x;
-          // Picking one arc on the best path
-          // (extra_cost == 0)
-          if (arc_extra_cost == 0.0f) {
-            InfoToken list_token =
-                h_all_tokens_extra_prev_tokens_[ichannel][offset + i];
-            arc_idx = list_token.arc_idx;
-            prev_token_idx = list_token.prev_token;
-            found_best = true;
-            break;
-          }
-        }
-        KALDI_ASSERT(found_best);
-      }
+      int32 prev_token_idx, arc_idx;
+      GetBestPredecessor(ichannel, token_idx, &prev_token_idx, &arc_idx);
       reversed_path.push_back(arc_idx);
       token_idx = prev_token_idx;
     }
@@ -1455,6 +1503,7 @@ void CudaDecoder::SwapPrevAndCurrLatticeMap(
 }
 
 void CudaDecoder::WaitForH2HCopies() {
+  Timer timer;
   std::unique_lock<std::mutex> lk(n_h2h_task_not_done_mutex_);
   h2h_done_.wait(lk, [this] { return (n_h2h_task_not_done_ == 0); });
 }
@@ -1717,7 +1766,11 @@ void CudaDecoder::LaunchH2HCopies() {
   n_acoustic_h2h_copies_todo_.store(nlanes_used_ - 1);
   n_infotoken_h2h_copies_todo_.store(nlanes_used_ - 1);
   n_extra_prev_tokens_h2h_copies_todo_.store(nlanes_used_ - 1);
-
+  if (generate_partial_hypotheses_) {
+    n_partial_hypotheses_format_output_todo_.store(nlanes_used_ - 1);
+    n_partial_hypotheses_threads_not_done_.store(thread_pool_ ? n_threads_used_
+                                                              : 1);
+  }
   {
     std::lock_guard<std::mutex> n_h2h_not_done_lk(n_h2h_task_not_done_mutex_);
     n_h2h_task_not_done_ += thread_pool_ ? n_threads_used_ : 1;
@@ -1742,6 +1795,86 @@ void CudaDecoder::ComputeH2HCopiesCPUWorker() {
   }
 }
 
+void CudaDecoder::GeneratePartialPath(LaneId ilane, ChannelId ichannel) {
+  // Partial hypothesis
+  int curr_token_idx = h_argmin_token_cost_[ilane];
+  if (curr_token_idx > 0) {
+    // Finding that token's predecessor
+    InfoToken token = h_argmin_token_cost_token_[ilane];
+    int prev_token_idx = token.prev_token;
+    int arc_idx = token.arc_idx;
+    std::list<PartialPathArc> &partial_hypotheses =
+        h_all_channels_partial_hypotheses_[ichannel];
+
+    // Adding that link at the end of the partial path
+    partial_hypotheses.push_back({curr_token_idx, arc_idx});
+    // Backtracking until we reconnect with our stored partial path
+    if (partial_hypotheses.size() > 1) {
+      auto it = std::prev(partial_hypotheses.end(), 2);
+
+      int32 stored_prev_token_idx = it->token_idx;
+      if (stored_prev_token_idx != prev_token_idx) {
+        // The new partial best path does not connect directly to the previous
+        // partial best path. We need to backtrack until we reconnect with the
+        // previous partial best path (or until we reach the root node)
+
+        // Locking, we need the host channel data to backtrack
+        std::lock_guard<std::mutex> channel_lk(channel_lock_[ichannel]);
+
+        while (true) {
+          stored_prev_token_idx = it->token_idx;
+          if (stored_prev_token_idx == prev_token_idx)
+            break;  // no need to rewrite existing partial path
+          curr_token_idx = prev_token_idx;
+          GetBestPredecessor(ichannel, curr_token_idx, &prev_token_idx,
+                             &arc_idx);
+          *it = {curr_token_idx, arc_idx};
+
+          if (prev_token_idx == 0) break;
+          if (it == partial_hypotheses.begin()) {
+            // Our new path is longer than the previous one
+            // Adding some elements
+            partial_hypotheses.push_front(
+                {-1, -1});  // it will be set on next iteration
+          }
+          --it;
+        }
+
+        if (prev_token_idx == 0) {
+          // We've reached the beginning, we need to purge any remaining
+          // elements
+          partial_hypotheses.erase(partial_hypotheses.begin(), it);
+        }
+      }
+    }
+  }
+}
+
+void CudaDecoder::BuildPartialHypothesisOutput(ChannelId ichannel) {
+  // We assume that we own the channel lock
+  std::list<PartialPathArc> &partial_hypotheses_internal =
+      h_all_channels_partial_hypotheses_[ichannel];
+  PartialHypothesis &out = h_all_channels_partial_hypotheses_out_[ichannel];
+  out.olabel.clear();
+  out.arc_idx.clear();
+  std::ostringstream oss;
+  bool empty = true;
+  // We should only append one word when no backtrack was done in
+  // GeneratePartialPath
+  for (PartialPathArc &link : partial_hypotheses_internal) {
+    int arc_idx = link.arc_idx;
+    int olabel = fst_.h_arc_olabels_[arc_idx];
+    if (olabel == 0) continue;
+    out.olabel.push_back(olabel);
+    out.arc_idx.push_back(arc_idx);
+    if (word_syms_) {
+      if (!empty) oss << ' ';
+      oss << word_syms_->Find(olabel);
+      empty = false;
+    }
+  }
+  out.out_str = oss.str();
+}
+
 void CudaDecoder::ComputeH2HCopies() {
   // Waiting for either something to do or the instruction to stop the
   // threads
@@ -1755,10 +1888,21 @@ void CudaDecoder::ComputeH2HCopies() {
   // If we are done, stop the wait and return now.
   // ComputeH2HCopiesCPUWorker will also return, stopping the thread
   if (!h2h_threads_running_) return;
+
+  int32 ilane;
+  if (generate_partial_hypotheses_) {
+    while ((ilane = n_partial_hypotheses_format_output_todo_.fetch_sub(1)) >=
+           0) {
+      int32 ichannel = lanes2channels_todo_[ilane];
+      GeneratePartialPath(ilane, ichannel);
+      BuildPartialHypothesisOutput(ichannel);
+    }
+    n_partial_hypotheses_threads_not_done_.fetch_sub(1,
+                                                     std::memory_order_release);
+  }
   // Waiting for the D2H copies. This is threadsafe
   // Step 1: acoustic costs
   cudaEventSynchronize(d2h_copy_acoustic_evt_);
-  int32 ilane;
   while ((ilane = n_acoustic_h2h_copies_todo_.fetch_sub(1)) >= 0) {
     int32 ichannel = lanes2channels_todo_[ilane];
     // Lock Channel
@@ -1786,7 +1930,9 @@ void CudaDecoder::ComputeH2HCopies() {
                          h_infotoken_concat_, &h_all_tokens_info_);
   }
 
-  // Step 3: extra prev tokens
+  // Step 3:
+  // - extra prev tokens
+  // - partial path and endpointing
   cudaEventSynchronize(d2h_copy_extra_prev_tokens_evt_);
   while ((ilane = n_extra_prev_tokens_h2h_copies_todo_.fetch_sub(1)) >= 0) {
     int32 ichannel = lanes2channels_todo_[ilane];
@@ -1824,6 +1970,6 @@ void CudaDecoder::SetThreadPoolAndStartCPUWorkers(ThreadPoolLight *thread_pool,
   }
 }
 }  // namespace cuda_decoder
-}  // end namespace kaldi
+}  // namespace kaldi
 
 #endif  // HAVE_CUDA == 1
diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h
index 95bc7cac130..b62661274f5 100644
--- a/src/cudadecoder/cuda-decoder.h
+++ b/src/cudadecoder/cuda-decoder.h
@@ -236,8 +236,17 @@ class CudaDecoder {
                        std::vector<CudaDecodableInterface *> &decodables,
                        int32 max_num_frames = -1);
 
+  void AllowPartialHypotheses() { generate_partial_hypotheses_ = true; }
+  void GetPartialHypothesis(ChannelId ichannel, PartialHypothesis **out) {
+    KALDI_ASSERT(generate_partial_hypotheses_);
+    // No need to lock, all ops on h_all_channels_partial_hypotheses_out_ are
+    // done before returning InitDecoding or AdvanceDecoding
+    *out = &h_all_channels_partial_hypotheses_out_[ichannel];
+  }
+
   // Returns the number of frames already decoded in a given channel
   int32 NumFramesDecoded(ChannelId ichannel) const;
+
   // GetBestPath gets the one-best decoding traceback. If
   // "use_final_probs" is true AND we reached a final state, it limits
   // itself to final states; otherwise it gets the most likely token not
@@ -291,6 +300,11 @@ class CudaDecoder {
   void SetThreadPoolAndStartCPUWorkers(ThreadPoolLight *thread_pool,
                                        int32 nworkers);
 
+  // Used to generate partial results
+  void SetSymbolTable(const fst::SymbolTable &word_syms) {
+    word_syms_ = &word_syms;
+  }
+
  private:
   // Data allocation. Called in constructor
   void AllocateDeviceData();
@@ -314,6 +328,10 @@ class CudaDecoder {
   // software threads into the registers of a CPU)
   void LoadChannelsStateToLanes(const std::vector<ChannelId> &channels);
   void SaveChannelsStateFromLanes();
+  // Given a token, get its best predecessor (lowest-cost predecessor)
+  // Used by GetBestPath and the partial best path traceback
+  void GetBestPredecessor(int32 ichannel, int32 curr_token_idx,
+                          int32 *prev_token_idx_out, int32 *arc_idx_out);
   // Expand the arcs, emitting stage. Must be called after
   // a preprocess_in_place, which happens in PostProcessingMainQueue.
   // ExpandArcsEmitting is called first when decoding a frame,
@@ -409,6 +427,15 @@ class CudaDecoder {
   // We then have to unpack it and move it inside host memory
   // This is done by ComputeH2HCopies
   void ComputeH2HCopies();
+
+  // Used to generate the partial hypotheses
+  // Called asynchronously by the worker threads
+  void BuildPartialHypothesisOutput(ChannelId ichannel);
+  void GeneratePartialPath(LaneId ilane, ChannelId ichannel);
+  // Wait for the async partial hypotheses related tasks to be done
+  // before returning
+  void WaitForPartialHypotheses();
+
   // Takes care of preparing the data for ComputeH2HCopies
   // and check whether we can use the threadpool or we have to do the work
   // on the current thread
@@ -436,6 +463,10 @@ class CudaDecoder {
   // Data members
   //
 
+  CudaDecoderConfig config_;
+  const fst::SymbolTable *word_syms_;  // for partial hypotheses
+  bool generate_partial_hypotheses_;   // set by AllowPartialHypotheses
+
   // The CudaFst data structure contains the FST graph
   // in the CSR format, on both the GPU and CPU memory
   const CudaFst fst_;
@@ -689,6 +720,19 @@ class CudaDecoder {
   std::vector<int32> h_main_q_end_lane_offsets_;
   std::vector<int32> h_emitting_main_q_end_lane_offsets_;
   std::vector<int32> h_n_extra_prev_tokens_lane_offsets_;
+  // Index of the best token for the last frame. Used by endpointing/partial
+  // results
+  std::vector<int32> h_argmin_token_cost_;
+  std::vector<InfoToken> h_argmin_token_cost_token_;
+  // Partial path so far on a given channel
+
+  // Partial hypotheses to be used by user
+  // Only valid between API calls (InitDecoding, AdvanceDecoding)
+  std::vector<PartialHypothesis> h_all_channels_partial_hypotheses_out_;
+
+  // Used internally to store the state of the current partial hypotheses
+  std::vector<std::list<PartialPathArc>> h_all_channels_partial_hypotheses_;
+
   // Used when calling GetBestCost
   std::vector<std::pair<int32, CostType>> argmins_;
   std::vector<bool> has_reached_final_;
@@ -767,6 +811,11 @@ class CudaDecoder {
   std::condition_variable h2h_done_;
   std::condition_variable init_decoding_h2h_done_;
   std::atomic<bool> active_wait_;
+
+  // Used for sync on partial hypotheses tasks
+  std::atomic_int32_t n_partial_hypotheses_format_output_todo_;
+  std::atomic_int32_t n_partial_hypotheses_threads_not_done_;
+
   bool h2h_threads_running_;
   // Using the output from GetBestPath, we add the best tokens (as
   // selected in GetBestCost) from the final frame to the output lattice.
diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc
index f27eb54be6e..7bd97a739c8 100644
--- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc
+++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc
@@ -99,11 +99,14 @@ int main(int argc, char *argv[]) {
   int num_todo = -1;
   int niterations = 3;
   int num_streaming_channels = 2000;
+  bool print_partial_hypotheses = false;
   ParseOptions po(usage);
   po.Register("write-lattice", &write_lattice,
               "Output lattice to a file. Setting to "
               "false is useful when "
               "benchmarking");
+  po.Register("print-partial-hypotheses", &print_partial_hypotheses,
+              "Prints the partial hypotheses");
   po.Register("word-symbol-table", &word_syms_rxfilename,
               "Symbol table for words [for debug output]");
   po.Register("file-limit", &num_todo,
@@ -170,7 +173,7 @@ int main(int argc, char *argv[]) {
                    "table from file "
                 << word_syms_rxfilename;
     else {
-      // cuda_pipeline.SetSymbolTable(word_syms);
+      cuda_pipeline.SetSymbolTable(*word_syms);
     }
   }
 
@@ -248,6 +251,9 @@ int main(int argc, char *argv[]) {
     // Used when use_online_ivectors_
     std::vector<SubVector<BaseFloat>> batch_wave_samples;
 
+    // Partial hypotheses
+    std::vector<const std::string *> partial_hypotheses;
+
     double batch_valid_at = gettime_monotonic();
     bool pipeline_starved_warning_printed = false;
     while (true) {
@@ -339,7 +345,16 @@ int main(int argc, char *argv[]) {
       if (wait_for > 0) usleep(wait_for * 1e6);
       cuda_pipeline.DecodeBatch(batch_corr_ids, batch_wave_samples,
-                                batch_is_first_chunk, batch_is_last_chunk);
+                                batch_is_first_chunk, batch_is_last_chunk,
+                                &partial_hypotheses);
+      if (print_partial_hypotheses) {
+        KALDI_LOG << "========== BEGIN OF PARTIAL HYPOTHESES ==========";
+        for (size_t i = 0; i < batch_corr_ids.size(); ++i) {
+          KALDI_LOG << "CORR_ID #" << batch_corr_ids[i]
+                    << "\t: " << *partial_hypotheses[i];
+        }
+        KALDI_LOG << "=========== END OF PARTIAL HYPOTHESES ===========";
+      }
       batch_corr_ids.clear();
       batch_is_first_chunk.clear();
       batch_is_last_chunk.clear();
diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc
index 83f5b6a0650..980658e00d4 100644
--- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc
+++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc
@@ -123,9 +123,7 @@ int main(int argc, char *argv[]) {
     if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
       KALDI_ERR << "Could not read symbol table from file "
                 << word_syms_rxfilename;
-    else {
-      // cuda_pipeline.SetSymbolTable(word_syms);
-    }
+    cuda_pipeline.SetSymbolTable(*word_syms);
   }
 
   int32 num_task_submitted = 0, num_err = 0;
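
Usage note (not part of the patch): the sketch below shows how a caller could drive the partial-hypotheses path added by this change, following the pattern used in batched-wav-nnet3-cuda-online.cc. It assumes a pipeline that was already constructed and given a symbol table via SetSymbolTable(*word_syms); DecodeChunkWithPartials and the snapshot copy are illustrative helpers, not code from this change.

#include <string>
#include <vector>

#include "base/kaldi-common.h"
#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"

namespace kaldi {
namespace cuda_decoder {

// Feed one batch of audio chunks and print the current partial hypotheses.
// The batch vectors are filled by the caller exactly as in the online binary.
void DecodeChunkWithPartials(
    BatchedThreadedNnet3CudaOnlinePipeline &cuda_pipeline,
    const std::vector<CorrelationID> &batch_corr_ids,
    const std::vector<SubVector<BaseFloat>> &batch_wave_samples,
    const std::vector<bool> &batch_is_first_chunk,
    const std::vector<bool> &batch_is_last_chunk) {
  // Passing a non-null pointer enables partial hypothesis generation.
  // SetSymbolTable() must have been called beforehand, otherwise
  // DecodeBatch() exits with KALDI_ERR.
  std::vector<const std::string *> partial_hypotheses;
  cuda_pipeline.DecodeBatch(batch_corr_ids, batch_wave_samples,
                            batch_is_first_chunk, batch_is_last_chunk,
                            &partial_hypotheses);

  // The strings are owned by the decoder and stay valid only until the next
  // DecodeBatch() call, so deep copy anything that must outlive this scope.
  for (size_t i = 0; i < batch_corr_ids.size(); ++i) {
    std::string snapshot = *partial_hypotheses[i];  // deep copy
    KALDI_LOG << "CORR_ID #" << batch_corr_ids[i] << "\t: " << snapshot;
  }
}

}  // namespace cuda_decoder
}  // namespace kaldi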