Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[src] Partial hypothesis for cuda decoder #4101

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
const std::vector<CorrelationID> &corr_ids,
const std::vector<SubVector<BaseFloat>> &wave_samples,
const std::vector<bool> &is_first_chunk,
const std::vector<bool> &is_last_chunk) {
const std::vector<bool> &is_last_chunk,
std::vector<std::string *> *partial_hypotheses) {
nvtxRangePushA("DecodeBatch");
KALDI_ASSERT(corr_ids.size() > 0);
KALDI_ASSERT(corr_ids.size() == wave_samples.size());
Expand All @@ -242,9 +243,25 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
}
}
int features_frame_stride = d_all_features_.Stride();
if (partial_hypotheses) {
// We're going to have to generate the partial hypotheses
KALDI_ASSERT(
word_syms_ &&
"You need to set --word-symbol-table to use partial hypotheses");
hugovbraun marked this conversation as resolved.
Show resolved Hide resolved
cuda_decoder_->AllowPartialHypotheses();
}
DecodeBatch(corr_ids, d_features_ptrs_, features_frame_stride,
n_input_frames_valid_, d_ivectors_ptrs_, is_first_chunk,
is_last_chunk, &channels_);

if (partial_hypotheses) {
partial_hypotheses->resize(channels_.size());
for (size_t i = 0; i < channels_.size(); ++i) {
PartialHypothesis *partial_hypothesis;
cuda_decoder_->GetPartialHypothesis(channels_[i], &partial_hypothesis);
(*partial_hypotheses)[i] = &partial_hypothesis->out_str;
}
}
}

void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
Expand Down Expand Up @@ -433,7 +450,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecoding(
}

if (dlat.NumStates() > 0) {
if (word_syms_) {
// Used for debugging
if (false && word_syms_) {
CompactLattice best_path_clat;
CompactLatticeShortestPath(dlat, &best_path_clat);

Expand Down
16 changes: 13 additions & 3 deletions src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,16 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
// If it contains some last chunks for given utterances, it will call
// FinalizeDecoding (building the final lattice, determinize it, etc.)
// asynchronously. The callback for that utterance will then be called
//
// If partial_hypotheses is not null, generate and set the current partial
// hypotheses in partial_hypotheses. The pointers in partial_hypotheses are
// only valid until the next DecodeBatch call - perform a deep copy if
// necessary.
void DecodeBatch(const std::vector<CorrelationID> &corr_ids,
const std::vector<SubVector<BaseFloat>> &wave_samples,
const std::vector<bool> &is_first_chunk,
const std::vector<bool> &is_last_chunk);
const std::vector<bool> &is_last_chunk,
std::vector<std::string *> *partial_hypotheses = NULL);
hugovbraun marked this conversation as resolved.
Show resolved Hide resolved

// Version providing directly the features. Only runs nnet3 & decoder
// Used when we want to provide the final ivectors (offline case)
Expand Down Expand Up @@ -207,8 +213,12 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
// Maximum number of seconds per chunk
BaseFloat GetSecondsPerChunk() { return seconds_per_chunk_; }

// Used when debugging. Used to Print the text when a decoding is done
void SetSymbolTable(fst::SymbolTable *word_syms) { word_syms_ = word_syms; }
// Used for partial hypotheses
void SetSymbolTable(fst::SymbolTable *word_syms) {
hugovbraun marked this conversation as resolved.
Show resolved Hide resolved
word_syms_ = word_syms;
KALDI_ASSERT(cuda_decoder_);
cuda_decoder_->SetSymbolTable(word_syms);
}

// Wait for all lattice callbacks to complete
// Can be called after DecodeBatch
Expand Down
100 changes: 60 additions & 40 deletions src/cudadecoder/cuda-decoder-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,46 @@ class DeviceChannelMatrix : public DeviceMatrix<T> {
}
};

// InfoToken contains data that needs to be saved for the backtrack
// in GetBestPath/GetRawLattice.
// We don't need the token.cost or token.next_state.
struct __align__(8) InfoToken {
  int32 prev_token;  // predecessor token index, OR an offset (see below)
  int32 arc_idx;     // FST arc index taken, OR -count of same-state tokens (see below)

  // Returns true iff this token is the only token created for its
  // (FST state, frame) pair, in which case prev_token/arc_idx hold
  // their literal meanings and can be used directly.
  bool IsUniqueTokenForStateAndFrame() {
    // This is a trick used to save space and PCI-E bandwidth (cf
    // preprocess_in_place kernel)
    // This token is associated with a next_state s, created during the
    // processing of frame f.
    // If we have multiple tokens associated with the state s in the frame f,
    // arc_idx < 0 and -arc_idx is the
    // count of such tokens. We will then have to look at another list to read
    // the actual arc_idx and prev_token values
    // If the current token is the only one, prev_token and arc_idx are valid
    // and can be used directly
    return (arc_idx >= 0);
  }

  // Called if this token is linked to other tokens in the same frame (cf
  // comments for IsUniqueTokenForStateAndFrame)
  // return the {offset,size} pair necessary to list those tokens in the
  // extra_prev_tokens list
  // They are stored at offset "offset", and we have "size" of those
  std::pair<int32, int32> GetSameFSTStateTokensList() {
    // Precondition: the "packed count" encoding is in use (arc_idx < 0),
    // otherwise prev_token is a real token index, not an offset.
    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());

    return {prev_token, -arc_idx};
  }
};

// Device function, used to set in an InfoToken the [offset,size] pair
// read back by InfoToken.GetSameFSTStateTokensList, i.e. it marks that
// token as one of several sharing the same FST state in the same frame.
__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
                                                     InfoToken *info_token) {
  // We always have size > 0, so -size < 0 flags the "non-unique token"
  // encoding (cf InfoToken::IsUniqueTokenForStateAndFrame)
  *info_token = {offset, -size};
}

// LaneCounters/ChannelCounters
// The counters are all the singular values associated to a lane/channel
// For instance the main queue size. Or the min_cost of all tokens in that
Expand Down Expand Up @@ -431,6 +471,7 @@ struct LaneCounters {
int32 n_within_lattice_beam;
int32 has_reached_final; // if there's at least one final token in the queue
int32 prev_arg_min_int_cost;
InfoToken prev_arg_min_int_cost_token;
};

// Channel counters
Expand Down Expand Up @@ -471,46 +512,6 @@ class CudaDecoderException : public std::exception {
const bool recoverable;
};

// InfoToken contains data that needs to be saved for the backtrack
// in GetBestPath/GetRawLattice.
// We don't need the token.cost or token.next_state.
struct __align__(8) InfoToken {
  int32 prev_token;  // predecessor token index, OR an offset (see below)
  int32 arc_idx;     // FST arc index taken, OR -count of same-state tokens (see below)

  // Returns true iff this token is the only token created for its
  // (FST state, frame) pair, in which case prev_token/arc_idx hold
  // their literal meanings and can be used directly.
  bool IsUniqueTokenForStateAndFrame() {
    // This is a trick used to save space and PCI-E bandwidth (cf
    // preprocess_in_place kernel)
    // This token is associated with a next_state s, created during the
    // processing of frame f.
    // If we have multiple tokens associated with the state s in the frame f,
    // arc_idx < 0 and -arc_idx is the
    // count of such tokens. We will then have to look at another list to read
    // the actual arc_idx and prev_token values
    // If the current token is the only one, prev_token and arc_idx are valid
    // and can be used directly
    return (arc_idx >= 0);
  }

  // Called if this token is linked to other tokens in the same frame (cf
  // comments for IsUniqueTokenForStateAndFrame)
  // return the {offset,size} pair necessary to list those tokens in the
  // extra_prev_tokens list
  // They are stored at offset "offset", and we have "size" of those
  std::pair<int32, int32> GetSameFSTStateTokensList() {
    // Precondition: the "packed count" encoding is in use (arc_idx < 0),
    // otherwise prev_token is a real token index, not an offset.
    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());

    return {prev_token, -arc_idx};
  }
};

// Device function, used to set in an InfoToken the [offset,size] pair
// read back by InfoToken.GetSameFSTStateTokensList, i.e. it marks that
// token as one of several sharing the same FST state in the same frame.
__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
                                                     InfoToken *info_token) {
  // We always have size > 0, so -size < 0 flags the "non-unique token"
  // encoding (cf InfoToken::IsUniqueTokenForStateAndFrame)
  *info_token = {offset, -size};
}

// Used to store the index in the GPU hashmap of that FST state
// The hashmap is only generated with the final main queue (post max_active_) of
// each frame
Expand Down Expand Up @@ -558,6 +559,25 @@ enum OVERFLOW_TYPE {

enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 };

// Used internally to generate partial paths.
// One element of a backtracked partial path: which token was traversed
// and which FST arc it took.
struct PartialPathArc {
  int32 token_idx;  // token index — presumably into the per-frame token list; confirm against decoder usage
  int32 arc_idx;    // index of the FST arc taken by that token
};

// Partial hypothesis formatted and meant to be used by user.
// NOTE(review): out_str appears to be the text form of olabel (the pipeline
// requires a word symbol table for partial hypotheses) — confirm in decoder.
struct PartialHypothesis {
  std::vector<int> arc_idx;  // FST arc indices along the current best partial path
  std::vector<int> olabel;   // output labels along that path
  std::string out_str;       // human-readable hypothesis string exposed to the user

  // Resets the hypothesis to empty. Container capacity is retained,
  // so repeated reuse avoids reallocation.
  void clear() {
    arc_idx.clear();
    olabel.clear();
    out_str.clear();
  }
};

} // end namespace cuda_decoder
} // end namespace kaldi

Expand Down
3 changes: 2 additions & 1 deletion src/cudadecoder/cuda-decoder-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1401,7 +1401,7 @@ __global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params,
const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
int32 min_int_cost = lane_counters->min_int_cost;
CostType min_cost = orderedIntToFloat(min_int_cost);
const int32 global_offset = channel_counters->prev_main_q_global_offset;
const int32 global_offset = lane_counters->main_q_global_offset;
KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
// Position of considered token in the main_q
if (main_q_idx < main_q_end) {
Expand All @@ -1415,6 +1415,7 @@ __global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params,
channel_counters->min_int_cost_and_arg_without_final = {
token_int_cost, global_offset + main_q_idx};
lane_counters->prev_arg_min_int_cost = main_q_idx;
lane_counters->prev_arg_min_int_cost_token = cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx];
} else {
// remove offset = min_cost
CostType token_cost = orderedIntToFloat(token_int_cost) - min_cost;
Expand Down
Loading