diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
index 6fe87ee3dc7..b55dd8a2ec2 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.cc
@@ -217,7 +217,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
     const std::vector<CorrelationID> &corr_ids,
     const std::vector<SubVector<BaseFloat>> &wave_samples,
     const std::vector<bool> &is_first_chunk,
-    const std::vector<bool> &is_last_chunk) {
+    const std::vector<bool> &is_last_chunk,
+    std::vector<const std::string *> *partial_hypotheses) {
   nvtxRangePushA("DecodeBatch");
   KALDI_ASSERT(corr_ids.size() > 0);
   KALDI_ASSERT(corr_ids.size() == wave_samples.size());
@@ -242,9 +243,26 @@ void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
     }
   }
   int features_frame_stride = d_all_features_.Stride();
+  if (partial_hypotheses) {
+    // We're going to have to generate the partial hypotheses
+    if (word_syms_ == nullptr) {
+      KALDI_ERR << "You need to set --word-symbol-table to use "
+                << "partial hypotheses";
+    }
+    cuda_decoder_->AllowPartialHypotheses();
+  }
   DecodeBatch(corr_ids, d_features_ptrs_, features_frame_stride,
               n_input_frames_valid_, d_ivectors_ptrs_, is_first_chunk,
               is_last_chunk, &channels_);
+
+  if (partial_hypotheses) {
+    partial_hypotheses->resize(channels_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      PartialHypothesis *partial_hypothesis;
+      cuda_decoder_->GetPartialHypothesis(channels_[i], &partial_hypothesis);
+      (*partial_hypotheses)[i] = &partial_hypothesis->out_str;
+    }
+  }
 }
 
 void BatchedThreadedNnet3CudaOnlinePipeline::DecodeBatch(
@@ -433,7 +451,8 @@ void BatchedThreadedNnet3CudaOnlinePipeline::FinalizeDecoding(
   }
 
   if (dlat.NumStates() > 0) {
-    if (word_syms_) {
+    // Used for debugging
+    if (false && word_syms_) {
       CompactLattice best_path_clat;
       CompactLatticeShortestPath(dlat, &best_path_clat);
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
index ccb91cb2fc9..75f1dc15c45 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h
@@ -169,10 +169,17 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
   // If it contains some last chunks for given utterances, it will call
   // FinalizeDecoding (building the final lattice, determinize it, etc.)
   // asynchronously. The callback for that utterance will then be called
-  void DecodeBatch(const std::vector<CorrelationID> &corr_ids,
-                   const std::vector<SubVector<BaseFloat>> &wave_samples,
-                   const std::vector<bool> &is_first_chunk,
-                   const std::vector<bool> &is_last_chunk);
+  //
+  // If partial_hypotheses is not null, generate and set the current partial
+  // hypotheses in partial_hypotheses. The pointers in partial_hypotheses are
+  // only valid until the next DecodeBatch call - perform a deep copy if
+  // necessary
+  void DecodeBatch(
+      const std::vector<CorrelationID> &corr_ids,
+      const std::vector<SubVector<BaseFloat>> &wave_samples,
+      const std::vector<bool> &is_first_chunk,
+      const std::vector<bool> &is_last_chunk,
+      std::vector<const std::string *> *partial_hypotheses = nullptr);
 
   // Version providing directly the features. Only runs nnet3 & decoder
   // Used when we want to provide the final ivectors (offline case)
@@ -207,8 +214,12 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
   // Maximum number of seconds per chunk
   BaseFloat GetSecondsPerChunk() { return seconds_per_chunk_; }
 
-  // Used when debugging. Used to Print the text when a decoding is done
-  void SetSymbolTable(fst::SymbolTable *word_syms) { word_syms_ = word_syms; }
+  // Used for partial hypotheses
+  void SetSymbolTable(const fst::SymbolTable &word_syms) {
+    word_syms_ = &word_syms;
+    KALDI_ASSERT(cuda_decoder_);
+    cuda_decoder_->SetSymbolTable(word_syms);
+  }
 
   // Wait for all lattice callbacks to complete
   // Can be called after DecodeBatch
@@ -355,7 +366,7 @@ class BatchedThreadedNnet3CudaOnlinePipeline {
   std::unique_ptr<ThreadPoolLight> thread_pool_;
 
   // Used for debugging
-  fst::SymbolTable *word_syms_;
+  const fst::SymbolTable *word_syms_;
   // Used when printing to stdout for debugging purposes
   std::mutex stdout_m_;
 };
diff --git a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
index e7f00910222..9710ca7508e 100644
--- a/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
+++ b/src/cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h
@@ -245,7 +245,7 @@ class BatchedThreadedNnet3CudaPipeline2 {
   void WaitForAllTasks();
 
   // Used for debug
-  void SetSymbolTable(fst::SymbolTable *word_syms) {
+  void SetSymbolTable(const fst::SymbolTable &word_syms) {
     cuda_online_pipeline_.SetSymbolTable(word_syms);
   }
 
diff --git a/src/cudadecoder/cuda-decoder-common.h b/src/cudadecoder/cuda-decoder-common.h
index fc11ff894bb..0901ba902fc 100644
--- a/src/cudadecoder/cuda-decoder-common.h
+++ b/src/cudadecoder/cuda-decoder-common.h
@@ -345,6 +345,46 @@ class DeviceChannelMatrix : public DeviceMatrix<T> {
   }
 };
 
+// InfoToken contains data that needs to be saved for the backtrack
+// in GetBestPath/GetRawLattice
+// We don't need the token.cost or token.next_state.
+struct __align__(8) InfoToken {
+  int32 prev_token;
+  int32 arc_idx;
+  bool IsUniqueTokenForStateAndFrame() {
+    // This is a trick used to save space and PCI-E bandwidth (cf
+    // preprocess_in_place kernel)
+    // This token is associated with a next_state s, created during the
+    // processing of frame f.
+    // If we have multiple tokens associated with the state s in the frame f,
+    // arc_idx < 0 and -arc_idx is the
+    // count of such tokens. We will then have to look at another list to read
+    // the actual arc_idx and prev_token values
+    // If the current token is the only one, prev_token and arc_idx are valid
+    // and can be used directly
+    return (arc_idx >= 0);
+  }
+
+  // Called if this token is linked to other tokens in the same frame (cf
+  // comments for IsUniqueTokenForStateAndFrame)
+  // return the {offset,size} pair necessary to list those tokens in the
+  // extra_prev_tokens list
+  // They are stored at offset "offset", and we have "size" of those
+  std::pair<int32, int32> GetSameFSTStateTokensList() {
+    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());
+
+    return {prev_token, -arc_idx};
+  }
+};
+
+// Device function, used to set in an InfoToken the [offset,size] related to
+// InfoToken.GetSameFSTStateTokensList
+__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
+                                                     InfoToken *info_token) {
+  // We always have size > 0
+  *info_token = {offset, -size};
+}
+
 // LaneCounters/ChannelCounters
 // The counters are all the singular values associated to a lane/channel
 // For instance the main queue size. Or the min_cost of all tokens in that
@@ -431,6 +471,7 @@ struct LaneCounters {
   int32 n_within_lattice_beam;
   int32 has_reached_final;  // if there's at least one final token in the queue
   int32 prev_arg_min_int_cost;
+  InfoToken prev_arg_min_int_cost_token;
 };
 
 // Channel counters
@@ -471,46 +512,6 @@ class CudaDecoderException : public std::exception {
   const bool recoverable;
 };
 
-// InfoToken contains data that needs to be saved for the backtrack
-// in GetBestPath/GetRawLattice
-// We don't need the token.cost or token.next_state.
-struct __align__(8) InfoToken {
-  int32 prev_token;
-  int32 arc_idx;
-  bool IsUniqueTokenForStateAndFrame() {
-    // This is a trick used to save space and PCI-E bandwidth (cf
-    // preprocess_in_place kernel)
-    // This token is associated with a next_state s, created during the
-    // processing of frame f.
-    // If we have multiple tokens associated with the state s in the frame f,
-    // arc_idx < 0 and -arc_idx is the
-    // count of such tokens. We will then have to look at another list to read
-    // the actually arc_idx and prev_token values
-    // If the current token is the only one, prev_token and arc_idx are valid
-    // and can be used directly
-    return (arc_idx >= 0);
-  }
-
-  // Called if this token is linked to others tokens in the same frame (cf
-  // comments for IsUniqueTokenForStateAndFrame)
-  // return the {offset,size} pair necessary to list those tokens in the
-  // extra_prev_tokens list
-  // They are stored at offset "offset", and we have "size" of those
-  std::pair<int32, int32> GetSameFSTStateTokensList() {
-    KALDI_ASSERT(!IsUniqueTokenForStateAndFrame());
-
-    return {prev_token, -arc_idx};
-  }
-};
-
-// Device function, used to set a in an InfoToken the [offset,size] related to
-// InfoToken.GetSameFSTStateTokensList
-__device__ __inline__ void SetSameFSTStateTokensList(int32 offset, int32 size,
-                                                     InfoToken *info_token) {
-  // We always have size > 0
-  *info_token = {offset, -size};
-}
-
 // Used to store the index in the GPU hashmap of that FST state
 // The hashmap is only generated with the final main queue (post max_active_) of
 // each frame
@@ -558,6 +559,25 @@ enum OVERFLOW_TYPE {
 
 enum QUEUE_ID { MAIN_Q = 0, AUX_Q = 1 };
 
+// Used internally to generate partial paths
+struct PartialPathArc {
+  int32 token_idx;
+  int32 arc_idx;
+};
+
+// Partial hypothesis formatted and meant to be used by user
+struct PartialHypothesis {
+  std::vector<int32> arc_idx;
+  std::vector<int32> olabel;
+  std::string out_str;
+
+  void clear() {
+    arc_idx.clear();
+    olabel.clear();
+    out_str.clear();
+  }
+};
+
 }  // end namespace cuda_decoder
 }  // end namespace kaldi
 
diff --git a/src/cudadecoder/cuda-decoder-kernels.cu b/src/cudadecoder/cuda-decoder-kernels.cu
index f2a0d16d317..ee436cc9ab8 100644
--- a/src/cudadecoder/cuda-decoder-kernels.cu
+++ b/src/cudadecoder/cuda-decoder-kernels.cu
@@ -1401,7 +1401,7 @@ __global__ void fill_hashmap_with_main_q_kernel(DeviceParams cst_dev_params,
   const int32 main_q_end = lane_counters->main_q_narcs_and_end.y;
   int32 min_int_cost = lane_counters->min_int_cost;
   CostType min_cost = orderedIntToFloat(min_int_cost);
-  const int32 global_offset = channel_counters->prev_main_q_global_offset;
+  const int32 global_offset = lane_counters->main_q_global_offset;
   KALDI_CUDA_DECODER_1D_KERNEL_LOOP(main_q_idx, main_q_end) {
     // Position of considered token in the main_q
     if (main_q_idx < main_q_end) {
@@ -1415,6 +1415,7 @@
         channel_counters->min_int_cost_and_arg_without_final = {
             token_int_cost, global_offset + main_q_idx};
         lane_counters->prev_arg_min_int_cost = main_q_idx;
+        lane_counters->prev_arg_min_int_cost_token =
+            cst_dev_params.d_main_q_info.lane(ilane)[main_q_idx];
       } else {
         // remove offset = min_cost
         CostType token_cost = orderedIntToFloat(token_int_cost) - min_cost;
diff --git a/src/cudadecoder/cuda-decoder.cc b/src/cudadecoder/cuda-decoder.cc
index 39bb93a087e..35ab5fc6c42 100644
--- a/src/cudadecoder/cuda-decoder.cc
+++ b/src/cudadecoder/cuda-decoder.cc
@@ -15,10 +15,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include
 #if HAVE_CUDA == 1
 
-#include "cuda-decoder.h"
+#include "cuda-decoder-kernels.h"
+#include "cuda-decoder.h"
 
@@ -31,7 +32,9 @@ namespace kaldi {
 namespace cuda_decoder {
 CudaDecoder::CudaDecoder(const CudaFst &fst, const CudaDecoderConfig &config,
                          int32 nlanes, int32 nchannels)
-    : fst_(fst),
+    : word_syms_(NULL),
+      generate_partial_hypotheses_(false),
+      fst_(fst),
       nlanes_(nlanes),
       nchannels_(nchannels),
       channel_lock_(nchannels + 1),
@@ -88,15 +91,15 @@
 }
 
 void CudaDecoder::ReadConfig(const CudaDecoderConfig &cst_config) {
-  CudaDecoderConfig config = cst_config;  // deep copy
+  config_ = cst_config;  // deep copy
   // Sets the missing values using other values
-  config.ComputeConfig();
-  default_beam_ = config.default_beam;
-  lattice_beam_ = config.lattice_beam;
-  ntokens_pre_allocated_ = config.ntokens_pre_allocated;
-  max_active_ = config.max_active;
-  aux_q_capacity_ = config.aux_q_capacity;
-  main_q_capacity_ = config.main_q_capacity;
+  config_.ComputeConfig();
+  default_beam_ = config_.default_beam;
+  lattice_beam_ = config_.lattice_beam;
+  ntokens_pre_allocated_ = config_.ntokens_pre_allocated;
+  max_active_ = config_.max_active;
+  aux_q_capacity_ = config_.aux_q_capacity;
+  main_q_capacity_ = config_.main_q_capacity;
 
   KALDI_ASSERT(default_beam_ >= 0.0f);
   KALDI_ASSERT(lattice_beam_ >= 0.0f);
@@ -184,6 +187,8 @@ void CudaDecoder::AllocateHostData() {
   h_all_tokens_acoustic_cost_.resize(nchannels_);
   h_all_tokens_extra_prev_tokens_.resize(nchannels_);
   h_all_tokens_info_.resize(nchannels_);
+  h_all_channels_partial_hypotheses_.resize(nchannels_);
+  h_all_channels_partial_hypotheses_out_.resize(nchannels_);
   for (int32 ichannel = 0; ichannel < nchannels_; ++ichannel) {
     h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel].reserve(
         ntokens_pre_allocated_);
@@ -193,6 +198,8 @@ void CudaDecoder::AllocateHostData() {
   h_main_q_end_lane_offsets_.resize(nlanes_ + 1);
   h_emitting_main_q_end_lane_offsets_.resize(nlanes_ + 1);
   h_n_extra_prev_tokens_lane_offsets_.resize(nlanes_ + 1);
+  h_argmin_token_cost_.resize(nlanes_);
+  h_argmin_token_cost_token_.resize(nlanes_);
   frame_offsets_.resize(nchannels_);
   num_frames_decoded_.resize(nchannels_, -1);
   lanes2channels_todo_.reserve(nlanes_);
@@ -334,6 +341,7 @@ void CudaDecoder::ComputeInitialChannel() {
   KALDI_ASSERT(ilane == 0);
   // Following kernels working channel_id
   std::vector<ChannelId> channels = {init_channel_id_};
+  num_frames_decoded_[init_channel_id_] = 0;
   SetChannelsInKernelParams(channels);  // not calling LoadChannelsStateToLanes,
                                         // init_channel_id_ is a special case
   h_lanes_counters_.lane(ilane)->channel_to_compute = init_channel_id_;
@@ -419,6 +427,8 @@ void CudaDecoder::InitDecoding(const std::vector<ChannelId> &channels) {
     h_all_argmin_cost_[ichannel] = {-1, 0.0f};
     frame_offsets_[ichannel].clear();
     frame_offsets_[ichannel].push_back(n_initial_tokens);
+    h_all_channels_partial_hypotheses_[ichannel].clear();
+    h_all_channels_partial_hypotheses_out_[ichannel].clear();
     // TODO put it back
     // if (thread_pool_) {
     //   thread_pool_->post([ichannel, this] {
@@ -655,7 +665,16 @@ void CudaDecoder::CopyMainQueueDataToHost() {
         h_lanes_counters_.lane(ilane)->main_q_end_lane_offset;
     h_n_extra_prev_tokens_lane_offsets_[ilane] =
         h_lanes_counters_.lane(ilane)->main_q_n_extra_prev_tokens_lane_offset;
-    lanes2channels_todo_.push_back(channel_to_compute_[ilane]);
+    if (ilane < nlanes_used_) {
+      lanes2channels_todo_.push_back(channel_to_compute_[ilane]);
+      int32 global_offset = h_lanes_counters_.lane(ilane)->main_q_global_offset;
+      h_argmin_token_cost_[ilane] =
+          global_offset + h_lanes_counters_.lane(ilane)->prev_arg_min_int_cost;
+      h_argmin_token_cost_token_[ilane] =
+          h_lanes_counters_.lane(ilane)->prev_arg_min_int_cost_token;
+      const ChannelId ichannel = channel_to_compute_[ilane];
+      ++num_frames_decoded_[ichannel];
+    }
   }
 
   LaunchH2HCopies();
@@ -848,8 +867,6 @@ void CudaDecoder::AdvanceDecoding(
 
   for (LaneId ilane = 0; ilane < nlanes_used_; ++ilane) {
     const ChannelId ichannel = channel_to_compute_[ilane];
-    // We're done processing that frame
-    ++num_frames_decoded_[ichannel];
     const int32 main_q_end =
         h_lanes_counters_.lane(ilane)->main_q_narcs_and_end.y;
     // Saving frame offsets for GetRawLattice
@@ -857,6 +874,17 @@ void CudaDecoder::AdvanceDecoding(
                                          main_q_end);
   }
   SaveChannelsStateFromLanes();
+
+  // Waiting for partial paths to be ready (if enabled)
+  // They are computed asynchronously
+  WaitForPartialHypotheses();
+}
+
+void CudaDecoder::WaitForPartialHypotheses() {
+  if (!generate_partial_hypotheses_) return;
+  while (n_partial_hypotheses_threads_not_done_.load(
+             std::memory_order_acquire) > 0)
+    usleep(200);
 }
 
 void CudaDecoder::CheckOverflow() {
@@ -991,6 +1019,57 @@ void CudaDecoder::GetBestCost(const std::vector<ChannelId> &channels,
   }
 }
 
+void CudaDecoder::GetBestPredecessor(int32 ichannel, int32 curr_token_idx,
+                                     int32 *prev_token_idx_out,
+                                     int32 *arc_idx_out) {
+  KALDI_ASSERT(curr_token_idx > 0);
+  KALDI_ASSERT(curr_token_idx < h_all_tokens_info_[ichannel].size());
+  InfoToken token = h_all_tokens_info_[ichannel][curr_token_idx];
+  // We want an arc with extra_cost == 0
+  int32 arc_idx;
+  TokenId prev_token_idx;
+  if (token.IsUniqueTokenForStateAndFrame()) {
+    // If we have only one, it is an arc with
+    // extra_cost == 0
+    arc_idx = token.arc_idx;
+    prev_token_idx = token.prev_token;
+  } else {
+    // Using the first arc with extra_cost == 0
+    int32 offset, size;
+    std::tie(offset, size) = token.GetSameFSTStateTokensList();
+    bool found_best = false;
+    for (auto i = 0; i < size; ++i) {
+      KALDI_ASSERT(
+          (offset + i) <
+          h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+              .size());  // TODO IF paranoid
+      CostType arc_extra_cost =
+          h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+              [offset + i]
+                  .x;
+      // Picking one arc on the best path
+      // (extra_cost == 0)
+      if (arc_extra_cost == 0.0f) {
+        KALDI_ASSERT(
+            h_all_tokens_extra_prev_tokens_[ichannel].size() ==
+            h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
+                .size());
+        KALDI_ASSERT((offset + i) <
+                     h_all_tokens_extra_prev_tokens_[ichannel].size());
+        InfoToken list_token =
+            h_all_tokens_extra_prev_tokens_[ichannel][offset + i];
+        arc_idx = list_token.arc_idx;
+        prev_token_idx = list_token.prev_token;
+        found_best = true;
+        break;
+      }
+    }
+    KALDI_ASSERT(found_best);
+  }
+  *prev_token_idx_out = prev_token_idx;
+  *arc_idx_out = arc_idx;
+}
+
 void CudaDecoder::GetBestPath(const std::vector<ChannelId> &channels,
                               std::vector<Lattice *> &fst_out_vec,
                               bool use_final_probs) {
@@ -1019,39 +1098,8 @@ void CudaDecoder::GetBestPath(const std::vector<ChannelId> &channels,
     // it always has index 0
     // We backtrack until that first token
     while (token_idx != 0) {
-      InfoToken token = h_all_tokens_info_[ichannel][token_idx];
-      // We want an arc with extra_cost == 0
-      int32 arc_idx;
-      TokenId prev_token_idx;
-      if (token.IsUniqueTokenForStateAndFrame()) {
-        // If we have only one, it is an arc with
-        // extra_cost == 0
-        arc_idx = token.arc_idx;
-        prev_token_idx = token.prev_token;
-      } else {
-        // Using the first arc with extra_cost == 0
-        int32 offset, size;
-        std::tie(offset, size) = token.GetSameFSTStateTokensList();
-        bool found_best = false;
-        for (auto i = 0; i < size; ++i) {
-          CostType arc_extra_cost =
-              h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_[ichannel]
-                  [offset +
-                   i]
-                      .x;
-          // Picking one arc on the best path
-          // (extra_cost == 0)
-          if (arc_extra_cost == 0.0f) {
-            InfoToken list_token =
-                h_all_tokens_extra_prev_tokens_[ichannel][offset + i];
-            arc_idx = list_token.arc_idx;
-            prev_token_idx = list_token.prev_token;
-            found_best = true;
-            break;
-          }
-        }
-        KALDI_ASSERT(found_best);
-      }
+      int32 prev_token_idx, arc_idx;
+      GetBestPredecessor(ichannel, token_idx, &prev_token_idx, &arc_idx);
       reversed_path.push_back(arc_idx);
       token_idx = prev_token_idx;
     }
@@ -1455,6 +1503,7 @@ void CudaDecoder::SwapPrevAndCurrLatticeMap(
 }
 
 void CudaDecoder::WaitForH2HCopies() {
+  Timer timer;
   std::unique_lock<std::mutex> lk(n_h2h_task_not_done_mutex_);
   h2h_done_.wait(lk, [this] { return (n_h2h_task_not_done_ == 0); });
 }
@@ -1717,7 +1766,11 @@ void CudaDecoder::LaunchH2HCopies() {
   n_acoustic_h2h_copies_todo_.store(nlanes_used_ - 1);
   n_infotoken_h2h_copies_todo_.store(nlanes_used_ - 1);
   n_extra_prev_tokens_h2h_copies_todo_.store(nlanes_used_ - 1);
-
+  if (generate_partial_hypotheses_) {
+    n_partial_hypotheses_format_output_todo_.store(nlanes_used_ - 1);
+    n_partial_hypotheses_threads_not_done_.store(thread_pool_ ? n_threads_used_
+                                                              : 1);
+  }
   {
     std::lock_guard<std::mutex> n_h2h_not_done_lk(n_h2h_task_not_done_mutex_);
     n_h2h_task_not_done_ += thread_pool_ ? n_threads_used_ : 1;
@@ -1742,6 +1795,86 @@ void CudaDecoder::ComputeH2HCopiesCPUWorker() {
   }
 }
 
+void CudaDecoder::GeneratePartialPath(LaneId ilane, ChannelId ichannel) {
+  // Partial hypothesis
+  int curr_token_idx = h_argmin_token_cost_[ilane];
+  if (curr_token_idx > 0) {
+    // Finding that token's predecessor
+    InfoToken token = h_argmin_token_cost_token_[ilane];
+    int prev_token_idx = token.prev_token;
+    int arc_idx = token.arc_idx;
+    std::list<PartialPathArc> &partial_hypotheses =
+        h_all_channels_partial_hypotheses_[ichannel];
+
+    // Adding that link at the end of the partial path
+    partial_hypotheses.push_back({curr_token_idx, arc_idx});
+    // Backtracking until we reconnect with our stored partial path
+    if (partial_hypotheses.size() > 1) {
+      auto it = std::prev(partial_hypotheses.end(), 2);
+
+      int32 stored_prev_token_idx = it->token_idx;
+      if (stored_prev_token_idx != prev_token_idx) {
+        // The new partial best path does not connect directly to the previous
+        // partial best path. We need to backtrack until we reconnect with the
+        // previous partial best path (or until we reach the root node)
+
+        // Locking, we need the host channel data to backtrack
+        std::lock_guard<std::mutex> channel_lk(channel_lock_[ichannel]);
+
+        while (true) {
+          stored_prev_token_idx = it->token_idx;
+          if (stored_prev_token_idx == prev_token_idx)
+            break;  // no need to rewrite existing partial path
+          curr_token_idx = prev_token_idx;
+          GetBestPredecessor(ichannel, curr_token_idx, &prev_token_idx,
+                             &arc_idx);
+          *it = {curr_token_idx, arc_idx};
+
+          if (prev_token_idx == 0) break;
+          if (it == partial_hypotheses.begin()) {
+            // Our new path is longer than the previous one
+            // Adding some elements
+            partial_hypotheses.push_front(
+                {-1, -1});  // it will be set on next iteration
+          }
+          --it;
+        }
+
+        if (prev_token_idx == 0) {
+          // We've reached the beginning, we need to purge any remaining
+          // elements
+          partial_hypotheses.erase(partial_hypotheses.begin(), it);
+        }
+      }
+    }
+  }
+}
+
+void CudaDecoder::BuildPartialHypothesisOutput(ChannelId ichannel) {
+  // We assume that we own the channel lock
+  std::list<PartialPathArc> &partial_hypotheses_internal =
+      h_all_channels_partial_hypotheses_[ichannel];
+  PartialHypothesis &out = h_all_channels_partial_hypotheses_out_[ichannel];
+  out.olabel.clear();
+  out.arc_idx.clear();
+  std::ostringstream oss;
+  bool empty = true;
+  // We should only append one word when no backtrack was done in
+  // GeneratePartialPath
+  for (PartialPathArc &link : partial_hypotheses_internal) {
+    int arc_idx = link.arc_idx;
+    int olabel = fst_.h_arc_olabels_[arc_idx];
+    if (olabel == 0) continue;
+    out.olabel.push_back(olabel);
+    out.arc_idx.push_back(arc_idx);
+    if (word_syms_) {
+      if (!empty) oss << ' ';
+      oss << word_syms_->Find(olabel);
+      empty = false;
+    }
+  }
+  out.out_str = oss.str();
+}
+
 void CudaDecoder::ComputeH2HCopies() {
   // Waiting for either something to do or the instruction to stop the
   // threads
@@ -1755,10 +1888,21 @@ void CudaDecoder::ComputeH2HCopies() {
   // If we are done, stop the wait and return now.
   // ComputeH2HCopiesCPUWorker will also return, stopping the thread
   if (!h2h_threads_running_) return;
+
+  int32 ilane;
+  if (generate_partial_hypotheses_) {
+    while ((ilane = n_partial_hypotheses_format_output_todo_.fetch_sub(1)) >=
+           0) {
+      int32 ichannel = lanes2channels_todo_[ilane];
+      GeneratePartialPath(ilane, ichannel);
+      BuildPartialHypothesisOutput(ichannel);
+    }
+    n_partial_hypotheses_threads_not_done_.fetch_sub(1,
+                                                     std::memory_order_release);
+  }
   // Waiting for the D2H copies. This is threadsafe
   // Step 1: acoustic costs
   cudaEventSynchronize(d2h_copy_acoustic_evt_);
-  int32 ilane;
   while ((ilane = n_acoustic_h2h_copies_todo_.fetch_sub(1)) >= 0) {
     int32 ichannel = lanes2channels_todo_[ilane];
     // Lock Channel
@@ -1786,7 +1930,9 @@ void CudaDecoder::ComputeH2HCopies() {
                          h_infotoken_concat_, &h_all_tokens_info_);
   }
 
-  // Step 3: extra prev tokens
+  // Step 3:
+  // - extra prev tokens
+  // - partial path and endpointing
   cudaEventSynchronize(d2h_copy_extra_prev_tokens_evt_);
   while ((ilane = n_extra_prev_tokens_h2h_copies_todo_.fetch_sub(1)) >= 0) {
     int32 ichannel = lanes2channels_todo_[ilane];
@@ -1824,6 +1970,6 @@ void CudaDecoder::SetThreadPoolAndStartCPUWorkers(ThreadPoolLight *thread_pool,
   }
 }
 }  // namespace cuda_decoder
-}  // end namespace kaldi
+}  // namespace kaldi
 
 #endif  // HAVE_CUDA == 1
diff --git a/src/cudadecoder/cuda-decoder.h b/src/cudadecoder/cuda-decoder.h
index 95bc7cac130..b62661274f5 100644
--- a/src/cudadecoder/cuda-decoder.h
+++ b/src/cudadecoder/cuda-decoder.h
@@ -236,8 +236,17 @@ class CudaDecoder {
                        std::vector<CudaDecodableInterface *> &decodables,
                        int32 max_num_frames = -1);
 
+  void AllowPartialHypotheses() { generate_partial_hypotheses_ = true; }
+  void GetPartialHypothesis(ChannelId ichannel, PartialHypothesis **out) {
+    KALDI_ASSERT(generate_partial_hypotheses_);
+    // No need to lock, all ops on h_all_channels_partial_hypotheses_out_ are
+    // done before returning InitDecoding or AdvanceDecoding
+    *out = &h_all_channels_partial_hypotheses_out_[ichannel];
+  }
+
   // Returns the number of frames already decoded in a given channel
   int32 NumFramesDecoded(ChannelId ichannel) const;
+
   // GetBestPath gets the one-best decoding traceback. If
   // "use_final_probs" is true AND we reached a final state, it limits
   // itself to final states; otherwise it gets the most likely token not
@@ -291,6 +300,11 @@ class CudaDecoder {
   void SetThreadPoolAndStartCPUWorkers(ThreadPoolLight *thread_pool,
                                        int32 nworkers);
 
+  // Used to generate partial results
+  void SetSymbolTable(const fst::SymbolTable &word_syms) {
+    word_syms_ = &word_syms;
+  }
+
  private:
   // Data allocation. Called in constructor
   void AllocateDeviceData();
@@ -314,6 +328,10 @@ class CudaDecoder {
   // software threads into the registers of a CPU)
   void LoadChannelsStateToLanes(const std::vector<ChannelId> &channels);
   void SaveChannelsStateFromLanes();
+  // Given a token, get its best predecessor (lowest-cost predecessor)
+  // Used by GetBestPath and the partial best path traceback
+  void GetBestPredecessor(int32 ichannel, int32 curr_token_idx,
+                          int32 *prev_token_idx_out, int32 *arc_idx_out);
   // Expand the arcs, emitting stage. Must be called after
   // a preprocess_in_place, which happens in PostProcessingMainQueue.
   // ExpandArcsEmitting is called first when decoding a frame,
@@ -409,6 +427,15 @@ class CudaDecoder {
   // We then have to unpack it and move it inside host memory
   // This is done by ComputeH2HCopies
   void ComputeH2HCopies();
+
+  // Used to generate the partial hypotheses
+  // Called asynchronously by the worker threads
+  void BuildPartialHypothesisOutput(ChannelId ichannel);
+  void GeneratePartialPath(LaneId ilane, ChannelId ichannel);
+  // Wait for the async partial hypotheses related tasks to be done
+  // before returning
+  void WaitForPartialHypotheses();
+
   // Takes care of preparing the data for ComputeH2HCopies
   // and check whether we can use the threadpool or we have to do the work
   // on the current thread
@@ -436,6 +463,10 @@ class CudaDecoder {
   // Data members
   //
 
+  CudaDecoderConfig config_;
+  const fst::SymbolTable *word_syms_;  // for partial hypotheses
+  bool generate_partial_hypotheses_;   // set by AllowPartialHypotheses
+
   // The CudaFst data structure contains the FST graph
   // in the CSR format, on both the GPU and CPU memory
   const CudaFst fst_;
@@ -689,6 +720,19 @@ class CudaDecoder {
   std::vector<int32> h_main_q_end_lane_offsets_;
   std::vector<int32> h_emitting_main_q_end_lane_offsets_;
   std::vector<int32> h_n_extra_prev_tokens_lane_offsets_;
+  // Index of the best token for the last frame. Used by endpointing/partial
+  // results
+  std::vector<int32> h_argmin_token_cost_;
+  std::vector<InfoToken> h_argmin_token_cost_token_;
+  // Partial path so far on a given channel
+
+  // Partial hypotheses to be used by user
+  // Only valid between API calls (InitDecoding, AdvanceDecoding)
+  std::vector<PartialHypothesis> h_all_channels_partial_hypotheses_out_;
+
+  // Used internally to store the state of the current partial hypotheses
+  std::vector<std::list<PartialPathArc>> h_all_channels_partial_hypotheses_;
+
   // Used when calling GetBestCost
   std::vector<std::pair<int32, CostType>> argmins_;
   std::vector<bool> has_reached_final_;
@@ -767,6 +811,11 @@ class CudaDecoder {
   std::condition_variable h2h_done_;
   std::condition_variable init_decoding_h2h_done_;
   std::atomic<bool> active_wait_;
+
+  // Used for sync on partial hypotheses tasks
+  std::atomic_int32_t n_partial_hypotheses_format_output_todo_;
+  std::atomic_int32_t n_partial_hypotheses_threads_not_done_;
+
   bool h2h_threads_running_;
   // Using the output from GetBestPath, we add the best tokens (as
   // selected in GetBestCost) from the final frame to the output lattice.
diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc
index f27eb54be6e..7bd97a739c8 100644
--- a/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc
+++ b/src/cudadecoderbin/batched-wav-nnet3-cuda-online.cc
@@ -99,11 +99,14 @@ int main(int argc, char *argv[]) {
   int num_todo = -1;
   int niterations = 3;
   int num_streaming_channels = 2000;
+  bool print_partial_hypotheses = false;
   ParseOptions po(usage);
   po.Register("write-lattice", &write_lattice,
               "Output lattice to a file. Setting to "
               "false is useful when "
               "benchmarking");
+  po.Register("print-partial-hypotheses", &print_partial_hypotheses,
+              "Prints the partial hypotheses");
   po.Register("word-symbol-table", &word_syms_rxfilename,
               "Symbol table for words [for debug output]");
   po.Register("file-limit", &num_todo,
@@ -170,7 +173,7 @@ int main(int argc, char *argv[]) {
                    "table from file "
                 << word_syms_rxfilename;
     else {
-      // cuda_pipeline.SetSymbolTable(word_syms);
+      cuda_pipeline.SetSymbolTable(*word_syms);
     }
   }
 
@@ -248,6 +251,9 @@ int main(int argc, char *argv[]) {
     // Used when use_online_ivectors_
     std::vector<SubVector<BaseFloat>> batch_wave_samples;
 
+    // Partial hypotheses
+    std::vector<const std::string *> partial_hypotheses;
+
     double batch_valid_at = gettime_monotonic();
     bool pipeline_starved_warning_printed = false;
     while (true) {
@@ -339,7 +345,16 @@ int main(int argc, char *argv[]) {
       if (wait_for > 0) usleep(wait_for * 1e6);
       cuda_pipeline.DecodeBatch(batch_corr_ids, batch_wave_samples,
-                                batch_is_first_chunk, batch_is_last_chunk);
+                                batch_is_first_chunk, batch_is_last_chunk,
+                                &partial_hypotheses);
+      if (print_partial_hypotheses) {
+        KALDI_LOG << "========== BEGIN OF PARTIAL HYPOTHESES ==========";
+        for (size_t i = 0; i < batch_corr_ids.size(); ++i) {
+          KALDI_LOG << "CORR_ID #" << batch_corr_ids[i]
+                    << "\t: " << *partial_hypotheses[i];
+        }
+        KALDI_LOG << "=========== END OF PARTIAL HYPOTHESES ===========";
+      }
       batch_corr_ids.clear();
       batch_is_first_chunk.clear();
       batch_is_last_chunk.clear();
diff --git a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc
index 83f5b6a0650..980658e00d4 100644
--- a/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc
+++ b/src/cudadecoderbin/batched-wav-nnet3-cuda2.cc
@@ -123,9 +123,7 @@ int main(int argc, char *argv[]) {
     if (!(word_syms = fst::SymbolTable::ReadText(word_syms_rxfilename)))
       KALDI_ERR << "Could not read symbol table from file "
                 << word_syms_rxfilename;
-    else {
-      // cuda_pipeline.SetSymbolTable(word_syms);
-    }
+    cuda_pipeline.SetSymbolTable(*word_syms);
   }
 
   int32 num_task_submitted = 0, num_err = 0;
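
Usage note (not part of the patch): the sketch below shows how a caller could drive the partial-hypotheses path added by this change, following the pattern used in batched-wav-nnet3-cuda-online.cc. It assumes a pipeline that was already constructed and given a symbol table via SetSymbolTable(*word_syms); DecodeChunkWithPartials and the snapshot copy are illustrative helpers, not code from this change.

#include <string>
#include <vector>

#include "base/kaldi-common.h"
#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"

namespace kaldi {
namespace cuda_decoder {

// Feed one batch of audio chunks and print the current partial hypotheses.
// The batch vectors are filled by the caller exactly as in the online binary.
void DecodeChunkWithPartials(
    BatchedThreadedNnet3CudaOnlinePipeline &cuda_pipeline,
    const std::vector<CorrelationID> &batch_corr_ids,
    const std::vector<SubVector<BaseFloat>> &batch_wave_samples,
    const std::vector<bool> &batch_is_first_chunk,
    const std::vector<bool> &batch_is_last_chunk) {
  // Passing a non-null pointer enables partial hypothesis generation.
  // SetSymbolTable() must have been called beforehand, otherwise
  // DecodeBatch() exits with KALDI_ERR.
  std::vector<const std::string *> partial_hypotheses;
  cuda_pipeline.DecodeBatch(batch_corr_ids, batch_wave_samples,
                            batch_is_first_chunk, batch_is_last_chunk,
                            &partial_hypotheses);

  // The strings are owned by the decoder and stay valid only until the next
  // DecodeBatch() call, so deep copy anything that must outlive this scope.
  for (size_t i = 0; i < batch_corr_ids.size(); ++i) {
    std::string snapshot = *partial_hypotheses[i];  // deep copy
    KALDI_LOG << "CORR_ID #" << batch_corr_ids[i] << "\t: " << snapshot;
  }
}

}  // namespace cuda_decoder
}  // namespace kaldi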