[src] Add binary that functions as a TCP server (#2938)

kaldi-asr · Mar 20, 2019 · 252690f · 252690f
1 parent f9276a5
commit 252690f
Show file tree

Hide file tree

Showing 10 changed files with 578 additions and 7 deletions.
diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox
@@ -438,6 +438,70 @@ and downloadable models that can be used with online nnet3 decoding, please
 see http://kaldi-asr.org/models.html (the first model there, the ASPIRE model,
 includes instructions in a README file).
 
+\subsection online_decoding_nnet3_tcp TCP server for nnet3 online decoding
+
+The program to run the TCP sever is online2-tcp-nnet3-decode-faster located in the
+~/src/online2bin folder. The usage is as follows:
+
+\verbatim
+online2-tcp-nnet3-decode-faster <nnet3-in> <fst-in> <word-symbol-table> <listen-port>
+\endverbatim
+
+For example:
+
+\verbatim
+online2-tcp-nnet3-decode-faster model/final.mdl graph/HCLG.fst graph/words.txt 5050
+\endverbatim
+
+The word symbol table is mandatory (unlike other nnet3 online decoding programs) because
+the server outputs word strings. Endpointing is mandatory to make the operation of the
+program reasonable. Other, non-standard options include:
+    - samp-freq - sampling frequency of audio (usually 8000 for telephony and 16000 for other uses)
+    - chunk-length - length of signal being processed by decoder at each step
+    - output-period - how often we check for changes in the decoding (ie. output refresh rate, default 1s)
+    - num-threads-startup - number of threads used when initializing iVector extractor
+
+The TCP protocol simply takes RAW signal on input (16-bit signed integer
+encoding at chosen sampling frequency) and outputs simple text using the following
+logic:
+    - each refresh period (output-freq argument) the current state of decoding is output
+    - each line is terminated by '\r'
+    - once an utterance boundary is detected due to endpointing a '\n' char is output
+
+Each output string (delimited by '\r') should be treated as uncertain and can change
+entirely until the utterance delimiter ('\n') is sent. The delimiter chars are chosen
+specifically in order to make the output look neat in the terminal. It is possible to
+use it with other interfaces and a web demo (HTML/JS AudioAPI+WebSockets) exists.
+
+To run the program from the terminal you can use one of the following commands. First,
+make sure the server is running and accepting connections. Using the Aspire models, the
+command should look like this:
+\verbatim
+online2-tcp-nnet3-decode-faster --samp-freq=8000 --frames-per-chunk=20 --extra-left-context-initial=0
+    --frame-subsampling-factor=3 --config=model/conf/online.conf --min-active=200 --max-active=7000
+    --beam=15.0 --lattice-beam=6.0 --acoustic-scale=1.0 model/final.mdl graph/HCLG.fst graph/words.txt 5050
+\endverbatim
+
+To send a WAV file into the server, it first needs to be decoded into raw audio, then it can be
+sent to the socket:
+\verbatim
+sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | nc -N localhost 5050
+\endverbatim
+
+It is possible to play audio (almost) simultaneously as decoding. It may require installing the
+'pv' program (used to throttle the signal into Kaldi at the same speed as the playback):
+
+\verbatim
+sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | \
+    tee >(play -t raw -r 8k -e signed-integer -b 16 -c 1 -q -) | \
+    pv -L 16000 -q | nc -N localhost 5050
+\endverbatim
+
+Finally, it is possible to send audio from the microphone directly into the server:
+
+\verbatim
+rec -r 8k -e signed-integer -c 1 -b 16 -t raw -q - | nc -N localhost 5050
+\endverbatim
 
 
 */

diff --git a/src/nnet3/decodable-online-looped.cc b/src/nnet3/decodable-online-looped.cc
@@ -30,6 +30,7 @@ DecodableNnetLoopedOnlineBase::DecodableNnetLoopedOnlineBase(
     num_chunks_computed_(0),
     current_log_post_subsampled_offset_(-1),
     info_(info),
+    frame_offset_(0),
     input_features_(input_features),
     ivector_features_(ivector_features),
     computer_(info_.opts.compute_config, info_.computation,
@@ -66,7 +67,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const {
   if (input_finished) {
     // if the input has finished,... we'll pad with duplicates of the last frame
     // as needed to get the required right context.
-    return (features_ready + sf - 1) / sf;
+    return (features_ready + sf - 1) / sf - frame_offset_;
   } else {
     // note: info_.right_context_ includes both the model context and any
     // extra_right_context_ (but this
@@ -78,7 +79,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const {
     // doesn't need any attention to rounding because info_.frames_per_chunk
     // is always a multiple of 'sf' (see 'frames_per_chunk = GetChunksize..."
     // in decodable-simple-looped.cc).
-    return num_chunks_ready * info_.frames_per_chunk / sf;
+    return num_chunks_ready * info_.frames_per_chunk / sf - frame_offset_;
   }
 }
 
@@ -105,9 +106,14 @@ bool DecodableNnetLoopedOnlineBase::IsLastFrame(
     return false;
   int32 sf = info_.opts.frame_subsampling_factor,
      num_subsampled_frames_ready = (features_ready + sf - 1) / sf;
-  return (subsampled_frame == num_subsampled_frames_ready - 1);
+  return (subsampled_frame + frame_offset_ == num_subsampled_frames_ready - 1);
 }
 
+void DecodableNnetLoopedOnlineBase::SetFrameOffset(int32 frame_offset) {
+  KALDI_ASSERT(0 <= frame_offset &&
+               frame_offset <= frame_offset_ + NumFramesReady());
+  frame_offset_ = frame_offset;
+}
 
 void DecodableNnetLoopedOnlineBase::AdvanceChunk() {
   // Prepare the input data for the next chunk of features.
@@ -231,6 +237,7 @@ void DecodableNnetLoopedOnlineBase::AdvanceChunk() {
 
 BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
                                                     int32 index) {
+  subsampled_frame += frame_offset_;
   EnsureFrameIsComputed(subsampled_frame);
   // note: we index by 'inde
   return current_log_post_(
@@ -241,6 +248,7 @@ BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
 
 BaseFloat DecodableAmNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
                                                     int32 index) {
+  subsampled_frame += frame_offset_;
   EnsureFrameIsComputed(subsampled_frame);
   return current_log_post_(
       subsampled_frame - current_log_post_subsampled_offset_,

diff --git a/src/nnet3/decodable-online-looped.h b/src/nnet3/decodable-online-looped.h
@@ -81,6 +81,17 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface {
     return info_.opts.frame_subsampling_factor;
   }
 
+  /// Sets the frame offset value. Frame offset is initialized to 0 when the
+  /// decodable object is constructed and stays as 0 unless this method is
+  /// called. This method is useful when we want to reset the decoder state,
+  /// i.e. call decoder.InitDecoding(), but we want to keep using the same
+  /// decodable object, e.g. in case of an endpoint. The frame offset affects
+  /// the behavior of IsLastFrame(), NumFramesReady() and LogLikelihood()
+  /// methods.
+  void SetFrameOffset(int32 frame_offset);
+
+  /// Returns the frame offset value.
+  int32 GetFrameOffset() const { return frame_offset_; }
 
  protected:
 
@@ -111,6 +122,11 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface {
 
   const DecodableNnetSimpleLoopedInfo &info_;
 
+  // IsLastFrame(), NumFramesReady() and LogLikelihood() methods take into
+  // account this offset value. We initialize frame_offset_ as 0 and it stays as
+  // 0 unless SetFrameOffset() method is called.
+  int32 frame_offset_;
+
  private:
 
   // This function does the computation for the next chunk.  It will change

diff --git a/src/online2/online-feature-pipeline.h b/src/online2/online-feature-pipeline.h
@@ -166,7 +166,7 @@ class OnlineFeaturePipeline: public OnlineFeatureInterface {
 
   // This is supplied for debug purposes.
   void GetAsMatrix(Matrix<BaseFloat> *feats);
-  
+
   void FreezeCmvn();  // stop it from moving further (do this when you start
                       // using fMLLR). This will crash if NumFramesReady() == 0.
 

diff --git a/src/online2/online-nnet2-feature-pipeline.cc b/src/online2/online-nnet2-feature-pipeline.cc
@@ -128,6 +128,21 @@ void OnlineNnet2FeaturePipeline::GetFrame(int32 frame,
   return final_feature_->GetFrame(frame, feat);
 }
 
+void OnlineNnet2FeaturePipeline::UpdateFrameWeights(
+    const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
+    int32 frame_offset) {
+  if (frame_offset == 0) {
+    IvectorFeature()->UpdateFrameWeights(delta_weights);
+  } else {
+    std::vector<std::pair<int32, BaseFloat> > offset_delta_weights;
+    for (size_t i = 0; i < delta_weights.size(); i++) {
+      offset_delta_weights.push_back(std::make_pair(
+          delta_weights[i].first + frame_offset, delta_weights[i].second));
+    }
+    IvectorFeature()->UpdateFrameWeights(offset_delta_weights);
+  }
+}
+
 void OnlineNnet2FeaturePipeline::SetAdaptationState(
     const OnlineIvectorExtractorAdaptationState &adaptation_state) {
   if (info_.use_ivectors) {

diff --git a/src/online2/online-nnet2-feature-pipeline.h b/src/online2/online-nnet2-feature-pipeline.h
@@ -196,6 +196,20 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
   virtual int32 NumFramesReady() const;
   virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
 
+  /// If you are downweighting silence, you can call
+  /// OnlineSilenceWeighting::GetDeltaWeights and supply the output to this
+  /// class using UpdateFrameWeights().  The reason why this call happens
+  /// outside this class, rather than this class pulling in the data weights,
+  /// relates to multi-threaded operation and also from not wanting this class
+  /// to have excessive dependencies.
+  ///
+  /// You must either always call this as soon as new data becomes available,
+  /// ideally just after calling AcceptWaveform(), or never call it for the
+  /// lifetime of this object.
+  void UpdateFrameWeights(
+      const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
+      int32 frame_offset = 0);
+
   /// Set the adaptation state to a particular value, e.g. reflecting previous
   /// utterances of the same speaker; this will generally be called after
   /// Copy().

diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc
@@ -41,6 +41,12 @@ SingleUtteranceNnet3DecoderTpl<FST>::SingleUtteranceNnet3DecoderTpl(
   decoder_.InitDecoding();
 }
 
+template <typename FST>
+void SingleUtteranceNnet3DecoderTpl<FST>::InitDecoding(int32 frame_offset) {
+  decoder_.InitDecoding();
+  decodable_.SetFrameOffset(frame_offset);
+}
+
 template <typename FST>
 void SingleUtteranceNnet3DecoderTpl<FST>::AdvanceDecoding() {
   decoder_.AdvanceDecoding(&decodable_);
@@ -56,7 +62,6 @@ int32 SingleUtteranceNnet3DecoderTpl<FST>::NumFramesDecoded() const {
   return decoder_.NumFramesDecoded();
 }
 
-
 template <typename FST>
 void SingleUtteranceNnet3DecoderTpl<FST>::GetLattice(bool end_of_utterance,
                                              CompactLattice *clat) const {

diff --git a/src/online2/online-nnet3-decoding.h b/src/online2/online-nnet3-decoding.h
@@ -60,7 +60,13 @@ class SingleUtteranceNnet3DecoderTpl {
                                  const FST &fst,
                                  OnlineNnet2FeaturePipeline *features);
 
-  /// advance the decoding as far as we can.
+  /// Initializes the decoding and sets the frame offset of the underlying
+  /// decodable object. This method is called by the constructor. You can also
+  /// call this method when you want to reset the decoder state, but want to
+  /// keep using the same decodable object, e.g. in case of an endpoint.
+  void InitDecoding(int32 frame_offset = 0);
+
+  /// Advances the decoding as far as we can.
   void AdvanceDecoding();
 
   /// Finalizes the decoding. Cleans up and prunes remaining tokens, so the

diff --git a/src/online2bin/Makefile b/src/online2bin/Makefile
@@ -11,7 +11,8 @@ BINFILES = online2-wav-gmm-latgen-faster apply-cmvn-online \
      online2-wav-nnet2-latgen-faster ivector-extract-online2 \
      online2-wav-dump-features ivector-randomize \
      online2-wav-nnet2-am-compute  online2-wav-nnet2-latgen-threaded \
-     online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar
+     online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar \
+     online2-tcp-nnet3-decode-faster
 
 OBJFILES =