Changes done for v2.3.2.

IBMStreams · Jan 10, 2022 · cb7ae97 · cb7ae97
1 parent f17e1e3
commit cb7ae97
Show file tree

Hide file tree

Showing 14 changed files with 214 additions and 24 deletions.
diff --git a/com.ibm.streamsx.sttgateway/CHANGELOG.md b/com.ibm.streamsx.sttgateway/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changes
 
+## v2.3.2
+* Jan/10/2022
+* Fixed a problem where the call start date time values were not always correctly included in the STT result.
+* Added these three new parameters to the WatsonSTT operator: speechDetectorSensitivity, backgroundAudioSuppression, characterInsertionBias
+
 ## v2.3.1
 * Sep/20/2021
 * Dynamically change the maximum concurrent calls allowed value.

diff --git a/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT.xml b/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT.xml
@@ -510,7 +510,7 @@
 
       <parameter>
         <name>customizationWeight</name>
-        <description>This parameter specifies a relative weight for a custom language model as a float64 between 0.0 to 1.0 (Default is 0.0)</description>
+        <description>This parameter specifies a relative weight for a custom language model as a float64 from 0.0 to 1.0 (Default is 0.0)</description>
         <optional>true</optional>
         <rewriteAllowed>true</rewriteAllowed>
         <expressionMode>AttributeFree</expressionMode>
@@ -666,6 +666,36 @@
         <cardinality>1</cardinality>
       </parameter>
 
+      <parameter>
+        <name>speechDetectorSensitivity</name>
+        <description>This parameter specifies a float64 value from 0.0 to 1.0 to adjust the sensitivity of speech activity detection (Default is 0.5)</description>
+        <optional>true</optional>
+        <rewriteAllowed>true</rewriteAllowed>
+        <expressionMode>AttributeFree</expressionMode>
+        <type>float64</type>
+        <cardinality>1</cardinality>
+      </parameter>  
+
+      <parameter>
+        <name>backgroundAudioSuppression</name>
+        <description>This parameter specifies a float64 value from 0.0 to 1.0 to suppress side conversations or background noise (Default is 0.0)</description>
+        <optional>true</optional>
+        <rewriteAllowed>true</rewriteAllowed>
+        <expressionMode>AttributeFree</expressionMode>
+        <type>float64</type>
+        <cardinality>1</cardinality>
+      </parameter>  
+
+      <parameter>
+        <name>characterInsertionBias</name>
+        <description>This parameter specifies a float64 value from -0.5 to 1.0 to change how prone the STT engine is to insert more transcribed characters (Default is 0.0)</description>
+        <optional>true</optional>
+        <rewriteAllowed>true</rewriteAllowed>
+        <expressionMode>AttributeFree</expressionMode>
+        <type>float64</type>
+        <cardinality>1</cardinality>
+      </parameter>  
+
     </parameters>
     <inputPorts>
       <inputPortSet>

diff --git a/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT_cpp.cgt b/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT_cpp.cgt
@@ -8,7 +8,7 @@
 /*
 ============================================================
 First created on: Jul/01/2018
-Last modified on: Sep/12/2021
+Last modified on: Jan/04/2022
 
 Please refer to the sttgateway-tech-brief.txt file in the 
 top-level directory of this toolkit to read about 
@@ -459,6 +459,18 @@ https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-websocke
 
 	my $sttLiveMetricsUpdateNeeded = $model->getParameterByName("sttLiveMetricsUpdateNeeded");
 	$sttLiveMetricsUpdateNeeded = $sttLiveMetricsUpdateNeeded ? $sttLiveMetricsUpdateNeeded->getValueAt(0)->getCppExpression() : 1;
+	
+	my $speechDetectorSensitivity = $model->getParameterByName("speechDetectorSensitivity");
+	# Default: 0.5
+	$speechDetectorSensitivity = $speechDetectorSensitivity ? $speechDetectorSensitivity->getValueAt(0)->getCppExpression() : 0.5;
+	
+	my $backgroundAudioSuppression = $model->getParameterByName("backgroundAudioSuppression");
+	# Default: 0.0
+	$backgroundAudioSuppression = $backgroundAudioSuppression ? $backgroundAudioSuppression->getValueAt(0)->getCppExpression() : 0.0;
+
+	my $characterInsertionBias = $model->getParameterByName("characterInsertionBias");
+	# Default: 0.0
+	$characterInsertionBias = $characterInsertionBias ? $characterInsertionBias->getValueAt(0)->getCppExpression() : 0.0;
 %>
 
 #include <type_traits>
@@ -501,7 +513,10 @@ MY_OPERATOR::MY_OPERATOR()
 						<%=$redactionNeeded%>,
 						<%=$keywordsSpottingThreshold%>,
 						<%=$keywordsToBeSpotted%>,
-						<%=$isTranscriptionCompletedRequested%>
+						<%=$isTranscriptionCompletedRequested%>,
+						<%=$speechDetectorSensitivity%>,
+						<%=$backgroundAudioSuppression%>,
+						<%=$characterInsertionBias%>
 					}
 				)
 {}

diff --git a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTConfig.hpp b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTConfig.hpp
@@ -47,6 +47,9 @@ struct WatsonSTTConfig {
 	SPL::float64 keywordsSpottingThreshold;
 	const SPL::list<SPL::rstring> keywordsToBeSpotted;
 	const bool isTranscriptionCompletedRequested;
+	SPL::float64 speechDetectorSensitivity;
+	SPL::float64 backgroundAudioSuppression;
+	SPL::float64 characterInsertionBias;
 
 	// Some definitions
 	//This time becomes effective, when the connectionAttemptsThreshold limit is exceeded

diff --git a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImpl.hpp b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImpl.hpp
@@ -232,6 +232,18 @@ WatsonSTTImpl<OP, OT>::WatsonSTTImpl(OP & splOperator_,Conf config_)
 		throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::keywordsSpottingThreshold, "keywordsSpottingThreshold"));
 	}
 
+	if (Conf::speechDetectorSensitivity < 0.0 || Conf::speechDetectorSensitivity > 1.0) {
+		throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::speechDetectorSensitivity, "speechDetectorSensitivity"));
+	}
+
+	if (Conf::backgroundAudioSuppression < 0.0 || Conf::backgroundAudioSuppression > 1.0) {
+		throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::backgroundAudioSuppression, "backgroundAudioSuppression"));
+	}
+
+	if (Conf::characterInsertionBias < -0.5 || Conf::characterInsertionBias > 1.0) {
+		throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::characterInsertionBias, "characterInsertionBias"));
+	}
+
 	// If the keywords to be spotted list is empty, then disable keywords_spotting.
 	if (Conf::keywordsToBeSpotted.size() == 0) {
 		Conf::keywordsSpottingThreshold = 0.0;
@@ -246,9 +258,9 @@ WatsonSTTImpl<OP, OT>::WatsonSTTImpl(OP & splOperator_,Conf config_)
 	}
 
 	// The parameters maxUtteranceAlternatives, wordAlternativesThreshold, keywordsSpottingThreshold, keywordsToBeSpotted
-	// are not available in sttResultMose complete
+	// are not available in sttResultMode complete
 	// The COF getUtteranceNumber, isFinalizedUtterance, getConfidence, getUtteranceAlternatives
-	// are not available in sttResultMose complete
+	// are not available in sttResultMode complete
 
 	// Update the operator metric.
 	sttOutputResultModeMetric->setValueNoLock(Conf::sttOutputResultMode);
@@ -284,6 +296,9 @@ WatsonSTTImpl<OP, OT>::WatsonSTTImpl(OP & splOperator_,Conf config_)
 	<< "\nkeywordsSpottingThreshold               = " << Conf::keywordsSpottingThreshold
 	<< "\nkeywordsToBeSpotted                     = " << Conf::keywordsToBeSpotted
 	<< "\nisTranscriptionCompletedRequested       = " << Conf::isTranscriptionCompletedRequested
+	<< "\nspeechDetectorSensitivity               = " << Conf::speechDetectorSensitivity
+	<< "\nbackgroundAudioSuppression              = " << Conf::backgroundAudioSuppression
+	<< "\ncharacterInsertionBias                  = " << Conf::characterInsertionBias
 	<< "\nconnectionState.wsState.is_lock_free()  = " << Rec::wsState.is_lock_free()
 	<< "\nrecentOTuple.is_lock_free()             = " << Rec::recentOTuple.is_lock_free()
 	<< "\n----------------------------------------------------------------" << std::endl;

diff --git a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImplReceiver.hpp b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImplReceiver.hpp
@@ -578,6 +578,20 @@ void WatsonSTTImplReceiver<OP, OT>::on_open(client* c, websocketpp::connection_h
 		msg += ", \"redaction\" : true";
 	}
 
+	// https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection
+	if (speechDetectorSensitivity >= 0.0) {
+		msg += ", \"speech_detector_sensitivity\" : " + boost::to_string(speechDetectorSensitivity);
+	}
+
+	// https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection
+	if (backgroundAudioSuppression >= 0.0) {
+		msg += ", \"background_audio_suppression\" : " + boost::to_string(backgroundAudioSuppression);
+	}
+
+	if (characterInsertionBias >= -0.5) { // Lowest allowed value for this parameter is -0.5 (range: -0.5 to 1.0).
+		msg += ", \"character_insertion_bias\" : " + boost::to_string(characterInsertionBias);
+	}
+
 	// https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#keyword_spotting
 	if (keywordsSpottingThreshold > 0.0) {
 		msg += ", \"keywords_threshold\" : " + boost::to_string(keywordsSpottingThreshold);

diff --git a/com.ibm.streamsx.sttgateway/info.xml b/com.ibm.streamsx.sttgateway/info.xml
@@ -14,7 +14,7 @@
 
     **Note:** This toolkit requires c++11 support.
     </description>
-    <version>2.3.1</version>
+    <version>2.3.2</version>
     <requiredProductVersion>4.2.1.6</requiredProductVersion>
   </identity>
   <dependencies>

diff --git a/...uterToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl b/...uterToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl
@@ -1,14 +1,14 @@
 /*
 ==============================================
 # Licensed Materials - Property of IBM
-# Copyright IBM Corp. 2018, 2021
+# Copyright IBM Corp. 2018, 2022
 ==============================================
 */
 
 /*
 ==============================================
 First created on: Nov/27/2020
-Last modified on: Sep/20/2021
+Last modified on: Jan/08/2022
 
 IMPORTANT NOTE
 --------------
@@ -846,6 +846,10 @@ public composite VgwDataRouterToWatsonSTT {
 					// After getting released, such UDP channels will become available for 
 					// doing speech to text work for any new voice calls.
 					mutable map<rstring, int32> _vgwSessionToCompletedUdpChannelMap = {};
+					// This map tells us the call start date time string for a given vgwSessionId.
+					mutable map<rstring, rstring> _vgwSessionToCallStartDateTime = {};
+					// This map tells us the call start time in epoch seconds for a given vgwSessionId.
+					mutable map<rstring, int64> _vgwSessionToCallStartTimeInEpochSeconds = {};
 					mutable BinarySpeech_t _oTuple = {};
 					mutable rstring _key = "";
 				}
@@ -994,6 +998,10 @@ public composite VgwDataRouterToWatsonSTT {
 									if(BSD.callStartTimeInEpochSeconds == 0l) {
 										BSD.callStartTimeInEpochSeconds = getSeconds(getTimestamp());
 									}
+
+									// Insert the call start date time values in the state variables.
+									insertM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId, BSD.callStartDateTime);
+									insertM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId, BSD.callStartTimeInEpochSeconds);
 
 									rstring socsFileName = dataDirectory() + "/" +
 										BSD.vgwSessionId + "-call-started.txt";
@@ -1055,6 +1063,12 @@ public composite VgwDataRouterToWatsonSTT {
 							(rstring)BSD.totalSpeechDataBytesReceived +
 							", speechEngineId=" + (rstring)BSD.speechEngineId +
 							", speechResultProcessorId=" + (rstring)BSD.speechResultProcessorId); 
+						// Set the call start date time values to the tuple attributes.
+						if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) {
+							BSD.callStartDateTime = _vgwSessionToCallStartDateTime[BSD.vgwSessionId];
+							BSD.callStartTimeInEpochSeconds = _vgwSessionToCallStartTimeInEpochSeconds[BSD.vgwSessionId];
+						}
+
 						// Submit this tuple.
 						submit(BSD, BSDF);
 					} else {
@@ -1209,6 +1223,12 @@ public composite VgwDataRouterToWatsonSTT {
 									// We are done. Remove it from the map as well.
 									removeM(_vgwSessionToCompletedUdpChannelMap, key2);
 								}
+
+								// Remove the call start date time values from the state variables.
+								if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) {
+									removeM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId);
+									removeM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId);
+								}
 
 								// At this time, the voice call for this VGW session id has ended.
 								// We can now write an "End of Call" indicator file in the
@@ -1353,6 +1373,15 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken;
 			(boolean)getSubmissionTimeValue("smartFormattingNeeded", "false");
 		expression<boolean> $redactionNeeded : 
 			(boolean)getSubmissionTimeValue("redactionNeeded", "false");
+		// Allowed value range for this is from 0.0 to 1.0.
+		expression<float64> $speechDetectorSensitivity : 
+			(float64)getSubmissionTimeValue("speechDetectorSensitivity", "0.5");
+		// Allowed value range for this is from 0.0 to 1.0.	
+		expression<float64> $backgroundAudioSuppression : 
+			(float64)getSubmissionTimeValue("backgroundAudioSuppression", "0.0");
+		// Allowed value range for this is from -0.5 to 1.0.	
+		expression<float64> $characterInsertionBias : 
+			(float64)getSubmissionTimeValue("characterInsertionBias", "0.0");
 		expression<float64> $keywordsSpottingThreshold : 
 			(float64)getSubmissionTimeValue("keywordsSpottingThreshold", "0.0");
 		expression<list<rstring>> $keywordsToBeSpotted : 
@@ -1466,6 +1495,9 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken;
 				wordAlternativesThreshold: $wordAlternativesThreshold;
 				smartFormattingNeeded: $smartFormattingNeeded;
 				redactionNeeded: $redactionNeeded;
+				speechDetectorSensitivity: $speechDetectorSensitivity;
+				backgroundAudioSuppression: $backgroundAudioSuppression;
+				characterInsertionBias: $characterInsertionBias;
 				keywordsSpottingThreshold: $keywordsSpottingThreshold;
 				keywordsToBeSpotted: $keywordsToBeSpotted;
 				websocketLoggingNeeded: $sttWebsocketLoggingNeeded;
@@ -1685,11 +1717,16 @@ public composite STTResultProcessor(input MyTranscriptionResult, BinarySpeechDat
                        MTR.callStartTimeInEpochSeconds =  _callStartTimeInEpochSeconds;
                     }
 
-					// If the user opted for include the time of the
+					// If the user opted to include the
 					// utterance result reception time, let us add it to
 					// the transcription result.
 					if($includeUtteranceResultReceptionTime == true) {
+						// This is the current time expressed in ctime format.
 						MTR.utteranceResultReceptionTime = ctime(getTimestamp());
+						// We will also do the utterance reception time expressed
+						// in seconds elapsed since the start of the call.
+						MTR.utteranceRxTime = 
+							getSeconds(getTimestamp()) - MTR.callStartTimeInEpochSeconds;
 					}
 
 					// We will write the transcription results to 

diff --git a/samples/VgwDataRouterToWatsonSTT/info.xml b/samples/VgwDataRouterToWatsonSTT/info.xml
@@ -4,13 +4,13 @@
  <info:identity>
    <info:name>VgwDataRouterToWatsonSTT</info:name>
    <info:description>Example that showcases STT on Cloud and STT on CP4D</info:description>
-   <info:version>1.0.4</info:version>
+   <info:version>1.0.5</info:version>
    <info:requiredProductVersion>4.2.1.6</info:requiredProductVersion>
  </info:identity>
  <info:dependencies>
    <info:toolkit>
      <common:name>com.ibm.streamsx.sttgateway</common:name>
-     <common:version>[2.3.1,7.0.0]</common:version>
+     <common:version>[2.3.2,7.0.0]</common:version>
    </info:toolkit>
    <info:toolkit>
      <common:name>com.ibm.streamsx.json</common:name>