diff --git a/com.ibm.streamsx.sttgateway/CHANGELOG.md b/com.ibm.streamsx.sttgateway/CHANGELOG.md index 2054e7e..f1e0edf 100644 --- a/com.ibm.streamsx.sttgateway/CHANGELOG.md +++ b/com.ibm.streamsx.sttgateway/CHANGELOG.md @@ -1,5 +1,10 @@ # Changes +## v2.3.2 +* Jan/10/2022 +* Fixed a problem where the call start date time values were not always correctly included in the STT result. +* Added these three new parameters to the WatsonSTT operator: speechDetectorSensitivity, backgroundAudioSuppression, characterInsertionBias + ## v2.3.1 * Sep/20/2021 * Dynamically change the maximum concurrent calls allowed value. diff --git a/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT.xml b/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT.xml index eb50c90..f6a0c8f 100644 --- a/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT.xml +++ b/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT.xml @@ -510,7 +510,7 @@ customizationWeight - This parameter specifies a relative weight for a custom language model as a float64 between 0.0 to 1.0 (Default is 0.0) + This parameter specifies a relative weight for a custom language model as a float64 from 0.0 to 1.0 (Default is 0.0) true true AttributeFree @@ -666,6 +666,36 @@ 1 + + speechDetectorSensitivity + This parameter specifies a float64 value from 0.0 to 1.0 to adjust the sensitivity of speech activity detection (Default is 0.5) + true + true + AttributeFree + float64 + 1 + + + + backgroundAudioSuppression + This parameter specifies a float64 value from 0.0 to 1.0 to suppress side conversations or background noise (Default is 0.0) + true + true + AttributeFree + float64 + 1 + + + + characterInsertionBias + This parameter specifies a float64 value from -0.5 to 1.0 to change how prone the STT engine is to insert more transcribed characters (Default is 0.0) + true + true + AttributeFree + float64 + 1 + + diff --git a/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT_cpp.cgt b/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT_cpp.cgt index 7519566..3550835 100644 --- a/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT_cpp.cgt +++ b/com.ibm.streamsx.sttgateway/com.ibm.streamsx.sttgateway.watson/WatsonSTT/WatsonSTT_cpp.cgt @@ -8,7 +8,7 @@ /* ============================================================ First created on: Jul/01/2018 -Last modified on: Sep/12/2021 +Last modified on: Jan/04/2022 Please refer to the sttgateway-tech-brief.txt file in the top-level directory of this toolkit to read about @@ -459,6 +459,18 @@ https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-websocke my $sttLiveMetricsUpdateNeeded = $model->getParameterByName("sttLiveMetricsUpdateNeeded"); $sttLiveMetricsUpdateNeeded = $sttLiveMetricsUpdateNeeded ? $sttLiveMetricsUpdateNeeded->getValueAt(0)->getCppExpression() : 1; + + my $speechDetectorSensitivity = $model->getParameterByName("speechDetectorSensitivity"); + # Default: 0.5 + $speechDetectorSensitivity = $speechDetectorSensitivity ? $speechDetectorSensitivity->getValueAt(0)->getCppExpression() : 0.5; + + my $backgroundAudioSuppression = $model->getParameterByName("backgroundAudioSuppression"); + # Default: 0.0 + $backgroundAudioSuppression = $backgroundAudioSuppression ? 
$backgroundAudioSuppression->getValueAt(0)->getCppExpression() : 0.0; + + my $characterInsertionBias = $model->getParameterByName("characterInsertionBias"); + # Default: 0.0 + $characterInsertionBias = $characterInsertionBias ? $characterInsertionBias->getValueAt(0)->getCppExpression() : 0.0; %> #include @@ -501,7 +513,10 @@ MY_OPERATOR::MY_OPERATOR() <%=$redactionNeeded%>, <%=$keywordsSpottingThreshold%>, <%=$keywordsToBeSpotted%>, - <%=$isTranscriptionCompletedRequested%> + <%=$isTranscriptionCompletedRequested%>, + <%=$speechDetectorSensitivity%>, + <%=$backgroundAudioSuppression%>, + <%=$characterInsertionBias%> } ) {} diff --git a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTConfig.hpp b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTConfig.hpp index 0816d27..4c48fed 100644 --- a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTConfig.hpp +++ b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTConfig.hpp @@ -47,6 +47,9 @@ struct WatsonSTTConfig { SPL::float64 keywordsSpottingThreshold; const SPL::list keywordsToBeSpotted; const bool isTranscriptionCompletedRequested; + SPL::float64 speechDetectorSensitivity; + SPL::float64 backgroundAudioSuppression; + SPL::float64 characterInsertionBias; // Some definitions //This time becomes effective, when the connectionAttemptsThreshold limit is exceeded diff --git a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImpl.hpp b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImpl.hpp index b2e473d..34324b4 100644 --- a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImpl.hpp +++ b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImpl.hpp @@ -232,6 +232,18 @@ WatsonSTTImpl::WatsonSTTImpl(OP & splOperator_,Conf config_) throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::keywordsSpottingThreshold, "keywordsSpottingThreshold")); } + if (Conf::speechDetectorSensitivity < 0.0 || Conf::speechDetectorSensitivity > 1.0) { + throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::speechDetectorSensitivity, "speechDetectorSensitivity")); + } + + if (Conf::backgroundAudioSuppression < 0.0 || Conf::backgroundAudioSuppression > 1.0) { + throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::backgroundAudioSuppression, "backgroundAudioSuppression")); + } + + if (Conf::characterInsertionBias < -0.5 || Conf::characterInsertionBias > 1.0) { + throw std::runtime_error(STTGW_INVALID_PARAM_VALUE_3("WatsonSTT", Conf::characterInsertionBias, "characterInsertionBias")); + } + // If the keywords to be spotted list is empty, then disable keywords_spotting. if (Conf::keywordsToBeSpotted.size() == 0) { Conf::keywordsSpottingThreshold = 0.0; @@ -246,9 +258,9 @@ WatsonSTTImpl::WatsonSTTImpl(OP & splOperator_,Conf config_) } // The parameters maxUtteranceAlternatives, wordAlternativesThreshold, keywordsSpottingThreshold, keywordsToBeSpotted - // are not available in sttResultMose complete + // are not available in sttResultMode complete // The COF getUtteranceNumber, isFinalizedUtterance, getConfidence, getUtteranceAlternatives - // are not available in sttResultMose complete + // are not available in sttResultMode complete // Update the operator metric. 
sttOutputResultModeMetric->setValueNoLock(Conf::sttOutputResultMode); @@ -284,6 +296,9 @@ WatsonSTTImpl::WatsonSTTImpl(OP & splOperator_,Conf config_) << "\nkeywordsSpottingThreshold = " << Conf::keywordsSpottingThreshold << "\nkeywordsToBeSpotted = " << Conf::keywordsToBeSpotted << "\nisTranscriptionCompletedRequested = " << Conf::isTranscriptionCompletedRequested + << "\nspeechDetectorSensitivity = " << Conf::speechDetectorSensitivity + << "\nbackgroundAudioSuppression = " << Conf::backgroundAudioSuppression + << "\ncharacterInsertionBias = " << Conf::characterInsertionBias << "\nconnectionState.wsState.is_lock_free() = " << Rec::wsState.is_lock_free() << "\nrecentOTuple.is_lock_free() = " << Rec::recentOTuple.is_lock_free() << "\n----------------------------------------------------------------" << std::endl; diff --git a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImplReceiver.hpp b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImplReceiver.hpp index 310bd3a..b7dabe7 100644 --- a/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImplReceiver.hpp +++ b/com.ibm.streamsx.sttgateway/impl/include/WatsonSTTImplReceiver.hpp @@ -578,6 +578,20 @@ void WatsonSTTImplReceiver::on_open(client* c, websocketpp::connection_h msg += ", \"redaction\" : true"; } + // https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection + if (speechDetectorSensitivity >= 0.0) { + msg += ", \"speech_detector_sensitivity\" : " + boost::to_string(speechDetectorSensitivity); + } + + // https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection + if (backgroundAudioSuppression >= 0.0) { + msg += ", \"background_audio_suppression\" : " + boost::to_string(backgroundAudioSuppression); + } + + if (characterInsertionBias >= -5.0) { + msg += ", \"character_insertion_bias\" : " + boost::to_string(characterInsertionBias); + } + // https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-output#keyword_spotting if (keywordsSpottingThreshold > 0.0) { msg += ", \"keywords_threshold\" : " + boost::to_string(keywordsSpottingThreshold); diff --git a/com.ibm.streamsx.sttgateway/info.xml b/com.ibm.streamsx.sttgateway/info.xml index 3e4b864..dc532b6 100644 --- a/com.ibm.streamsx.sttgateway/info.xml +++ b/com.ibm.streamsx.sttgateway/info.xml @@ -14,7 +14,7 @@ **Note:** This toolkit requires c++11 support. - 2.3.1 + 2.3.2 4.2.1.6 diff --git a/samples/VgwDataRouterToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl b/samples/VgwDataRouterToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl index 31b45d6..31f576e 100644 --- a/samples/VgwDataRouterToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl +++ b/samples/VgwDataRouterToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl @@ -1,14 +1,14 @@ /* ============================================== # Licensed Materials - Property of IBM -# Copyright IBM Corp. 2018, 2021 +# Copyright IBM Corp. 2018, 2022 ============================================== */ /* ============================================== First created on: Nov/27/2020 -Last modified on: Sep/20/2021 +Last modified on: Jan/08/2022 IMPORTANT NOTE -------------- @@ -846,6 +846,10 @@ public composite VgwDataRouterToWatsonSTT { // After getting released, such UDP channels will become available for // doing speech to text work for any new voice calls. 
mutable map _vgwSessionToCompletedUdpChannelMap = {}; + // This map tells us the call start date time string for a given vgwSessionId. + mutable map _vgwSessionToCallStartDateTime = {}; + // This map tells us the call start time in epoch seconds for a given vgwSessionId. + mutable map _vgwSessionToCallStartTimeInEpochSeconds = {}; mutable BinarySpeech_t _oTuple = {}; mutable rstring _key = ""; } @@ -994,6 +998,10 @@ public composite VgwDataRouterToWatsonSTT { if(BSD.callStartTimeInEpochSeconds == 0l) { BSD.callStartTimeInEpochSeconds = getSeconds(getTimestamp()); } + + // Insert the call start date time values in the state variables. + insertM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId, BSD.callStartDateTime); + insertM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId, BSD.callStartTimeInEpochSeconds); rstring socsFileName = dataDirectory() + "/" + BSD.vgwSessionId + "-call-started.txt"; @@ -1055,6 +1063,12 @@ public composite VgwDataRouterToWatsonSTT { (rstring)BSD.totalSpeechDataBytesReceived + ", speechEngineId=" + (rstring)BSD.speechEngineId + ", speechResultProcessorId=" + (rstring)BSD.speechResultProcessorId); + // Set the call start date time values to the tuple attributes. + if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) { + BSD.callStartDateTime = _vgwSessionToCallStartDateTime[BSD.vgwSessionId]; + BSD.callStartTimeInEpochSeconds = _vgwSessionToCallStartTimeInEpochSeconds[BSD.vgwSessionId]; + } + // Submit this tuple. submit(BSD, BSDF); } else { @@ -1209,6 +1223,12 @@ public composite VgwDataRouterToWatsonSTT { // We are done. Remove it from the map as well. removeM(_vgwSessionToCompletedUdpChannelMap, key2); } + + // Remove the call start date time values from the state variables. + if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) { + removeM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId); + removeM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId); + } // At this time, the voice call for this VGW session id has ended. // We can now write an "End of Call" indicator file in the @@ -1353,6 +1373,15 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken; (boolean)getSubmissionTimeValue("smartFormattingNeeded", "false"); expression $redactionNeeded : (boolean)getSubmissionTimeValue("redactionNeeded", "false"); + // Allowed value range for this is from 0.0 to 1.0. + expression $speechDetectorSensitivity : + (float64)getSubmissionTimeValue("speechDetectorSensitivity", "0.5"); + // Allowed value range for this is from 0.0 to 1.0. + expression $backgroundAudioSuppression : + (float64)getSubmissionTimeValue("backgroundAudioSuppression", "0.0"); + // Allowed value range for this is from -0.5 to 1.0. 
+ expression $characterInsertionBias : + (float64)getSubmissionTimeValue("characterInsertionBias", "0.0"); expression $keywordsSpottingThreshold : (float64)getSubmissionTimeValue("keywordsSpottingThreshold", "0.0"); expression> $keywordsToBeSpotted : @@ -1466,6 +1495,9 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken; wordAlternativesThreshold: $wordAlternativesThreshold; smartFormattingNeeded: $smartFormattingNeeded; redactionNeeded: $redactionNeeded; + speechDetectorSensitivity: $speechDetectorSensitivity; + backgroundAudioSuppression: $backgroundAudioSuppression; + characterInsertionBias: $characterInsertionBias; keywordsSpottingThreshold: $keywordsSpottingThreshold; keywordsToBeSpotted: $keywordsToBeSpotted; websocketLoggingNeeded: $sttWebsocketLoggingNeeded; @@ -1685,11 +1717,16 @@ public composite STTResultProcessor(input MyTranscriptionResult, BinarySpeechDat MTR.callStartTimeInEpochSeconds = _callStartTimeInEpochSeconds; } - // If the user opted for include the time of the + // If the user opted for including the time of the // utterance result reception time, let us add it to // the transcription result. if($includeUtteranceResultReceptionTime == true) { + // This is the current time expressed in ctime format. MTR.utteranceResultReceptionTime = ctime(getTimestamp()); + // We will also do the utterance reception time expressed + // in seconds elapsed since the start of the call. + MTR.utteranceRxTime = + getSeconds(getTimestamp()) - MTR.callStartTimeInEpochSeconds; } // We will write the transcription results to diff --git a/samples/VgwDataRouterToWatsonSTT/info.xml b/samples/VgwDataRouterToWatsonSTT/info.xml index fa23913..2817911 100644 --- a/samples/VgwDataRouterToWatsonSTT/info.xml +++ b/samples/VgwDataRouterToWatsonSTT/info.xml @@ -4,13 +4,13 @@ VgwDataRouterToWatsonSTT Example that showcases STT on Cloud and STT on CP4D - 1.0.4 + 1.0.5 4.2.1.6 com.ibm.streamsx.sttgateway - [2.3.1,7.0.0] + [2.3.2,7.0.0] com.ibm.streamsx.json diff --git a/samples/VgwDataRouterToWatsonSTTMini/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl b/samples/VgwDataRouterToWatsonSTTMini/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl index 913bb08..005d0af 100644 --- a/samples/VgwDataRouterToWatsonSTTMini/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl +++ b/samples/VgwDataRouterToWatsonSTTMini/com.ibm.streamsx.sttgateway.sample.watsonstt/VgwDataRouterToWatsonSTT.spl @@ -1,14 +1,14 @@ /* ============================================== # Licensed Materials - Property of IBM -# Copyright IBM Corp. 2018, 2021 +# Copyright IBM Corp. 
2018, 2022 ============================================== */ /* ============================================== First created on: Nov/27/2020 -Last modified on: Sep/20/2021 +Last modified on: Jan/08/2022 IMPORTANT NOTE -------------- @@ -342,6 +342,9 @@ type MySTTResult_t = rstring vgwSessionId, boolean isCustomerSpeechData, * @param wordAlternativesThreshold wordAlternativesThreshold * @param smartFormattingNeeded smartFormattingNeeded * @param redactionNeeded redactionNeeded + * @param speechDetectorSensitivity speechDetectorSensitivity + * @param backgroundAudioSuppression backgroundAudioSuppression + * @param characterInsertionBias characterInsertionBias * @param keywordsSpottingThreshold keywordsSpottingThreshold * @param keywordsToBeSpotted keywordsToBeSpotted * @param sttWebsocketLoggingNeeded sttWebsocketLoggingNeeded @@ -760,6 +763,10 @@ public composite VgwDataRouterToWatsonSTT { // After getting released, such UDP channels will become available for // doing speech to text work for any new voice calls. mutable map _vgwSessionToCompletedUdpChannelMap = {}; + // This map tells us the call start date time string for a given vgwSessionId. + mutable map _vgwSessionToCallStartDateTime = {}; + // This map tells us the call start time in epoch seconds for a given vgwSessionId. + mutable map _vgwSessionToCallStartTimeInEpochSeconds = {}; mutable BinarySpeech_t _oTuple = {}; mutable rstring _key = ""; } @@ -909,6 +916,10 @@ public composite VgwDataRouterToWatsonSTT { BSD.callStartTimeInEpochSeconds = getSeconds(getTimestamp()); } + // Insert the call start date time values in the state variables. + insertM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId, BSD.callStartDateTime); + insertM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId, BSD.callStartTimeInEpochSeconds); + rstring socsFileName = dataDirectory() + "/" + BSD.vgwSessionId + "-call-started.txt"; uint64 fileHandle = fopen (socsFileName, "w+", err); @@ -969,6 +980,12 @@ public composite VgwDataRouterToWatsonSTT { (rstring)BSD.totalSpeechDataBytesReceived + ", speechEngineId=" + (rstring)BSD.speechEngineId + ", speechResultProcessorId=" + (rstring)BSD.speechResultProcessorId); + // Set the call start date time values to the tuple attributes. + if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) { + BSD.callStartDateTime = _vgwSessionToCallStartDateTime[BSD.vgwSessionId]; + BSD.callStartTimeInEpochSeconds = _vgwSessionToCallStartTimeInEpochSeconds[BSD.vgwSessionId]; + } + // Submit this tuple. submit(BSD, BSDF); } else { @@ -1119,6 +1136,12 @@ public composite VgwDataRouterToWatsonSTT { removeM(_vgwSessionToCompletedUdpChannelMap, key2); } + // Remove the call start date time values from the state variables. + if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) { + removeM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId); + removeM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId); + } + // At this time, the voice call for this VGW session id has ended. // We can now write an "End of Call" indicator file in the // application's data directory. e-g: 5362954-call-completed.txt @@ -1250,6 +1273,15 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken; (boolean)getSubmissionTimeValue("smartFormattingNeeded", "false"); expression $redactionNeeded : (boolean)getSubmissionTimeValue("redactionNeeded", "false"); + // Allowed value range for this is from 0.0 to 1.0. 
+ expression $speechDetectorSensitivity : + (float64)getSubmissionTimeValue("speechDetectorSensitivity", "0.5"); + // Allowed value range for this is from 0.0 to 1.0. + expression $backgroundAudioSuppression : + (float64)getSubmissionTimeValue("backgroundAudioSuppression", "0.0"); + // Allowed value range for this is from -0.5 to 1.0. + expression $characterInsertionBias : + (float64)getSubmissionTimeValue("characterInsertionBias", "0.0"); expression $keywordsSpottingThreshold : (float64)getSubmissionTimeValue("keywordsSpottingThreshold", "0.0"); expression> $keywordsToBeSpotted : @@ -1363,6 +1395,9 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken; wordAlternativesThreshold: $wordAlternativesThreshold; smartFormattingNeeded: $smartFormattingNeeded; redactionNeeded: $redactionNeeded; + speechDetectorSensitivity: $speechDetectorSensitivity; + backgroundAudioSuppression: $backgroundAudioSuppression; + characterInsertionBias: $characterInsertionBias; keywordsSpottingThreshold: $keywordsSpottingThreshold; keywordsToBeSpotted: $keywordsToBeSpotted; websocketLoggingNeeded: $sttWebsocketLoggingNeeded; @@ -1505,7 +1540,7 @@ public composite STTResultProcessor(input MyTranscriptionResult) { MTR.callStartTimeInEpochSeconds = _callStartTimeInEpochSeconds; } - // If the user opted for include the time of the + // If the user opted for including the time of the // utterance result reception time, let us add it to // the transcription result. if($includeUtteranceResultReceptionTime == true) { @@ -1535,8 +1570,8 @@ public composite STTResultProcessor(input MyTranscriptionResult) { // replace this operator with an Export operator. // But, exporting from multiple speech processors may not be // the right approach. A preferred and better approach would be - // to use a WebSocketSink operator to send the utterances to be - // received on the other end by a WebSocketSendReceive operator. + // to use a WebSocketSendReceive operator to send the utterances to be + // received on the other end by a WebSocketSource operator. // That approach will scale well without putting more stress on the // Streams instance since the data transfer is done outside the // confines of the Streams runtime. If more details are needed about diff --git a/samples/VgwDataRouterToWatsonSTTMini/info.xml b/samples/VgwDataRouterToWatsonSTTMini/info.xml index 85bbe24..9556d23 100644 --- a/samples/VgwDataRouterToWatsonSTTMini/info.xml +++ b/samples/VgwDataRouterToWatsonSTTMini/info.xml @@ -3,13 +3,13 @@ VgwDataRouterToWatsonSTTMini Example that showcases STT on Cloud and STT on CP4D - 1.0.4 + 1.0.5 4.2.1.6 com.ibm.streamsx.sttgateway - [2.3.1,7.0.0] + [2.3.2,7.0.0] com.ibm.streamsx.websocket diff --git a/samples/VoiceGatewayToStreamsToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VoiceGatewayToStreamsToWatsonSTT.spl b/samples/VoiceGatewayToStreamsToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VoiceGatewayToStreamsToWatsonSTT.spl index 8c7b74b..48306d9 100644 --- a/samples/VoiceGatewayToStreamsToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VoiceGatewayToStreamsToWatsonSTT.spl +++ b/samples/VoiceGatewayToStreamsToWatsonSTT/com.ibm.streamsx.sttgateway.sample.watsonstt/VoiceGatewayToStreamsToWatsonSTT.spl @@ -1,14 +1,14 @@ /* ============================================== # Licensed Materials - Property of IBM -# Copyright IBM Corp. 2018, 2021 +# Copyright IBM Corp. 
2018, 2022 ============================================== */ /* ============================================== First created on: Sep/25/2019 -Last modified on: Sep/20/2021 +Last modified on: Jan/08/2022 A) What does this example application do? -------------------------------------- @@ -310,6 +310,9 @@ type MySTTResult_t = rstring vgwSessionId, boolean isCustomerSpeechData, * @param wordAlternativesThreshold wordAlternativesThreshold * @param smartFormattingNeeded smartFormattingNeeded * @param redactionNeeded redactionNeeded + * @param speechDetectorSensitivity speechDetectorSensitivity + * @param backgroundAudioSuppression backgroundAudioSuppression + * @param characterInsertionBias characterInsertionBias * @param keywordsSpottingThreshold keywordsSpottingThreshold * @param keywordsToBeSpotted keywordsToBeSpotted * @param sttWebsocketLoggingNeeded sttWebsocketLoggingNeeded @@ -690,6 +693,10 @@ public composite VoiceGatewayToStreamsToWatsonSTT { // After getting released, such UDP channels will become available for // doing speech to text work for any new voice calls. mutable map _vgwSessionToCompletedUdpChannelMap = {}; + // This map tells us the call start date time string for a given vgwSessionId. + mutable map _vgwSessionToCallStartDateTime = {}; + // This map tells us the call start time in epoch seconds for a given vgwSessionId. + mutable map _vgwSessionToCallStartTimeInEpochSeconds = {}; mutable BinarySpeech_t _oTuple = {}; mutable rstring _key = ""; } @@ -839,6 +846,10 @@ public composite VoiceGatewayToStreamsToWatsonSTT { BSD.callStartTimeInEpochSeconds = getSeconds(getTimestamp()); } + // Insert the call start date time values in the state variables. + insertM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId, BSD.callStartDateTime); + insertM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId, BSD.callStartTimeInEpochSeconds); + rstring socsFileName = dataDirectory() + "/" + BSD.vgwSessionId + "-call-started.txt"; uint64 fileHandle = fopen (socsFileName, "w+", err); @@ -895,6 +906,12 @@ public composite VoiceGatewayToStreamsToWatsonSTT { (rstring)BSD.totalSpeechDataBytesReceived + ", sttEngineId=" + (rstring)BSD.sttEngineId + ", sttResultProcessorId=" + (rstring)BSD.sttResultProcessorId); + // Set the call start date time values to the tuple attributes. + if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) { + BSD.callStartDateTime = _vgwSessionToCallStartDateTime[BSD.vgwSessionId]; + BSD.callStartTimeInEpochSeconds = _vgwSessionToCallStartTimeInEpochSeconds[BSD.vgwSessionId]; + } + // Submit this tuple. submit(BSD, BSDF); } else { @@ -1048,6 +1065,12 @@ public composite VoiceGatewayToStreamsToWatsonSTT { // We are done. Remove it from the map as well. removeM(_vgwSessionToCompletedUdpChannelMap, key2); } + + // Remove the call start date time values from the state variables. + if(has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId) == true) { + removeM(_vgwSessionToCallStartDateTime, BSD.vgwSessionId); + removeM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId); + } // At this time, the voice call for this VGW session id has ended. // We can now write an "End of Call" indicator file in the @@ -1193,6 +1216,15 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken; (boolean)getSubmissionTimeValue("smartFormattingNeeded", "false"); expression $redactionNeeded : (boolean)getSubmissionTimeValue("redactionNeeded", "false"); + // Allowed value range for this is from 0.0 to 1.0. 
+ expression $speechDetectorSensitivity : + (float64)getSubmissionTimeValue("speechDetectorSensitivity", "0.5"); + // Allowed value range for this is from 0.0 to 1.0. + expression $backgroundAudioSuppression : + (float64)getSubmissionTimeValue("backgroundAudioSuppression", "0.0"); + // Allowed value range for this is from -0.5 to 1.0. + expression $characterInsertionBias : + (float64)getSubmissionTimeValue("characterInsertionBias", "0.0"); expression $keywordsSpottingThreshold : (float64)getSubmissionTimeValue("keywordsSpottingThreshold", "0.0"); expression> $keywordsToBeSpotted : @@ -1306,6 +1338,9 @@ public composite IBMWatsonSpeechToText(input AudioBlobContent, AccessToken; wordAlternativesThreshold: $wordAlternativesThreshold; smartFormattingNeeded: $smartFormattingNeeded; redactionNeeded: $redactionNeeded; + speechDetectorSensitivity: $speechDetectorSensitivity; + backgroundAudioSuppression: $backgroundAudioSuppression; + characterInsertionBias: $characterInsertionBias; keywordsSpottingThreshold: $keywordsSpottingThreshold; keywordsToBeSpotted: $keywordsToBeSpotted; websocketLoggingNeeded: $sttWebsocketLoggingNeeded; @@ -1525,10 +1560,11 @@ public composite STTResultProcessor(input MyTranscriptionResult, BinarySpeechDat MTR.callStartTimeInEpochSeconds = _callStartTimeInEpochSeconds; } - // If the user opted for include the time of the + // If the user opted for including the time of the // utterance result reception time, let us add it to // the transcription result. if($includeUtteranceResultReceptionTime == true) { + // This is the current time expressed in ctime format. MTR.utteranceResultReceptionTime = ctime(getTimestamp()); // We will also do the utterance reception time expressed // in seconds elapsed since the start of the call. diff --git a/samples/VoiceGatewayToStreamsToWatsonSTT/info.xml b/samples/VoiceGatewayToStreamsToWatsonSTT/info.xml index 9f531c2..15597c9 100644 --- a/samples/VoiceGatewayToStreamsToWatsonSTT/info.xml +++ b/samples/VoiceGatewayToStreamsToWatsonSTT/info.xml @@ -3,13 +3,13 @@ VoiceGatewayToStreamsToWatsonSTT Example that showcases STT on Cloud and STT on CP4D - 1.0.7 + 1.0.8 4.2.1.6 com.ibm.streamsx.sttgateway - [2.3.1,7.0.0) + [2.3.2,7.0.0) com.ibm.streamsx.json diff --git a/sttgateway-tech-brief.txt b/sttgateway-tech-brief.txt index fd14e28..6320aa5 100644 --- a/sttgateway-tech-brief.txt +++ b/sttgateway-tech-brief.txt @@ -1,6 +1,6 @@ ============================================================ First created on: July/01/2018 -Last modified on: September/20/2021 +Last modified on: January/10/2022 Purpose of this toolkit -----------------------
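------------------------------------------------------------
Usage note: the three tuning parameters introduced in v2.3.2 (speechDetectorSensitivity, backgroundAudioSuppression, characterInsertionBias) are optional WatsonSTT parameters that map to the speech_detector_sensitivity, background_audio_suppression and character_insertion_bias fields of the WebSocket start message. The bundled samples surface them as submission-time values. Below is a minimal SPL sketch of just that fragment; every other WatsonSTT parameter, port and output assignment stays exactly as shown in the samples above.

    // Submission-time values, using the operator defaults.
    expression<float64> $speechDetectorSensitivity :
        (float64)getSubmissionTimeValue("speechDetectorSensitivity", "0.5");   // allowed range 0.0 to 1.0
    expression<float64> $backgroundAudioSuppression :
        (float64)getSubmissionTimeValue("backgroundAudioSuppression", "0.0");  // allowed range 0.0 to 1.0
    expression<float64> $characterInsertionBias :
        (float64)getSubmissionTimeValue("characterInsertionBias", "0.0");      // allowed range -0.5 to 1.0

    // Inside the WatsonSTT invocation's param clause:
    speechDetectorSensitivity:  $speechDetectorSensitivity;
    backgroundAudioSuppression: $backgroundAudioSuppression;
    characterInsertionBias:     $characterInsertionBias;

Values outside these ranges cause the operator to raise a runtime error at initialization, per the range checks added in WatsonSTTImpl.hpp.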
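The call start date/time fix in the samples keeps two per-session maps inside the Custom operator that routes speech data, so that every speech fragment (and therefore every STT result) carries the values captured when the call started. The type parameters of those map declarations did not survive in this copy of the patch; the sketch below assumes rstring keys, an rstring date-time string and an int64 epoch-seconds value, and condenses the insert/stamp/remove life cycle shown above.

    // Assumed state declarations (the value types are an assumption; the
    // patch only shows the variable names and their comments).
    mutable map<rstring, rstring> _vgwSessionToCallStartDateTime = {};
    mutable map<rstring, int64>   _vgwSessionToCallStartTimeInEpochSeconds = {};

    // On the call-start tuple: remember the start values keyed by vgwSessionId.
    insertM(_vgwSessionToCallStartDateTime,           BSD.vgwSessionId, BSD.callStartDateTime);
    insertM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId, BSD.callStartTimeInEpochSeconds);

    // On every speech fragment: stamp the outgoing tuple before submitting it.
    if (has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId)) {
        BSD.callStartDateTime           = _vgwSessionToCallStartDateTime[BSD.vgwSessionId];
        BSD.callStartTimeInEpochSeconds = _vgwSessionToCallStartTimeInEpochSeconds[BSD.vgwSessionId];
    }

    // On the call-end tuple: drop both entries so the maps stay bounded.
    if (has(_vgwSessionToCallStartDateTime, BSD.vgwSessionId)) {
        removeM(_vgwSessionToCallStartDateTime,           BSD.vgwSessionId);
        removeM(_vgwSessionToCallStartTimeInEpochSeconds, BSD.vgwSessionId);
    }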