diff --git a/.env.example b/.env.example index b8ad1d4..3ded9e0 100644 --- a/.env.example +++ b/.env.example @@ -2,6 +2,7 @@ ASSISTANT_TOOL=rasa # Define the assistant tool to be used. Options: rasa, anthropic STT_TOOL=whisper-local # Define the STT tool to be used. Options: whisper-local, whisper SQL_DB_FILE_NAME="freetalkbot.db" # Name of the SQLite database file to be used by the whatsapp bot +AUDIO_FORMAT=pcm16 # Audio format that the audiosocket server will use. Options: pcm16, g711 # Rasa variables. Mandatory if ASSISTANT_TOOL=rasa. # Used in rasa implementation and in golang communication channels @@ -23,5 +24,6 @@ WHISPER_LOCAL_URL=whisper_cpu:8000/v1 # Mandatory if STT_TOOL=whisper-local WHISPER__MODEL="deepdml/faster-whisper-large-v3-turbo-ct2" # The whisper model to use. Mandatory if STT_TOOL=whisper-local. # Optional variables +G711_AUDIO_CODEC=ulaw # Audio codec to be used in g711 audio format. Options: ulaw, alaw #PAIR_PHONE_NUMBER=+1234567890 # Use this variable to allow pair your whatsapp account with a pairing code #LOG_LEVEL=DEBUG # Use this variable to enable debug logs \ No newline at end of file diff --git a/README.md b/README.md index fabd886..f146b0f 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,101 @@ # freetalkbot -Implementation of communication channels to interact with LLM/NLU bot assistants. +Implementation of VoIP/WhatsApp communication channels to interact with LLM/NLU bot assistants. -* **Voice:** using [Audiosocket Asterisk](https://docs.asterisk.org/Configuration/Channel-Drivers/AudioSocket/) protocol -* **Whatsapp:** using [whatsmeow](https://github.com/tulir/whatsmeow) library. NO need of Whatsapp Business account, 100% free. +## VoIP channel + +Audiosocket server receiving a request from Asterisk. + +### Features: + +* Simulates a real conversation, but instead of a human you are talking with an assistant. +* If you don't want to hear any more of the assistant's answer you can talk back. 
The assistant's voice will be cut off and it will process what you said. +* Supports multiple calls (in theory, I haven't had the chance to test this). +* Fast answer from assistant (Speed is limited by the STT tool transcription generation and assistant answer generation times). + +### Architecture + +Refer to [architecture-Voicebot.png](docs/architecture-Voicebot.png). + +### Asterisk implementation + +The request can be implemented in two ways: + +1. Using [Audiosocket Dialplan application](https://docs.asterisk.org/Asterisk_20_Documentation/API_Documentation/Dialplan_Applications/AudioSocket/): + +```sh +[dp_entry_call_inout] +exten = 101,1,Verbose("Call to AudioSocket via Channel interface") +same = n,Answer() +same = n,AudioSocket(40325ec2-5efd-4bd3-805f-53576e581d13,:8080) +same = n,Hangup() +``` + +When using this way, the audio received from asterisk will be signed linear, 16-bit, 8kHz, mono PCM (little-endian). The envar `AUDIO_FORMAT` value must be `pcm16`. + +2. Using [Audiosocket Channel driver](https://docs.asterisk.org/Configuration/Channel-Drivers/AudioSocket/) + +```sh +[dp_entry_call_inout] +exten = 101,1,Verbose("Call to AudioSocket via Channel interface") +same = n,Answer() +same = n,Dial(AudioSocket/:8080/40325ec2-5efd-4bd3-805f-53576e581d13) +same = n,Hangup() +``` + +When using this way, the audio received from asterisk will use the codec negotiated between the phone and asterisk. By default it is g711, and the audiosocket server can process audio in this codec (both ulaw and alaw). The envar `AUDIO_FORMAT` value must be `g711` and the envar `G711_AUDIO_CODEC` must be set to either `ulaw` or `alaw`. +If you want to choose a different codec than `g711` you can, but you will have to implement the transformation of the audio data from that codec to `pcm16`. Please refer to the [g711.go](packages/audiosocket/g711.go) file. + +### STT + +There are two choices. 
+ +* OpenAI Whisper or +* Host [Faster Whisper Server](https://github.com/fedirz/faster-whisper-server). The second choice is recommended if you have GPU power. The advantage of using this server is that the audio is streamed via the websocket protocol, which guarantees more speed in transcription generation. + +### TTS + +It uses [PicoTTS](https://github.com/ihuguet/picotts). The voices used are the ones that come with PicoTTS. + +### Languages supported + +They are limited by the languages that PicoTTS supports: en-EN, en-GB, es-ES, de-DE, fr-FR, it-IT + +## WhatsApp channel + +This implementation was done using the [whatsmeow](https://pkg.go.dev/go.mau.fi/whatsmeow) library. **NO need for a WhatsApp Business account, 100% free.** + +### Features + +* Free WhatsApp server that acts like WhatsApp Web. +* Conversations with the users via text or voice messages. For voice, the user sends it, and the server returns a text answer. +* It answers in the same language as the user. All languages supported! + +### Architecture + +Refer to [architecture-Whatsapp.png](docs/architecture-Whatsapp.png). + +### Implementation + +For this channel you will need a phone with WhatsApp installed and with a number. The server will act as a WhatsApp client that will pair with your WhatsApp account. +After initializing the server you will see a QR code in the logs. Scan that QR code with the WhatsApp account that you will use. +If you can't scan the QR code you can also link the WhatsApp account using a pair code. For that you must set the envar `PAIR_PHONE_NUMBER` with your phone number using the format shown in the `.env.example`. If you don't need the pair code don't set this envar. + +Once you pair your WhatsApp account the session will be stored in a sqlite file. This file is created inside the container but mapped through a docker volume, so you can use it when you want to develop locally. If you delete this file you will have to login again using a new QR code. 
+ +### STT Tool + +When receiving an audio message it uses an STT tool to transcribe. It can be the same already mentioned in the VoIP channel. + +### Languages supported + +All languages that you want!!! + +## Assistants Integration + +Currently the channels are integrated with two LLM/NLU assistants. + +* [RASA](./assistants/rasa/README.md) +* [Anthropic](./assistants/anthropic/README.md) ## Dependencies @@ -16,7 +108,7 @@ Install go dependencies with `go mod tidy`. Run it as well if you add a new pack ### Environment variables -Check the variables in `env.example` file. Create `.env` file with `cp -a .env.example .env` and modify it with your values. +Check the variables in `env.example` file. There you will have a detailed description of each variable to setup the communications channels with the STT tool and assistant of your choice. Create `.env` file with `cp -a .env.example .env` and modify it with your values. Read carefully the file to know which variables are relevant for each component ## Run @@ -26,7 +118,7 @@ You can pull the docker image and run it with the environment variables set up. ```sh docker pull ghcr.io/felipem1210/freetalkbot/freetalkbot:latest COM_CHANNEL=audio #or whatsapp -ocker run -it --rm --env-file ./.env ghcr.io/felipem1210/freetalkbot/freetalkbot:latest freetalkbot init -c $COM_CHANNEL +docker run -it --rm --env-file ./.env ghcr.io/felipem1210/freetalkbot/freetalkbot:latest freetalkbot init -c $COM_CHANNEL ``` ## Development @@ -34,10 +126,10 @@ ocker run -it --rm --env-file ./.env ghcr.io/felipem1210/freetalkbot/freetalkbot For local development you can use docker or podman to raise up the components defined in the `docker-compose.yml` file. 
These components are: * Asterisk -* Anthropic +* Anthropic connector * Rasa assistant * Rasa Actions server -* [Whisper ASR](https://ahmetoner.com/whisper-asr-webservice/) (optional) +* Faster Whisper Server (optional) * Audio bot server * Whatsapp bot server @@ -49,9 +141,9 @@ Run `make build`. This will build locally all the images needed for components. After setting up properly the environment variables: -* Without whisper-asr: `make run` -* With whisper-local using cpu: `make run-local-whisper-cpu` -* With whisper-local using gpu: `make run-local-whisper-gpu` +* Without faster-whisper-server: `make run` +* With faster-whisper-server using cpu: `make run-local-whisper-cpu` +* With faster-whisper-server using gpu: `make run-local-whisper-gpu` ### Configure asterisk @@ -65,32 +157,6 @@ Asterisk is raised up in network_mode brige. The asterisk configuration files ar * For SIP checkout `pjsip_endpoint.conf` file in `asterisk/container-config` folder. * For IAX checkout iax.conf file in `asterisk/local-config` folder. -## Communication Channels - -You can communicate with your chatbot assistant via two channels. - -### Voice channel - -Audiosocket server implementation, receives a request from Asterisk. - -### Whatsapp channel - -Same variables than audio bot are needed, just change the make command `make run-local-whatsapp` - -After initialize you will see in the logs a QR code. Scan that QR code with the whatsapp account that you will use. -If you can't scan the QR code you can also link the whatsapp account using a pair code. For that you must set the envar `PAIR_PHONE_NUMBER` with your phone number using format show in the `.env.example`. If you don't need the pair code don't set this envar. - -Once you pair your whatsapp account the session will be stored in a sqlite file. This file is created inside the container but mapped through a docker volume, so you can use it when you want to develop locally. 
If you delete this file you will have to login again using a new QR code. - -The channel is prepared to receive text or voice messages. - -## Assistants - -Currently the channels are integrated with two LLM/NLU assistants. - -* [RASA](./rasa/README.md) -* [Anthropic](./anthropic/README.md) - # Gratitude and Thanks The following projects inspired to the construction of this one: diff --git a/docs/architecture-Voicebot.drawio.png b/docs/architecture-Voicebot.drawio.png deleted file mode 100644 index a7545c8..0000000 Binary files a/docs/architecture-Voicebot.drawio.png and /dev/null differ diff --git a/docs/architecture-Voicebot.png b/docs/architecture-Voicebot.png new file mode 100644 index 0000000..56d1fe1 Binary files /dev/null and b/docs/architecture-Voicebot.png differ diff --git a/docs/architecture-Whatsapp.png b/docs/architecture-Whatsapp.png new file mode 100644 index 0000000..4ebacad Binary files /dev/null and b/docs/architecture-Whatsapp.png differ diff --git a/docs/architecture.drawio b/docs/architecture.drawio index da12777..83e964b 100644 --- a/docs/architecture.drawio +++ b/docs/architecture.drawio @@ -1,329 +1,331 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/packages/audiosocket/g711.go b/packages/audiosocket/g711.go new file mode 100644 index 0000000..6834c56 --- /dev/null +++ b/packages/audiosocket/g711.go @@ -0,0 +1,60 @@ +package audiosocketserver + +import "math" + +// ulawToLinear decodes a byte coded in g711 u-law format to a 16-bit signed linear PCM value. +func ulawToLinear(ulaw byte) int16 { + ulaw ^= 0xFF + sign := int16(ulaw & 0x80) + exponent := int16((ulaw >> 4) & 0x07) + mantissa := int16(ulaw & 0x0F) + value := (mantissa << 4) + 0x08 + if exponent != 0 { + value += 0x100 + value <<= (exponent - 1) + } + if sign != 0 { + value = -value + } + return value +} + +// alawToLinear decodes a byte coded in G.711 A-law format to a 16-bit signed linear PCM value. +func alawToLinear(alaw byte) int16 { + const QUANT_MASK = 0x0F // Quantization field mask. + const SEG_MASK = 0x70 // Segment field mask. + const SEG_SHIFT = 4 // Left shift for segment number. + const BIAS = 0x84 // Bias for linear code. 
+ + alaw ^= 0x55 + + segment := (alaw & SEG_MASK) >> SEG_SHIFT + mantissa := alaw & QUANT_MASK + linear := int16(mantissa<<4) + BIAS + + if segment != 0 { + linear += 0x100 << (segment - 1) + } + + if alaw&0x80 != 0 { + return -linear + } + return linear +} + +// Calculate volume data for G711 audio data +func calculateVolumeG711(buffer []byte, codec string) float64 { + var sum float64 + var sample int16 + sampleCount := len(buffer) + for _, data := range buffer { + switch codec { + case "ulaw": + sample = ulawToLinear(data) + case "alaw": + sample = alawToLinear(data) + } + sum += float64(sample) * float64(sample) + } + return math.Sqrt(sum / float64(sampleCount)) +} diff --git a/packages/audiosocket/main.go b/packages/audiosocket/main.go index 1310070..5f338c6 100644 --- a/packages/audiosocket/main.go +++ b/packages/audiosocket/main.go @@ -19,9 +19,7 @@ import ( ) const ( - listenAddr = ":8080" - inputAudioFormat = "pcm16" // "g711" or "pcm16" - inputAudioCodec = "ulaw" // "ulaw" or "alaw" + listenAddr = ":8080" // slinChunkSize is the number of bytes which should be sent per Slin // audiosocket message. Larger data will be chunked into this size for @@ -30,19 +28,21 @@ const ( // This is based on 8kHz, 20ms, 16-bit signed linear. slinChunkSize = 320 // 8000Hz * 20ms * 2 bytes - silenceThreshold = 500 // Silence threshold - silenceDuration = 2 * time.Second // Minimum duration of silence - MaxCallDuration = 2 * time.Minute // MaxCallDuration is the maximum amount of time to allow a call to be up before it is terminated. + silenceDuration = 2 * time.Second // Minimum duration of silence + MaxCallDuration = 2 * time.Minute // MaxCallDuration is the maximum amount of time to allow a call to be up before it is terminated. 
) var ( - audioData []byte - id uuid.UUID - err error - ctx context.Context - cancel context.CancelFunc - language string - openaiClient common.OpenaiClient + inputAudioFormat string + g711AudioCodec string + silenceThreshold float64 + audioData []byte + id uuid.UUID + err error + ctx context.Context + cancel context.CancelFunc + language string + openaiClient common.OpenaiClient ) // ErrHangup indicates that the call should be terminated or has been terminated @@ -53,6 +53,15 @@ func InitializeServer() { if os.Getenv("STT_TOOL") == "whisper" { openaiClient = common.CreateOpenAiClient() } + + inputAudioFormat = os.Getenv("AUDIO_FORMAT") + if inputAudioFormat == "pcm16" { + silenceThreshold = 500 + } else if inputAudioFormat == "g711" { + silenceThreshold = 1000 + g711AudioCodec = os.Getenv("G711_AUDIO_CODEC") + } + slog.Info(fmt.Sprintf("listening for AudioSocket connections on %s", listenAddr)) if err = listen(ctx); err != nil { log.Fatalln("listen failure:", err) @@ -256,7 +265,7 @@ func processFromAsterisk(cancel context.CancelFunc, c net.Conn, playingAudioCh c messageData = append(messageData, m.Payload()...) var volume float64 if inputAudioFormat == "g711" { - volume = calculateVolumeG711(m.Payload(), inputAudioCodec) + volume = calculateVolumeG711(m.Payload(), g711AudioCodec) } else { volume = calculateVolumePCM16(m.Payload()) } diff --git a/packages/audiosocket/utils.go b/packages/audiosocket/utils.go index 301fd87..bb08209 100644 --- a/packages/audiosocket/utils.go +++ b/packages/audiosocket/utils.go @@ -41,63 +41,6 @@ func calculateVolumePCM16(buffer []byte) float64 { return math.Sqrt(sum / float64(len(buffer)/2)) } -// ulawToLinear decodes a byte coded in g711 u-law format to a 16-bit signed linear PCM value. 
-func ulawToLinear(ulaw byte) int16 { - ulaw ^= 0xFF - sign := int16(ulaw & 0x80) - exponent := int16((ulaw >> 4) & 0x07) - mantissa := int16(ulaw & 0x0F) - value := (mantissa << 4) + 0x08 - if exponent != 0 { - value += 0x100 - value <<= (exponent - 1) - } - if sign != 0 { - value = -value - } - return value -} - -// alawToLinear decodes a byte coded in G.711 A-law format to a 16-bit signed linear PCM value. -func alawToLinear(alaw byte) int16 { - const QUANT_MASK = 0x0F // Quantization field mask. - const SEG_MASK = 0x70 // Segment field mask. - const SEG_SHIFT = 4 // Left shift for segment number. - const BIAS = 0x84 // Bias for linear code. - - alaw ^= 0x55 - - segment := (alaw & SEG_MASK) >> SEG_SHIFT - mantissa := alaw & QUANT_MASK - linear := int16(mantissa<<4) + BIAS - - if segment != 0 { - linear += 0x100 << (segment - 1) - } - - if alaw&0x80 != 0 { - return -linear - } - return linear -} - -// Calculate volume data for G711 audio data -func calculateVolumeG711(buffer []byte, codec string) float64 { - var sum float64 - var sample int16 - sampleCount := len(buffer) - for _, data := range buffer { - switch codec { - case "ulaw": - sample = ulawToLinear(data) - case "alaw": - sample = alawToLinear(data) - } - sum += float64(sample) * float64(sample) - } - return math.Sqrt(sum / float64(sampleCount)) -} - // delete a file func deleteFile(filename string) { if err := os.Remove(filename); err != nil { diff --git a/packages/cmd/init.go b/packages/cmd/init.go index 4d538ce..5b80d01 100644 --- a/packages/cmd/init.go +++ b/packages/cmd/init.go @@ -40,6 +40,19 @@ var prCmd = &cobra.Command{ } if comChan == "audio" { + validateEnv([]string{"AUDIO_FORMAT"}) + switch os.Getenv("AUDIO_FORMAT") { + case "g711": + validateEnv([]string{"G711_AUDIO_CODEC"}) + case "pcm16": + default: + fmt.Println("Invalid value for variable AUDIO_FORMAT, valid values are g711 and pcm16") + os.Exit(1) + + } + if os.Getenv("AUDIO_FORMAT") == "g711" { + 
validateEnv([]string{"G711_AUDIO_CODEC"}) + } audiosocketserver.InitializeServer() } else if comChan == "whatsapp" { validateEnv([]string{"SQL_DB_FILE_NAME"})