From 00e407954950ea761f6c2062253f26145048b8d2 Mon Sep 17 00:00:00 2001 From: donglianghao Date: Tue, 8 Oct 2019 18:03:18 +0800 Subject: [PATCH 1/2] algorithm process on mini board and modify for new sr libs --- .gitlab-ci.yml | 1 + .gitmodules | 3 + components/audio_stream/algorithm_stream.c | 602 ++++++++++++++++++ .../audio_stream/include/algorithm_stream.h | 150 +++++ components/esp-adf-libs | 2 +- components/esp-sr | 1 + docs/Doxyfile | 6 +- .../{esp_sr_iface.rst => esp_wn_iface.rst} | 25 +- .../speech-recognition/index.rst | 4 +- .../speech-recognition/wakeup-word-libs.rst | 72 --- .../speech-recognition/esp_sr_iface.rst | 1 - .../speech-recognition/esp_wn_iface.rst | 1 + .../speech-recognition/wakeup-word-libs.rst | 1 - .../main/example_record_and_play.c | 18 +- examples/dueros/main/dueros_app.c | 26 +- .../asr/main/example_asr_main.c | 57 +- 16 files changed, 843 insertions(+), 127 deletions(-) create mode 100644 components/audio_stream/algorithm_stream.c create mode 100644 components/audio_stream/include/algorithm_stream.h create mode 160000 components/esp-sr rename docs/en/api-reference/speech-recognition/{esp_sr_iface.rst => esp_wn_iface.rst} (75%) delete mode 100644 docs/en/api-reference/speech-recognition/wakeup-word-libs.rst delete mode 100644 docs/zh_CN/api-reference/speech-recognition/esp_sr_iface.rst create mode 100644 docs/zh_CN/api-reference/speech-recognition/esp_wn_iface.rst delete mode 100644 docs/zh_CN/api-reference/speech-recognition/wakeup-word-libs.rst diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2bfcb4aac..7430d2c2d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -35,6 +35,7 @@ before_script: - sed -i "s%https://github.com/espressif/esp-idf%${GITLAB_SSH_SERVER}/idf/esp-idf.git%" .gitmodules # replace submodule esp-adf-libs to internal repository to speedup cloning - sed -i "s%https://github.com/espressif/esp-adf-libs%${GITLAB_SSH_SERVER}/adf/esp-adf-libs.git%" .gitmodules + - sed -i 
"s%https://github.com/espressif/esp-sr.git%${GITLAB_SSH_SERVER}/speech-recognition-internal/esp_sr_public.git%" .gitmodules - git submodule update --init # (the same regular expressions are used to set these are used in 'only:' sections below - source esp-idf/tools/ci/configure_ci_environment.sh diff --git a/.gitmodules b/.gitmodules index 9a69a3110..2093ecbbc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "components/esp-adf-libs"] path = components/esp-adf-libs url = https://github.com/espressif/esp-adf-libs +[submodule "components/esp-sr"] + path = components/esp-sr + url = https://github.com/espressif/esp-sr.git diff --git a/components/audio_stream/algorithm_stream.c b/components/audio_stream/algorithm_stream.c new file mode 100644 index 000000000..09e773534 --- /dev/null +++ b/components/audio_stream/algorithm_stream.c @@ -0,0 +1,602 @@ +/* + * ESPRESSIF MIT License + * + * Copyright (c) 2019 + * + * Permission is hereby granted for use on all ESPRESSIF SYSTEMS products, in which case, + * it is free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include +#include "esp_log.h" +#include "audio_pipeline.h" +#include "audio_element.h" +#include "audio_mem.h" +#include "audio_error.h" +#include "esp_aec.h" +#include "esp_agc.h" +#include "esp_ns.h" +#include "esp_resample.h" +#include "algorithm_stream.h" + +#define NS_FRAME_BYTES 320 // 10ms data frame (10 * 16 * 2) +#define AGC_FRAME_BYTES 320 // 10ms data frame (10 * 16 * 2) +#define AEC_FRAME_BYTES 512 // 16ms data frame (16 * 16 * 2) + +#define ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ 16000 //Hz +#define ALGORITHM_STREAM_DEFAULT_SAMPLE_BIT 16 +#define ALGORITHM_STREAM_DEFAULT_CHANNEL 1 + +#define ALGORITHM_STREAM_DEFAULT_AGC_MODE 3 +#define ALGORITHM_STREAM_DEFAULT_AGC_FRAME_LENGTH 10 //ms + +#define ALGORITHM_STREAM_RESAMPE_DEFAULT_COMPLEXITY 0 +#define ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE (AEC_FRAME_BYTES * 8) + +#define ALGORITHM_STREAM_FULL_FUCTION_MASK (ALGORITHM_STREAM_USE_AEC | ALGORITHM_STREAM_USE_AGC | ALGORITHM_STREAM_USE_NS) + +static const char *TAG = "ALGORITHM_STREAM"; + +typedef struct { + void *rsp_handle; + unsigned char *rsp_in; + unsigned char *rsp_out; + int in_offset; + int ap_factor; + int16_t *aec_buff; + resample_info_t rsp_info; + ringbuf_handle_t input_rb; + bool data_need_be_resampled; + bool data_need_be_divided_after_rsp; //The encode mode of resample function doesn't support change channels +} algorithm_data_info_t; + +typedef struct { + void *ns_handle; + void *aec_handle; + void *agc_handle; + char *scale_buff; + int16_t *aec_buff; + int16_t *ns_buff; + int16_t *agc_buff; + int8_t algo_mask; + algorithm_data_info_t record; + algorithm_data_info_t reference; + algorithm_stream_input_type_t input_type; +} algo_stream_t; + +static 
esp_err_t algorithm_data_info_destroy(algorithm_data_info_t *data_info) +{ + static void *rsp_handle; + if (rsp_handle == data_info->rsp_handle) { // Avoid the rsp handle be destroyed twice. + return ESP_OK; + } + if (data_info->rsp_handle) { + rsp_handle = data_info->rsp_handle; + esp_resample_destroy(data_info->rsp_handle); + data_info->rsp_handle = NULL; + } + if (data_info->aec_buff) { + free(data_info->aec_buff); + data_info->aec_buff = NULL; + } + return ESP_OK; +} + +static esp_err_t _algo_close(audio_element_handle_t self) +{ + algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self); + algorithm_data_info_t *record = &algo->record; + algorithm_data_info_t *reference = &algo->reference; + + if (algo->ns_handle) { + ns_destroy(algo->ns_handle); + algo->ns_handle = NULL; + } + if (algo->aec_handle) { + aec_destroy(algo->aec_handle); + algo->aec_handle = NULL; + } + if (algo->agc_handle) { + esp_agc_clse(algo->agc_handle); + algo->agc_handle = NULL; + } + if (algo->ns_buff) { + free(algo->ns_buff); + algo->ns_buff = NULL; + } + if (algo->aec_buff) { + free(algo->aec_buff); + algo->aec_buff = NULL; + } + if (algo->agc_buff) { + free(algo->agc_buff); + algo->agc_buff = NULL; + } + if (algo->scale_buff) { + free(algo->scale_buff); + algo->scale_buff = NULL; + } + + algorithm_data_info_destroy(record); + algorithm_data_info_destroy(reference); + + return ESP_OK; +} + +static esp_err_t _algo_open(audio_element_handle_t self) +{ + algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self); + bool _success = true; + + if (algo->algo_mask & ALGORITHM_STREAM_USE_AEC) { + _success &= ((algo->aec_handle = aec_create(ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ, AEC_FRAME_LENGTH_MS, AEC_FILTER_LENGTH)) != NULL); + } + if (algo->algo_mask & ALGORITHM_STREAM_USE_AGC) { + _success &= ((algo->agc_handle = esp_agc_open(ALGORITHM_STREAM_DEFAULT_AGC_MODE, ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ)) != NULL); + } + if (algo->algo_mask & ALGORITHM_STREAM_USE_NS) { + 
_success &= ((algo->ns_handle = ns_create(ALGORITHM_STREAM_DEFAULT_AGC_FRAME_LENGTH)) != NULL);
+    }
+    AUDIO_NULL_CHECK(TAG, _success, {
+        _algo_close(self);
+        return ESP_FAIL;
+    });
+    return ESP_OK;
+}
+
+static int algorithm_process(audio_element_handle_t self)
+{
+    algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self);
+    static int copy_cnt, cur_pos;
+    algorithm_data_info_t *record = &algo->record;
+    algorithm_data_info_t *reference = &algo->reference;
+
+    if (algo->algo_mask & ALGORITHM_STREAM_USE_AEC) {
+        aec_process(algo->aec_handle, record->aec_buff, reference->aec_buff, algo->aec_buff);
+        memcpy(algo->scale_buff + cur_pos, algo->aec_buff, AEC_FRAME_BYTES);
+        cur_pos += AEC_FRAME_BYTES;
+        copy_cnt = cur_pos / AGC_FRAME_BYTES;
+
+        for (int i = 0; i < copy_cnt; i++) {
+            if ((algo->algo_mask & ALGORITHM_STREAM_USE_NS) && (algo->algo_mask & ALGORITHM_STREAM_USE_AGC)) {
+                ns_process(algo->ns_handle, (int16_t *)algo->scale_buff + i * (NS_FRAME_BYTES >> 1), algo->ns_buff);
+                esp_agc_process(algo->agc_handle, algo->ns_buff, algo->agc_buff, (AGC_FRAME_BYTES >> 1), ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ);
+                audio_element_output(self, (char *)algo->agc_buff, AGC_FRAME_BYTES);
+            } else if (algo->algo_mask & ALGORITHM_STREAM_USE_NS) {
+                ns_process(algo->ns_handle, (int16_t *)algo->scale_buff + i * (NS_FRAME_BYTES >> 1), algo->ns_buff);
+                audio_element_output(self, (char *)algo->ns_buff, NS_FRAME_BYTES);
+            } else if (algo->algo_mask & ALGORITHM_STREAM_USE_AGC) {
+                esp_agc_process(algo->agc_handle, (int16_t *)algo->scale_buff + i * (AGC_FRAME_BYTES >> 1), algo->agc_buff, (AGC_FRAME_BYTES >> 1), ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ);
+                audio_element_output(self, (char *)algo->agc_buff, AGC_FRAME_BYTES); // Output the AGC result, not ns_buff (NULL when NS is disabled)
+            } else {
+                audio_element_output(self, (char *)algo->aec_buff, AEC_FRAME_BYTES);
+                cur_pos -= AEC_FRAME_BYTES;
+                return AEC_FRAME_BYTES;
+            }
+        }
+        memcpy(algo->scale_buff, algo->scale_buff + AGC_FRAME_BYTES * copy_cnt, cur_pos - AGC_FRAME_BYTES * copy_cnt);
+        
cur_pos -= AGC_FRAME_BYTES * copy_cnt; + return AGC_FRAME_BYTES * copy_cnt; + } else { + if((algo->algo_mask & ALGORITHM_STREAM_USE_AGC) && (algo->algo_mask & ALGORITHM_STREAM_USE_NS)) { + ns_process(algo->ns_handle, (int16_t *)algo->scale_buff, algo->ns_buff); + esp_agc_process(algo->agc_handle, algo->ns_buff, algo->agc_buff, (AGC_FRAME_BYTES >> 1), ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ); + audio_element_output(self, (char *)algo->agc_buff, AGC_FRAME_BYTES); + return AGC_FRAME_BYTES; + } else if (algo->algo_mask & ALGORITHM_STREAM_USE_NS) { + ns_process(algo->ns_handle, (int16_t *)algo->scale_buff, algo->ns_buff); + audio_element_output(self, (char *)algo->ns_buff, NS_FRAME_BYTES); + return NS_FRAME_BYTES; + } else if (algo->algo_mask & ALGORITHM_STREAM_USE_AGC) { + esp_agc_process(algo->agc_handle, (int16_t *)algo->scale_buff, algo->agc_buff, (AGC_FRAME_BYTES >> 1), ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ); + audio_element_output(self, (char *)algo->agc_buff, AGC_FRAME_BYTES); + return AGC_FRAME_BYTES; + } else { + return AEL_IO_FAIL; + } + } +} + +static esp_err_t algorithm_data_divided(int16_t *raw_buff, int len, int16_t *left_channel, int linear_lfac, int16_t *right_channel, int linear_rfac) +{ + // To improve efficiency, data splitting and linear amplification are integrated into one function + for (int i = 0; i < len / 4; i++) { + if (left_channel) { + left_channel[i] = raw_buff[i << 1] * linear_lfac; + } + if (right_channel) { + right_channel[i] = raw_buff[(i << 1) + 1] * linear_rfac; + } + } + return ESP_OK; +} + +static esp_err_t algorithm_data_linear_amplication(int16_t *raw_buff, int len, int linear_factor) +{ + for (int i = 0; i < len / 2; i++) { + raw_buff[i] *= linear_factor; + } + return ESP_OK; +} + +static int algorithm_data_process_for_type1(audio_element_handle_t self) +{ + algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self); + algorithm_data_info_t *record = &algo->record; + algorithm_data_info_t *reference = &algo->reference; 
+ int in_ret = 0, bytes_consume = 0, out_len_bytes = record->rsp_info.out_len_bytes; + char tmp_buffer[2 * AEC_FRAME_BYTES] = {0}; + + if (record->data_need_be_resampled) { // When use input type1, use record or reference handle is the same. + if (record->in_offset > 0) { + memmove(record->rsp_in, &record->rsp_in[ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE - record->in_offset], record->in_offset); + } + in_ret = audio_element_input(self, (char *)&record->rsp_in[record->in_offset], ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE - record->in_offset); + if (in_ret <= 0) { + return in_ret; + } else { + record->in_offset += in_ret; + bytes_consume = esp_resample_run(record->rsp_handle, (void *)&record->rsp_info, + record->rsp_in, record->rsp_out, + record->in_offset, &out_len_bytes); + ESP_LOGD(TAG, "in_ret = %d, out_len_bytes = %d, bytes_consume = %d", in_ret, out_len_bytes, bytes_consume); + + if ((bytes_consume > 0) && (out_len_bytes == record->rsp_info.out_len_bytes)) { + record->in_offset -= bytes_consume; + algorithm_data_divided((int16_t *)record->rsp_out, out_len_bytes, reference->aec_buff, reference->ap_factor, record->aec_buff, record->ap_factor); + return algorithm_process(self); + } else { + ESP_LOGE(TAG, "Fail to resample"); + return AEL_IO_FAIL; + } + } + } else { + in_ret = audio_element_input(self, tmp_buffer, AEC_FRAME_BYTES * 2); + if (in_ret <= 0) { + return in_ret; + } else { + algorithm_data_divided((int16_t *)tmp_buffer, AEC_FRAME_BYTES * 2, reference->aec_buff, reference->ap_factor, record->aec_buff, record->ap_factor); + return algorithm_process(self); + } + } +} + +static int algorithm_data_pre_process_for_type2(audio_element_handle_t self, algorithm_data_info_t *data_info, int input_buffer_index) +{ + algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self); + int in_ret = 0, bytes_consume = 0, out_len_bytes = data_info->rsp_info.out_len_bytes, basic_frame_size; + char tmp_buffer[2 * AEC_FRAME_BYTES] = {0}; + int16_t 
*input_buffer; + + if (algo->algo_mask & ALGORITHM_STREAM_USE_AEC) { + input_buffer = data_info->aec_buff; + basic_frame_size = AEC_FRAME_BYTES; + } else { + basic_frame_size = AGC_FRAME_BYTES; + input_buffer = (int16_t *)algo->scale_buff; + } + + if (data_info->data_need_be_resampled) { + if (data_info->in_offset > 0) { + memmove(data_info->rsp_in, &data_info->rsp_in[ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE - data_info->in_offset], data_info->in_offset); + } + if (input_buffer_index < 0) { + in_ret = audio_element_input(self, (char *)&data_info->rsp_in[data_info->in_offset], ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE - data_info->in_offset); + } else { + in_ret = audio_element_multi_input(self, (char *)&data_info->rsp_in[data_info->in_offset], ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE - data_info->in_offset, input_buffer_index, portMAX_DELAY); + } + if (in_ret <= 0) { + return in_ret; + } else { + data_info->in_offset += in_ret; + bytes_consume = esp_resample_run(data_info->rsp_handle, (void *)&data_info->rsp_info, + data_info->rsp_in, data_info->rsp_out, + data_info->in_offset, &out_len_bytes); + + ESP_LOGD(TAG, "in_ret = %d, out_len_bytes = %d, bytes_consume = %d", in_ret, out_len_bytes, bytes_consume); + if ((bytes_consume > 0) && (out_len_bytes == data_info->rsp_info.out_len_bytes)) { + data_info->in_offset -= bytes_consume; + if (data_info->data_need_be_divided_after_rsp) { + algorithm_data_divided((int16_t *)data_info->rsp_out, out_len_bytes, input_buffer, data_info->ap_factor, NULL, 0); + } else { + memcpy(input_buffer, data_info->rsp_out, out_len_bytes); + algorithm_data_linear_amplication(input_buffer, out_len_bytes, data_info->ap_factor); + } + } else { + ESP_LOGE(TAG, "Fail to resample"); + return AEL_IO_FAIL; + } + } + } else { + if (data_info->data_need_be_divided_after_rsp) { + if (input_buffer_index < 0) { + in_ret = audio_element_input(self, tmp_buffer, basic_frame_size * 2); + } else { + in_ret = audio_element_multi_input(self, 
tmp_buffer, basic_frame_size * 2, input_buffer_index, portMAX_DELAY); + } + if (in_ret <= 0) { + return in_ret; + } else { + algorithm_data_divided((int16_t *)tmp_buffer, basic_frame_size * 2, input_buffer, data_info->ap_factor, NULL, 0); + } + } else { + if (input_buffer_index < 0) { + in_ret = audio_element_input(self, (char *)input_buffer, basic_frame_size); + } else { + in_ret = audio_element_multi_input(self, (char *)input_buffer, basic_frame_size, input_buffer_index, portMAX_DELAY); + } + if (in_ret <= 0) { + return in_ret; + } else { + algorithm_data_linear_amplication(input_buffer, basic_frame_size, data_info->ap_factor); + } + } + } + return basic_frame_size; +} + +static int algorithm_data_process_for_type2(audio_element_handle_t self) +{ + int ret = 0; + algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self); + algorithm_data_info_t *record = &algo->record; + algorithm_data_info_t *reference = &algo->reference; + + if (algo->algo_mask & ALGORITHM_STREAM_USE_AEC) { + ret |= algorithm_data_pre_process_for_type2(self, reference, 0); + } + ret |= algorithm_data_pre_process_for_type2(self, record, -1); + if (ret <= 0) { + return ret; + } + return algorithm_process(self); +} + +static audio_element_err_t _algo_process(audio_element_handle_t self, char *in_buffer, int in_len) +{ + int ret = ESP_OK; + algo_stream_t *algo = (algo_stream_t *)audio_element_getdata(self); + if (algo->input_type == ALGORITHM_STREAM_INPUT_TYPE1) { + ret = algorithm_data_process_for_type1(self); + } else if (algo->input_type == ALGORITHM_STREAM_INPUT_TYPE2) { + ret = algorithm_data_process_for_type2(self); + } else { + ESP_LOGE(TAG, "Type %d is not supported", algo->input_type); + return AEL_IO_FAIL; + } + return ret; +} + +static esp_err_t algorithm_resample_config(algorithm_data_info_t *data_info, algorithm_stream_input_type_t type, int src_fre, int src_ch) +{ + if (type == ALGORITHM_STREAM_INPUT_TYPE1) { + data_info->data_need_be_divided_after_rsp = false; + if (src_fre 
!= ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ) {
+            data_info->data_need_be_resampled = true;
+        } else {
+            data_info->data_need_be_resampled = false;
+        }
+        return ESP_OK;
+    }
+
+    if ((src_fre == ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ) && (src_ch == ALGORITHM_STREAM_DEFAULT_CHANNEL)) {
+        data_info->data_need_be_resampled = false;
+        data_info->data_need_be_divided_after_rsp = false;
+    } else if (src_fre == ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ) {
+        data_info->data_need_be_resampled = false;
+        data_info->data_need_be_divided_after_rsp = true;
+    } else {
+        data_info->data_need_be_resampled = true;
+        if (src_ch == 2) {
+            data_info->data_need_be_divided_after_rsp = true;
+        } else if (src_ch == 1) {
+            data_info->data_need_be_divided_after_rsp = false;
+        } else {
+            ESP_LOGE(TAG, "The channel number should be 1 or 2");
+            return ESP_FAIL;
+        }
+    }
+    return ESP_OK;
+}
+
+static esp_err_t create_rsp_handle(algo_stream_t *algo, algorithm_stream_cfg_t *cfg)
+{
+    AUDIO_NULL_CHECK(TAG, algo, return ESP_FAIL);
+    AUDIO_NULL_CHECK(TAG, cfg, return ESP_FAIL);
+
+    algorithm_data_info_t *record = &algo->record;
+    algorithm_data_info_t *reference = &algo->reference;
+
+    resample_info_t rsp_cfg = {
+        .dest_rate = ALGORITHM_STREAM_DEFAULT_SAMPLE_RATE_HZ,
+        .dest_ch = ALGORITHM_STREAM_DEFAULT_CHANNEL, // The encode resample cannot process different channel
+        .mode = RESAMPLE_ENCODE_MODE,
+        .sample_bits = ALGORITHM_STREAM_DEFAULT_SAMPLE_BIT,
+        .max_indata_bytes = ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE, // The max input data may be 48K 2ch --> 16k 1ch, so max_data = AEC_FRAME_BYTES * 6
+        .complexity = ALGORITHM_STREAM_RESAMPE_DEFAULT_COMPLEXITY,
+        .type = ESP_RESAMPLE_TYPE_AUTO
+    };
+
+    algorithm_resample_config(record, cfg->input_type, cfg->rec_sample_rate, cfg->rec_ch);
+    algorithm_resample_config(reference, cfg->input_type, cfg->ref_sample_rate, cfg->ref_ch);
+
+    if (algo->input_type == ALGORITHM_STREAM_INPUT_TYPE1) {
+        if (record->data_need_be_resampled) {
+            rsp_cfg.src_rate = 
cfg->rec_sample_rate; + rsp_cfg.src_ch = 2; + rsp_cfg.dest_ch = 2; + rsp_cfg.out_len_bytes = AEC_FRAME_BYTES * 2; + memcpy(&record->rsp_info, &rsp_cfg, sizeof(resample_info_t)); + record->rsp_handle = esp_resample_create(&rsp_cfg, &record->rsp_in, &record->rsp_out); + AUDIO_NULL_CHECK(TAG, record->rsp_handle, { + ESP_LOGE(TAG, "Fail to create resample handle"); + return ESP_FAIL; + }); + return ESP_OK; + } else { + return ESP_OK; + } + } + + if (record->data_need_be_resampled) { + rsp_cfg.src_rate = cfg->rec_sample_rate; + rsp_cfg.dest_ch = cfg->rec_ch; + rsp_cfg.src_ch = cfg->rec_ch; + if (record->data_need_be_divided_after_rsp) { + if (cfg->algo_mask & ALGORITHM_STREAM_USE_AEC) { + rsp_cfg.out_len_bytes = AEC_FRAME_BYTES * 2; + } else { + rsp_cfg.out_len_bytes = AGC_FRAME_BYTES * 2; + } + } else { + if (cfg->algo_mask & ALGORITHM_STREAM_USE_AEC) { + rsp_cfg.out_len_bytes = AEC_FRAME_BYTES; + } else { + rsp_cfg.out_len_bytes = AGC_FRAME_BYTES; + } + } + + memcpy(&record->rsp_info, &rsp_cfg, sizeof(resample_info_t)); + record->rsp_handle = esp_resample_create(&rsp_cfg, &record->rsp_in, &record->rsp_out); + AUDIO_NULL_CHECK(TAG, record->rsp_handle, { + ESP_LOGE(TAG, "Fail to create recorder resample handle"); + return ESP_FAIL; + }); + } else { + return ESP_OK; + } + if (reference->data_need_be_resampled) { + rsp_cfg.src_rate = cfg->ref_sample_rate; + rsp_cfg.dest_ch = cfg->ref_ch; + rsp_cfg.src_ch = cfg->ref_ch; + if (reference->data_need_be_divided_after_rsp) { + if (cfg->algo_mask & ALGORITHM_STREAM_USE_AEC) { + rsp_cfg.out_len_bytes = AEC_FRAME_BYTES * 2; + } else { + rsp_cfg.out_len_bytes = AGC_FRAME_BYTES * 2; + } + } else { + if (cfg->algo_mask & ALGORITHM_STREAM_USE_AEC) { + rsp_cfg.out_len_bytes = AEC_FRAME_BYTES; + } else { + rsp_cfg.out_len_bytes = AGC_FRAME_BYTES; + } + } + memcpy(&reference->rsp_info, &rsp_cfg, sizeof(resample_info_t)); + reference->rsp_handle = esp_resample_create(&rsp_cfg, &reference->rsp_in, &reference->rsp_out); + 
AUDIO_NULL_CHECK(TAG, reference->rsp_handle, {
+            esp_resample_destroy(record->rsp_handle);
+            record->rsp_handle = NULL;
+            ESP_LOGE(TAG, "Fail to create reference resample handle");
+            return ESP_FAIL;
+        });
+    } else {
+        return ESP_OK;
+    }
+    return ESP_OK;
+}
+
+esp_err_t algo_stream_set_multi_input_rb(audio_element_handle_t algo_handle, ringbuf_handle_t input_rb)
+{
+    AUDIO_NULL_CHECK(TAG, algo_handle, return ESP_FAIL);
+    AUDIO_NULL_CHECK(TAG, input_rb, return ESP_FAIL);
+
+    if (rb_get_size(input_rb) < ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE) {
+        ESP_LOGE(TAG, "The ringbuffer size should be no smaller than %d", ALGORITHM_STREAM_RESAMPE_DEFAULT_MAX_INPUT_SIZE);
+        return ESP_FAIL;
+    }
+
+    return audio_element_set_multi_input_ringbuf(algo_handle, input_rb, 0);
+}
+
+audio_element_handle_t algo_stream_init(algorithm_stream_cfg_t *config)
+{
+    AUDIO_NULL_CHECK(TAG, config, return NULL);
+    if ((config->rec_linear_factor <= 0) || (config->ref_linear_factor <= 0)) {
+        ESP_LOGE(TAG, "The linear amplification factor should be greater than 0");
+        return NULL;
+    }
+    if ((config->algo_mask < 0) || (config->algo_mask > ALGORITHM_STREAM_FULL_FUCTION_MASK)) { //
+        ESP_LOGE(TAG, "Please choose a reasonable mask value");
+        return NULL;
+    }
+    algo_stream_t *algo = (algo_stream_t *)audio_calloc(1, sizeof(algo_stream_t));
+    AUDIO_NULL_CHECK(TAG, algo, return NULL);
+    algorithm_data_info_t *record = &algo->record;
+    algorithm_data_info_t *reference = &algo->reference;
+
+    record->ap_factor = config->rec_linear_factor;
+    reference->ap_factor = config->ref_linear_factor;
+
+    audio_element_cfg_t cfg = DEFAULT_AUDIO_ELEMENT_CONFIG();
+    cfg.open = _algo_open;
+    cfg.close = _algo_close;
+    cfg.process = _algo_process;
+    cfg.task_stack = config->task_stack;
+    cfg.task_prio = config->task_prio;
+    cfg.task_core = config->task_core;
+    cfg.multi_in_rb_num = config->input_type;
+    cfg.tag = "algorithm";
+    if (config->input_type == ALGORITHM_STREAM_INPUT_TYPE1) {
+        if ((config->ref_sample_rate != 
config->rec_sample_rate) || (config->ref_ch != config->rec_ch)) {
+            ESP_LOGE(TAG, "The frequency and channel number should be the same, please check that!");
+            free(algo);
+            return NULL;
+        }
+        if (config->algo_mask != (ALGORITHM_STREAM_USE_AEC | ALGORITHM_STREAM_USE_AGC | ALGORITHM_STREAM_USE_NS)) {
+            ESP_LOGE(TAG, "When type1 is chosen, all these algorithms should be used");
+            free(algo);
+            return NULL;
+        }
+    }
+
+    cfg.buffer_len = AEC_FRAME_BYTES;
+    algo->input_type = config->input_type;
+    algo->algo_mask = config->algo_mask;
+    audio_element_handle_t el = audio_element_init(&cfg);
+    AUDIO_NULL_CHECK(TAG, el, {
+        free(algo);
+        return NULL;
+    });
+    bool _success = true;
+    _success &= (create_rsp_handle(algo, config) == ESP_OK);
+    _success &= ((algo->scale_buff = audio_calloc(1, AEC_FRAME_BYTES + AGC_FRAME_BYTES)) != NULL);
+    if (algo->algo_mask & ALGORITHM_STREAM_USE_AEC) {
+        _success &= (
+            (algo->aec_buff = audio_calloc(1, AEC_FRAME_BYTES)) &&
+            (record->aec_buff = audio_calloc(1, AEC_FRAME_BYTES)) &&
+            (reference->aec_buff = audio_calloc(1, AEC_FRAME_BYTES))
+        );
+    }
+    if (algo->algo_mask & ALGORITHM_STREAM_USE_AGC) {
+        _success &= ((algo->agc_buff = audio_calloc(1, AGC_FRAME_BYTES)) != NULL);
+    }
+    if (algo->algo_mask & ALGORITHM_STREAM_USE_NS) {
+        _success &= ((algo->ns_buff = audio_calloc(1, NS_FRAME_BYTES)) != NULL);
+
+    }
+    AUDIO_NULL_CHECK(TAG, _success, {
+        ESP_LOGE(TAG, "Error occurred");
+        _algo_close(el);
+        free(algo);
+        return NULL;
+    });
+
+    audio_element_setdata(el, algo);
+    return el;
+}
diff --git a/components/audio_stream/include/algorithm_stream.h b/components/audio_stream/include/algorithm_stream.h
new file mode 100644
index 000000000..dadecaa2e
--- /dev/null
+++ b/components/audio_stream/include/algorithm_stream.h
@@ -0,0 +1,150 @@
+/*
+ * ESPRESSIF MIT License
+ *
+ * Copyright (c) 2019 
+ *
+ * Permission is hereby granted for use on all ESPRESSIF SYSTEMS products, in which case,
+ * it is free of charge, to any person obtaining a copy
of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the Software is furnished + * to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef _ALGORITHM_STREAM_H_ +#define _ALGORITHM_STREAM_H_ + +#include "audio_element.h" + +#define ALGORITHM_STREAM_PINNED_TO_CORE 0 +#define ALGORITHM_STREAM_TASK_PERIOD 5 +#define ALGORITHM_STREAM_TASK_STACK_SIZE (5 * 1024) + + +/* + +// AEC: Acoustic Echo Cancellation +// AGC: Automatic Gain Control +// WWE: Wake Word Engine +// NS: Noise Suppression + +-----------+ + | | + | TYPE 1 | + | | ++-----------------------------------------------+-----------+---------------------------------------------------+ +| | +| reference signal | +| +-----------+ +-----------+ +-----------\ +-----------+ +-----------+ +-----------+ | +| | | | | | \ | | | | | | | +| | I2S read |--->| Resample |--->| Data split |--->| AEC |--->| NS |--->| AGC | | +| | | | | | / | | | | | | | +| +-----------+ +-----------+ +-----------/ +------------ +-----------+ +-----------+ | +| record signal | +| | 
++---------------------------------------------------------------------------------------------------------------+ + + +-----------+ + | | + | TYPE 2 | + | | ++-----------------------------------------------+-----------+---------------------------------------------------+ +| | +| | +| +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ +-----------+ | +| | | | | | | | | | | | | | +| | I2S read |--->| Resample |--->| rec signal|--->| AEC |--->| NS |--->| AGC | | +| | | | | | | | | | | | | | +| +-----------+ +-----------+ +-----------+ +-----^-----+ +-----------+ +-----------+ | +| | | +| +-----------+ +-----------+ +-----------+ | | +| | | | | | | | | +| | input_rb |--->| Resample |--->| ref signal|----------+ | +| | | | | | | | +| +-----------+ +-----------+ +-----------+ | +| | ++---------------------------------------------------------------------------------------------------------------+ + +*/ + +/* + * @brief Two types of algorithm stream input method + */ +typedef enum { + ALGORITHM_STREAM_INPUT_TYPE1 = 1, /*!< Type 1 is default used by mini-board, the reference signal and the recording signal are respectively read in from the left channel and the right channel of the same I2S */ + ALGORITHM_STREAM_INPUT_TYPE2 = 2, /*!< Type 2 read in record signal from I2S and when data be written, the data should be copy as a reference signal and input to the algorithm element by using multiple input buffer. */ +} algorithm_stream_input_type_t; /*!< When use type2, you can combine arbitrarily the algorithm modules you want to use, use algo_mask parameters below to configure that. 
*/
+
+/*
+ * @brief Choose the algorithm to be used
+ */
+typedef enum {
+    ALGORITHM_STREAM_USE_AEC = (0x1 << 0), /*!< Use AEC */
+    ALGORITHM_STREAM_USE_AGC = (0x1 << 1), /*!< Use AGC */
+    ALGORITHM_STREAM_USE_NS = (0x1 << 2) /*!< Use NS */
+} algorithm_stream_mask_t;
+
+/*
+ * @brief Algorithm stream configurations
+ */
+typedef struct {
+    algorithm_stream_input_type_t input_type; /*!< Input type of stream */
+    int task_stack; /*!< Task stack size */
+    int task_prio; /*!< Task priority */
+    int task_core; /*!< The core on which the task is created */
+    int rec_ch; /*!< Channel number of record signal */
+    int ref_ch; /*!< Channel number of reference signal */
+    int ref_sample_rate; /*!< Sample rate of reference signal */
+    int rec_sample_rate; /*!< Sample rate of record signal */
+    int rec_linear_factor; /*!< The linear amplification factor of record signal */
+    int ref_linear_factor; /*!< The linear amplification factor of reference signal */
+    int8_t algo_mask; /*!< Choose algorithm to use */
+} algorithm_stream_cfg_t;
+
+#define ALGORITHM_STREAM_CFG_DEFAULT() { \
+    .input_type = ALGORITHM_STREAM_INPUT_TYPE1, \
+    .task_core = ALGORITHM_STREAM_PINNED_TO_CORE, \
+    .task_prio = ALGORITHM_STREAM_TASK_PERIOD, \
+    .task_stack = ALGORITHM_STREAM_TASK_STACK_SIZE, \
+    .ref_ch = 1, \
+    .rec_ch = 1, \
+    .ref_sample_rate = 16000, \
+    .rec_sample_rate = 16000, \
+    .rec_linear_factor = 1, \
+    .ref_linear_factor = 3, \
+    .algo_mask = (ALGORITHM_STREAM_USE_AEC | ALGORITHM_STREAM_USE_AGC | ALGORITHM_STREAM_USE_NS), \
+}
+
+/**
+ * @brief Initialize algorithm stream
+ *
+ * @param config The algorithm Stream configuration
+ *
+ * @return The audio element handle
+ */
+audio_element_handle_t algo_stream_init(algorithm_stream_cfg_t *config);
+
+/**
+ * @brief Set reference signal input ringbuffer
+ *
+ * @note If input type2 is chosen, call this function to set ringbuffer to input reference data.
+ * + * @param algo_handle Handle of algorithm stream + * @param input_rb Ringbuffer handle to be set + * + * @return ESP_OK success + * ESP_FAIL fail + */ +esp_err_t algo_stream_set_multi_input_rb(audio_element_handle_t algo_handle, ringbuf_handle_t input_rb); + +#endif diff --git a/components/esp-adf-libs b/components/esp-adf-libs index 695553298..5778d0a0b 160000 --- a/components/esp-adf-libs +++ b/components/esp-adf-libs @@ -1 +1 @@ -Subproject commit 695553298e543a93f24639af6b65f0ba5f606ab3 +Subproject commit 5778d0a0b99aaa2b7fc961014184681be2de7758 diff --git a/components/esp-sr b/components/esp-sr new file mode 160000 index 000000000..eef4aed07 --- /dev/null +++ b/components/esp-sr @@ -0,0 +1 @@ +Subproject commit eef4aed076bfe630ad6dc6c3e60077cee314ceed diff --git a/docs/Doxyfile b/docs/Doxyfile index ef3e2e510..d57bdcf12 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -33,6 +33,7 @@ INPUT = \ ../../components/audio_stream/include/i2s_stream.h \ ../../components/audio_stream/include/raw_stream.h \ ../../components/audio_stream/include/spiffs_stream.h \ + ../../components/audio_stream/include/algorithm_stream.h \ ## ESP Codec ../../components/esp-adf-libs/esp_codec/include/codec/esp_decoder.h \ ../../components/esp-adf-libs/esp_codec/include/codec/audio_type_def.h \ @@ -68,10 +69,9 @@ INPUT = \ ../../components/esp-adf-libs/esp_codec/include/codec/filter_resample.h \ ../../components/esp-adf-libs/esp_codec/include/codec/audio_sonic.h \ ## Speech Recognitions - ../../components/esp-adf-libs/esp_sr/include/esp_sr_iface.h \ - ../../components/esp-adf-libs/esp_sr/include/esp_sr_models.h \ - ../../components/esp-adf-libs/esp_sr/include/esp_vad.h \ ../../components/esp-adf-libs/recorder_engine/include/recorder_engine.h \ + ../../components/esp-sr/wake_word_engine/include/esp_wn_iface.h \ + ../../components/esp-sr/acoustic_algorithm/include/esp_vad.h \ ## ESP Audio ../../components/esp-adf-libs/esp_audio/include/audio_def.h \ 
../../components/esp-adf-libs/esp_audio/include/esp_audio.h \ diff --git a/docs/en/api-reference/speech-recognition/esp_sr_iface.rst b/docs/en/api-reference/speech-recognition/esp_wn_iface.rst similarity index 75% rename from docs/en/api-reference/speech-recognition/esp_sr_iface.rst rename to docs/en/api-reference/speech-recognition/esp_wn_iface.rst index e8477f3e7..2000e543c 100644 --- a/docs/en/api-reference/speech-recognition/esp_sr_iface.rst +++ b/docs/en/api-reference/speech-recognition/esp_wn_iface.rst @@ -35,30 +35,35 @@ A code snippet below demonstrates how to initialize the model, determine the num .. code-block:: c - #include "esp_sr_iface.h" - #include "esp_sr_models.h" + #include "esp_wn_iface.h" + #include "esp_wn_models.h" + #include "rec_eng_helper.h" - static const sr_model_iface_t *model = &sr_model_wakenet3_quantized; + esp_wn_iface_t *wakenet; + model_coeff_getter_t *model_coeff_getter; + model_iface_data_t *model_data; // Initialize wakeNet model data - static model_iface_data_t *model_data = model->create(DET_MODE_90); + get_wakenet_iface(&wakenet); + get_wakenet_coeff(&model_coeff_getter); + model_data = wakenet->create(model_coeff_getter, DET_MODE_90); // Set parameters of buffer - int audio_chunksize = model->get_samp_chunksize(model_data); - int frequency = model->get_samp_rate(model_data); - int16_t *buffer = malloc(audio_chunksize sizeof(int16_t)); + int audio_chunksize = wakenet->get_samp_chunksize(model_data); + int frequency = wakenet->get_samp_rate(model_data); + int16_t *buffer = malloc(audio_chunksize * sizeof(int16_t)); // Get voice data feed to buffer ... 
// Detect - int r = model->detect(model_data, buffer); + int r = wakenet->detect(model_data, buffer); if (r > 0) { printf("Detection triggered output %d.\n", r); } // Destroy model - model->destroy(model_data) + wakenet->destroy(model_data); Application Example @@ -70,5 +75,5 @@ Implementation of the speech recognition API is demonstrated in :example:`speech API Reference ------------- -.. include:: /_build/inc/esp_sr_iface.inc +.. include:: /_build/inc/esp_wn_iface.inc diff --git a/docs/en/api-reference/speech-recognition/index.rst b/docs/en/api-reference/speech-recognition/index.rst index 588adc5fa..18eb5de0d 100644 --- a/docs/en/api-reference/speech-recognition/index.rst +++ b/docs/en/api-reference/speech-recognition/index.rst @@ -2,7 +2,7 @@ Speech Recognition ****************** -The ESP-ADF comes complete with :doc:`wakeup word libraries <wakeup-word-libs>` and :doc:`speech recognition interface <esp_sr_iface>` to recognize voice wakeup commands. Most of currently implemented wakeup commands are in Chinese with one command "Alexa" in English. +The ESP-ADF comes complete with :doc:`wakeup word libraries <wakeup-word-libs>` and :doc:`speech recognition interface <esp_wn_iface>` to recognize voice wakeup commands. Most of currently implemented wakeup commands are in Chinese with one command "Alexa" in English. Provided in this section functions also include automatic speech detection, also known as :doc:`voice activity detection (VAD) <esp_vad>`, and :doc:`speech recording engine <recorder_engine>`.
@@ -13,6 +13,6 @@ The Speech Recognition API is designed to easy integrate with existing :doc:`../ :maxdepth: 1 wakeup-word-libs - esp_sr_iface + esp_wn_iface esp_vad recorder_engine diff --git a/docs/en/api-reference/speech-recognition/wakeup-word-libs.rst b/docs/en/api-reference/speech-recognition/wakeup-word-libs.rst deleted file mode 100644 index bbc94c5dd..000000000 --- a/docs/en/api-reference/speech-recognition/wakeup-word-libs.rst +++ /dev/null @@ -1,72 +0,0 @@ -Wakeup Word Libraries -===================== - -Espressif speech recognition libraries contain several wakeup words split into models. Two models are provided: - -* ``SR_MODEL_WN3_QUANT`` used for a single wakeup word, -* ``SR_MODEL_WN4_QUANT`` used for multi wakeup words. - -Model selection is done in menuconfig by setting :ref:`CONFIG_SR_MODEL_SEL`. - - -Single Wakeup Word Model ------------------------- - -This model is defined as ``SR_MODEL_WN3_QUANT`` in configuration and contains two libraries, one with wake word in Chinese and the other one in English. - -=============================== =========== ======================== -Library Language Wakeup Word -=============================== =========== ======================== -``libnn_model_hilexin_wn3.a`` Chinese 嗨,乐鑫 (Hāi, lè xīn) -------------------------------- ----------- ------------------------ -``libnn_model_alexa_wn3.a`` English Alexa -=============================== =========== ======================== - -To select desired wakeup word set :ref:`CONFIG_NAME_OF_WAKEUP_WORD`. - - -Multiple Wakeup Word Model --------------------------- - -This model is defined as ``SR_MODEL_WN4_QUANT`` in configuration and contains two libraries with wakeup words in Chinese. - -==== ======================== ======================== ======================== -Library ``libnn_model_light_control_ch_wn4.a`` (Chinese) ----------------------------------------------------------------------------------- -No. 
Wakeup Words Pronunciation English Meaning -==== ======================== ======================== ======================== - 1 打开电灯 Dǎkāi diàndēng Turn on the light ----- ------------------------ ------------------------ ------------------------ - 2 关闭电灯 Guānbì diàndēng Turn off the light -==== ======================== ======================== ======================== - -==== ======================== ======================== ======================== -Library ``libnn_model_speech_cmd_ch_wn4`` (Chinese) ----------------------------------------------------------------------------------- -No. Wakeup Words Pronunciation English Meaning -==== ======================== ======================== ======================== - 1 嗨,乐鑫 Hāi, lè xīn Hi, Espressif ----- ------------------------ ------------------------ ------------------------ - 2 打开电灯 Dǎkāi diàndēng Turn on the light ----- ------------------------ ------------------------ ------------------------ - 3 关闭电灯 Guānbì diàndēng Turn off the light ----- ------------------------ ------------------------ ------------------------ - 4 音量加大 Yīnliàng jiā dà Increase volume ----- ------------------------ ------------------------ ------------------------ - 5 音量减小 Yīnliàng jiǎn xiǎo Volume down ----- ------------------------ ------------------------ ------------------------ - 6 播放 Bòfàng Play ----- ------------------------ ------------------------ ------------------------ - 7 暂停 Zàntíng Pause ----- ------------------------ ------------------------ ------------------------ - 8 静音 Jìngyīn Mute ----- ------------------------ ------------------------ ------------------------ - 9 播放本地歌曲 Bòfàng běndì gēqǔ Play local music -==== ======================== ======================== ======================== - -To select desired set of multi wakeup words set :ref:`CONFIG_NAME_OF_WAKEUP_WORD`. 
- -API Reference -------------- - -Declarations of all available speech recognition models is contained in a header file :component_file:`esp-adf-libs/esp_sr/include/esp_sr_models.h`. diff --git a/docs/zh_CN/api-reference/speech-recognition/esp_sr_iface.rst b/docs/zh_CN/api-reference/speech-recognition/esp_sr_iface.rst deleted file mode 100644 index e20d84d89..000000000 --- a/docs/zh_CN/api-reference/speech-recognition/esp_sr_iface.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../../../en/api-reference/speech-recognition/esp_sr_iface.rst \ No newline at end of file diff --git a/docs/zh_CN/api-reference/speech-recognition/esp_wn_iface.rst b/docs/zh_CN/api-reference/speech-recognition/esp_wn_iface.rst new file mode 100644 index 000000000..bbd74e31a --- /dev/null +++ b/docs/zh_CN/api-reference/speech-recognition/esp_wn_iface.rst @@ -0,0 +1 @@ +.. include:: ../../../en/api-reference/speech-recognition/esp_wn_iface.rst \ No newline at end of file diff --git a/docs/zh_CN/api-reference/speech-recognition/wakeup-word-libs.rst b/docs/zh_CN/api-reference/speech-recognition/wakeup-word-libs.rst deleted file mode 100644 index 818895f1d..000000000 --- a/docs/zh_CN/api-reference/speech-recognition/wakeup-word-libs.rst +++ /dev/null @@ -1 +0,0 @@ -.. 
include:: ../../../en/api-reference/speech-recognition/wakeup-word-libs.rst \ No newline at end of file diff --git a/examples/advanced_examples/record_while_play/main/example_record_and_play.c b/examples/advanced_examples/record_while_play/main/example_record_and_play.c index 3625b313a..862ae51e7 100644 --- a/examples/advanced_examples/record_while_play/main/example_record_and_play.c +++ b/examples/advanced_examples/record_while_play/main/example_record_and_play.c @@ -26,8 +26,9 @@ #include "filter_resample.h" #include "http_stream.h" #include "raw_stream.h" -#include "esp_sr_iface.h" -#include "esp_sr_models.h" +#include "esp_wn_iface.h" +#include "esp_wn_models.h" +#include "rec_eng_helper.h" #include "esp_peripherals.h" #include "periph_sdcard.h" @@ -53,6 +54,10 @@ typedef enum { static input_stream_t input_type_flag; static output_stream_t output_type_flag; +esp_wn_iface_t *wakenet; +model_coeff_getter_t *model_coeff_getter; +model_iface_data_t *model_data; + static audio_pipeline_handle_t example_create_play_pipeline(const char *url, output_stream_t output_type) { audio_pipeline_handle_t pipeline; @@ -233,6 +238,8 @@ static void example_stop_all_pipelines(void) audio_element_deinit(i2s_stream_reader); audio_element_deinit(raw_reader); audio_element_deinit(resample_for_rec); + wakenet->destroy(model_data); + model_data = NULL; break; } default: @@ -329,8 +336,9 @@ void app_main(void) #endif ESP_LOGI(TAG, "[ * ] Create asr model"); - const esp_sr_iface_t *model = &esp_sr_wakenet5_quantized; - model_iface_data_t *iface = model->create(DET_MODE_90); + get_wakenet_iface(&wakenet); + get_wakenet_coeff(&model_coeff_getter); + model_data = wakenet->create(model_coeff_getter, DET_MODE_90); ESP_LOGI(TAG, "[ 4 ] Set up event listener"); audio_event_iface_cfg_t evt_cfg = AUDIO_EVENT_IFACE_DEFAULT_CFG(); @@ -352,7 +360,7 @@ void app_main(void) if (input_type_flag == INPUT_STREAM_ASR) { audio_event_iface_listen(evt, &msg, 0); raw_stream_read(raw_reader, buff, 960); - int 
keyword = model->detect(iface, (int16_t *)buff); + int keyword = wakenet->detect(model_data, (int16_t *)buff); if (keyword) { ESP_LOGW(TAG, "###spot keyword###"); } diff --git a/examples/dueros/main/dueros_app.c b/examples/dueros/main/dueros_app.c index e15c5c62e..3922bd9fc 100644 --- a/examples/dueros/main/dueros_app.c +++ b/examples/dueros/main/dueros_app.c @@ -57,6 +57,7 @@ #include "airkiss_config.h" #include "smart_config.h" #include "periph_adc_button.h" +#include "algorithm_stream.h" static const char *TAG = "DUEROS"; extern esp_audio_handle_t player; @@ -96,12 +97,18 @@ void rec_engine_cb(rec_event_type_t type, void *user_data) } } +int duer_i2s_read_cb(audio_element_handle_t el, char *buf, int len, TickType_t wait_time, void *ctx) +{ + audio_element_handle_t i2s_read_handle = (audio_element_handle_t)ctx; + return audio_element_input(i2s_read_handle, buf, len); +} + static audio_element_handle_t raw_read; #ifdef CONFIG_ESP_LYRAT_MINI_V1_1_BOARD static esp_err_t recorder_pipeline_open_for_mini(void **handle) { - audio_element_handle_t i2s_stream_reader; + audio_element_handle_t i2s_reader; audio_pipeline_handle_t recorder; audio_pipeline_cfg_t pipeline_cfg = DEFAULT_AUDIO_PIPELINE_CONFIG(); recorder = audio_pipeline_init(&pipeline_cfg); @@ -112,18 +119,22 @@ static esp_err_t recorder_pipeline_open_for_mini(void **handle) i2s_cfg.i2s_port = 1; i2s_cfg.i2s_config.use_apll = 0; i2s_cfg.i2s_config.sample_rate = 16000; - i2s_cfg.i2s_config.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT; i2s_cfg.type = AUDIO_STREAM_READER; - i2s_stream_reader = i2s_stream_init(&i2s_cfg); + i2s_reader = i2s_stream_init(&i2s_cfg); raw_stream_cfg_t raw_cfg = RAW_STREAM_CFG_DEFAULT(); raw_cfg.type = AUDIO_STREAM_READER; raw_read = raw_stream_init(&raw_cfg); - audio_pipeline_register(recorder, i2s_stream_reader, "i2s"); + algorithm_stream_cfg_t algo_cfg = ALGORITHM_STREAM_CFG_DEFAULT(); + audio_element_handle_t algo_handle = algo_stream_init(&algo_cfg); + + 
audio_pipeline_register(recorder, algo_handle, "algo"); + audio_element_set_read_cb(algo_handle, duer_i2s_read_cb, (void *)i2s_reader); audio_pipeline_register(recorder, raw_read, "raw"); - audio_pipeline_link(recorder, (const char *[]) {"i2s", "raw"}, 2); + audio_pipeline_link(recorder, (const char *[]) {"algo", "raw"}, 2); + audio_pipeline_run(recorder); ESP_LOGI(TAG, "Recorder has been created"); *handle = recorder; @@ -365,13 +376,12 @@ void duer_app_init(void) wifi_config_t sta_cfg = {0}; strncpy((char *)&sta_cfg.sta.ssid, CONFIG_WIFI_SSID, strlen(CONFIG_WIFI_SSID)); strncpy((char *)&sta_cfg.sta.password, CONFIG_WIFI_PASSWORD, strlen(CONFIG_WIFI_PASSWORD)); - wifi_service_config_t cfg = WIFI_SERVICE_DEFAULT_CONFIG(); cfg.evt_cb = wifi_service_cb; cfg.cb_ctx = NULL; cfg.setting_timeout_s = 60; wifi_serv = wifi_service_create(&cfg); - + vTaskDelay(1000); int reg_idx = 0; esp_wifi_setting_handle_t h = NULL; #ifdef CONFIG_AIRKISS_ENCRYPT @@ -389,6 +399,7 @@ void duer_app_init(void) wifi_service_set_sta_info(wifi_serv, &sta_cfg); wifi_service_connect(wifi_serv); + rec_config_t eng = DEFAULT_REC_ENGINE_CONFIG(); eng.vad_off_delay_ms = 800; eng.wakeup_time_ms = 10 * 1000; @@ -410,5 +421,4 @@ void duer_app_init(void) duer_serv_handle = dueros_service_create(); duer_audio_wrapper_init(); audio_service_set_callback(duer_serv_handle, duer_callback, retry_login_timer); - } diff --git a/examples/speech_recognition/asr/main/example_asr_main.c b/examples/speech_recognition/asr/main/example_asr_main.c index 497e5d008..4dc14d9fb 100644 --- a/examples/speech_recognition/asr/main/example_asr_main.c +++ b/examples/speech_recognition/asr/main/example_asr_main.c @@ -24,11 +24,11 @@ #include "i2s_stream.h" #include "raw_stream.h" #include "filter_resample.h" -#include "esp_sr_iface.h" -#include "esp_sr_models.h" +#include "esp_wn_iface.h" +#include "esp_wn_models.h" +#include "rec_eng_helper.h" static const char *TAG = "example_asr_keywords"; - static const char *EVENT_TAG = 
"asr_event"; typedef enum { @@ -55,32 +55,32 @@ void app_main() }; gpio_config(&gpio_conf); #endif - esp_log_level_set("*", ESP_LOG_WARN); esp_log_level_set(TAG, ESP_LOG_INFO); esp_log_level_set(EVENT_TAG, ESP_LOG_INFO); ESP_LOGI(TAG, "Initialize SR handle"); -#if CONFIG_SR_MODEL_WN4_QUANT - const esp_sr_iface_t *model = &esp_sr_wakenet4_quantized; -#else - const esp_sr_iface_t *model = &esp_sr_wakenet3_quantized; -#endif - model_iface_data_t *iface = model->create(DET_MODE_90); - int num = model->get_word_num(iface); + esp_wn_iface_t *wakenet; + model_coeff_getter_t *model_coeff_getter; + model_iface_data_t *model_data; + + get_wakenet_iface(&wakenet); + get_wakenet_coeff(&model_coeff_getter); + model_data = wakenet->create(model_coeff_getter, DET_MODE_90); + int num = wakenet->get_word_num(model_data); for (int i = 1; i <= num; i++) { - char *name = model->get_word_name(iface, i); + char *name = wakenet->get_word_name(model_data, i); ESP_LOGI(TAG, "keywords: %s (index = %d)", name, i); } - float threshold = model->get_det_threshold_by_mode(iface, DET_MODE_90, 1); - int sample_rate = model->get_samp_rate(iface); - int audio_chunksize = model->get_samp_chunksize(iface); + float threshold = wakenet->get_det_threshold(model_data, 1); + int sample_rate = wakenet->get_samp_rate(model_data); + int audio_chunksize = wakenet->get_samp_chunksize(model_data); ESP_LOGI(EVENT_TAG, "keywords_num = %d, threshold = %f, sample_rate = %d, chunksize = %d, sizeof_uint16 = %d", num, threshold, sample_rate, audio_chunksize, sizeof(int16_t)); int16_t *buff = (int16_t *)malloc(audio_chunksize * sizeof(short)); if (NULL == buff) { ESP_LOGE(EVENT_TAG, "Memory allocation failed!"); - model->destroy(iface); - model = NULL; + wakenet->destroy(model_data); + model_data = NULL; return; } @@ -100,11 +100,15 @@ void app_main() i2s_stream_cfg_t i2s_cfg = I2S_STREAM_CFG_DEFAULT(); i2s_cfg.i2s_config.sample_rate = 48000; i2s_cfg.type = AUDIO_STREAM_READER; + + // Mini board record by I2S1 and play 
music by I2S0, no need to add resample element. #if defined CONFIG_ESP_LYRAT_MINI_V1_1_BOARD + i2s_cfg.i2s_config.sample_rate = 16000; i2s_cfg.i2s_port = 1; -#endif + i2s_cfg.i2s_config.channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT; + i2s_stream_reader = i2s_stream_init(&i2s_cfg); +#else i2s_stream_reader = i2s_stream_init(&i2s_cfg); - ESP_LOGI(EVENT_TAG, "[ 2.2 ] Create filter to resample audio data"); rsp_filter_cfg_t rsp_cfg = DEFAULT_RESAMPLE_FILTER_CONFIG(); rsp_cfg.src_rate = 48000; @@ -113,6 +117,7 @@ void app_main() rsp_cfg.dest_ch = 1; rsp_cfg.type = AUDIO_CODEC_TYPE_ENCODER; filter = rsp_filter_init(&rsp_cfg); +#endif ESP_LOGI(EVENT_TAG, "[ 2.3 ] Create raw to receive data"); raw_stream_cfg_t raw_cfg = { @@ -123,17 +128,22 @@ void app_main() ESP_LOGI(EVENT_TAG, "[ 3 ] Register all elements to audio pipeline"); audio_pipeline_register(pipeline, i2s_stream_reader, "i2s"); - audio_pipeline_register(pipeline, filter, "filter"); audio_pipeline_register(pipeline, raw_read, "raw"); +#if defined CONFIG_ESP_LYRAT_MINI_V1_1_BOARD + ESP_LOGI(EVENT_TAG, "[ 4 ] Link elements together [codec_chip]-->i2s_stream-->raw-->[SR]"); + audio_pipeline_link(pipeline, (const char *[]) {"i2s", "raw"}, 2); +#else + audio_pipeline_register(pipeline, filter, "filter"); ESP_LOGI(EVENT_TAG, "[ 4 ] Link elements together [codec_chip]-->i2s_stream-->filter-->raw-->[SR]"); audio_pipeline_link(pipeline, (const char *[]) {"i2s", "filter", "raw"}, 3); +#endif ESP_LOGI(EVENT_TAG, "[ 5 ] Start audio_pipeline"); audio_pipeline_run(pipeline); while (1) { raw_stream_read(raw_read, (char *)buff, audio_chunksize * sizeof(short)); - int keyword = model->detect(iface, (int16_t *)buff); + int keyword = wakenet->detect(model_data, (int16_t *)buff); switch (keyword) { case WAKE_UP: ESP_LOGI(TAG, "Wake up"); @@ -172,7 +182,6 @@ void app_main() ESP_LOGD(TAG, "Not supported keyword"); break; } - } ESP_LOGI(EVENT_TAG, "[ 6 ] Stop audio_pipeline"); @@ -193,8 +202,8 @@ void app_main() 
audio_element_deinit(filter); ESP_LOGI(EVENT_TAG, "[ 7 ] Destroy model"); - model->destroy(iface); - model = NULL; + wakenet->destroy(model_data); + model_data = NULL; free(buff); buff = NULL; } From 845303486602b4f2eb143fe507ad304de407efb6 Mon Sep 17 00:00:00 2001 From: Krzysztof Date: Wed, 13 Nov 2019 20:32:54 +0800 Subject: [PATCH 2/2] Remove references to 'esp-sr' repository --- components/audio_stream/include/algorithm_stream.h | 6 +++--- docs/Doxyfile | 4 +--- docs/en/api-reference/speech-recognition/esp_vad.rst | 2 +- docs/en/api-reference/speech-recognition/esp_wn_iface.rst | 3 +-- docs/en/api-reference/speech-recognition/index.rst | 3 +-- 5 files changed, 7 insertions(+), 11 deletions(-) diff --git a/components/audio_stream/include/algorithm_stream.h b/components/audio_stream/include/algorithm_stream.h index dadecaa2e..60aee4d04 100644 --- a/components/audio_stream/include/algorithm_stream.h +++ b/components/audio_stream/include/algorithm_stream.h @@ -77,7 +77,7 @@ */ -/* +/** * @brief Two types of algorithm stream input method */ typedef enum { @@ -85,7 +85,7 @@ typedef enum { ALGORITHM_STREAM_INPUT_TYPE2 = 2, /*!< Type 2 read in record signal from I2S and when data be written, the data should be copy as a reference signal and input to the algorithm element by using multiple input buffer. */ } algorithm_stream_input_type_t; /*!< When use type2, you can combine arbitrarily the algorithm modules you want to use, use algo_mask parameters below to configure that. 
*/ -/* +/** * @brief Choose the algorithm to be used */ typedef enum { @@ -94,7 +94,7 @@ ALGORITHM_STREAM_USE_NS = (0x1 << 2) /*!< Use NS */ } algorithm_stream_mask_t; -/* +/** * @brief Algorithm stream configurations */ typedef struct { diff --git a/docs/Doxyfile b/docs/Doxyfile index d57bdcf12..425c6c464 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -68,10 +68,8 @@ INPUT = \ ../../components/esp-adf-libs/esp_codec/include/codec/equalizer.h \ ../../components/esp-adf-libs/esp_codec/include/codec/filter_resample.h \ ../../components/esp-adf-libs/esp_codec/include/codec/audio_sonic.h \ - ## Speech Recognitions + ## Speech Recognition ../../components/esp-adf-libs/recorder_engine/include/recorder_engine.h \ - ../../components/esp-sr/wake_word_engine/include/esp_wn_iface.h \ - ../../components/esp-sr/acoustic_algorithm/include/esp_vad.h \ ## ESP Audio ../../components/esp-adf-libs/esp_audio/include/audio_def.h \ ../../components/esp-adf-libs/esp_audio/include/esp_audio.h \ diff --git a/docs/en/api-reference/speech-recognition/esp_vad.rst b/docs/en/api-reference/speech-recognition/esp_vad.rst index bc8e18a52..f4f559197 100644 --- a/docs/en/api-reference/speech-recognition/esp_vad.rst +++ b/docs/en/api-reference/speech-recognition/esp_vad.rst @@ -15,4 +15,4 @@ Implementation of the voice activity detection API is demonstrated in :example:` API Reference ------------- -.. include:: /_build/inc/esp_vad.inc +For the latest API reference please refer to `Espressif Speech recognition repository <https://github.com/espressif/esp-sr>`_. diff --git a/docs/en/api-reference/speech-recognition/esp_wn_iface.rst b/docs/en/api-reference/speech-recognition/esp_wn_iface.rst index 2000e543c..c27156577 100644 --- a/docs/en/api-reference/speech-recognition/esp_wn_iface.rst +++ b/docs/en/api-reference/speech-recognition/esp_wn_iface.rst @@ -75,5 +75,4 @@ Implementation of the speech recognition API is demonstrated in :example:`speech API Reference ------------- -..
include:: /_build/inc/esp_wn_iface.inc - +For the latest API reference please refer to `Espressif Speech recognition repository <https://github.com/espressif/esp-sr>`_. diff --git a/docs/en/api-reference/speech-recognition/index.rst b/docs/en/api-reference/speech-recognition/index.rst index 18eb5de0d..546163e07 100644 --- a/docs/en/api-reference/speech-recognition/index.rst +++ b/docs/en/api-reference/speech-recognition/index.rst @@ -2,7 +2,7 @@ Speech Recognition ****************** -The ESP-ADF comes complete with :doc:`wakeup word libraries <wakeup-word-libs>` and :doc:`speech recognition interface <esp_wn_iface>` to recognize voice wakeup commands. Most of currently implemented wakeup commands are in Chinese with one command "Alexa" in English. +The ESP-ADF comes complete with :doc:`speech recognition interface <esp_wn_iface>` to recognize voice wakeup commands. Most of currently implemented wakeup commands are in Chinese with one command "Hi Jeson" in English. Provided in this section functions also include automatic speech detection, also known as :doc:`voice activity detection (VAD) <esp_vad>`, and :doc:`speech recording engine <recorder_engine>`. @@ -12,7 +12,6 @@ The Speech Recognition API is designed to easy integrate with existing :doc:`../ :caption: In This Section :maxdepth: 1 - wakeup-word-libs esp_wn_iface esp_vad recorder_engine