diff --git a/c-api-examples/offline-tts-c-api.c b/c-api-examples/offline-tts-c-api.c
index 2b11eb096..c4e9be62b 100644
--- a/c-api-examples/offline-tts-c-api.c
+++ b/c-api-examples/offline-tts-c-api.c
@@ -186,7 +186,7 @@ int32_t main(int32_t argc, char *argv[]) {
SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config);
const SherpaOnnxGeneratedAudio *audio =
- SherpaOnnxOfflineTtsGenerate(tts, text, sid);
+ SherpaOnnxOfflineTtsGenerate(tts, text, sid, 1.0);
SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename);
diff --git a/go-api-examples/non-streaming-tts/main.go b/go-api-examples/non-streaming-tts/main.go
index a263b42b2..1728d8e1f 100644
--- a/go-api-examples/non-streaming-tts/main.go
+++ b/go-api-examples/non-streaming-tts/main.go
@@ -49,7 +49,7 @@ func main() {
log.Println("Start generating!")
- audio := tts.Generate(text, sid)
+ audio := tts.Generate(text, sid, 1.0)
log.Println("Done!")
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.cpp b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.cpp
new file mode 100644
index 000000000..2c87b1600
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.cpp
@@ -0,0 +1,92 @@
+
+// NonStreamingTextToSpeech.cpp : Defines the class behaviors for the application.
+//
+
+#include "pch.h"
+#include "framework.h"
+#include "NonStreamingTextToSpeech.h"
+#include "NonStreamingTextToSpeechDlg.h"
+
+#ifdef _DEBUG
+#define new DEBUG_NEW
+#endif
+
+
+// CNonStreamingTextToSpeechApp
+
+// Message map of the application object. The only entry routes the ID_HELP
+// command to CWinApp::OnHelp.
+BEGIN_MESSAGE_MAP(CNonStreamingTextToSpeechApp, CWinApp)
+ ON_COMMAND(ID_HELP, &CWinApp::OnHelp)
+END_MESSAGE_MAP()
+
+
+// CNonStreamingTextToSpeechApp construction
+
+// Default constructor. Deliberately empty: all significant initialization
+// is performed in InitInstance().
+CNonStreamingTextToSpeechApp::CNonStreamingTextToSpeechApp() {}
+
+
+// The one and only CNonStreamingTextToSpeechApp object. It is declared
+// `extern` in NonStreamingTextToSpeech.h.
+
+CNonStreamingTextToSpeechApp theApp;
+
+
+// CNonStreamingTextToSpeechApp initialization
+
+// Application start-up: runs the main dialog modally, then cleans up.
+//
+// Always returns FALSE so that, once the dialog has been closed, the
+// application exits instead of starting the message pump.
+BOOL CNonStreamingTextToSpeechApp::InitInstance()
+{
+  CWinApp::InitInstance();
+
+  // Shell manager, in case the dialog contains any shell tree view or
+  // shell list view controls. Released again before returning.
+  CShellManager *shellManager = new CShellManager;
+
+  // "Windows Native" visual manager, so that MFC controls are themed.
+  CMFCVisualManager::SetDefaultManager(RUNTIME_CLASS(CMFCVisualManagerWindows));
+
+  // Registry key under which the application's settings are stored.
+  SetRegistryKey(_T("Local AppWizard-Generated Applications"));
+
+  CNonStreamingTextToSpeechDlg mainDialog;
+  m_pMainWnd = &mainDialog;
+
+  // IDOK and IDCANCEL need no special handling; only a failed dialog
+  // creation (-1) is reported.
+  const INT_PTR modalResult = mainDialog.DoModal();
+  if (modalResult == -1)
+  {
+    TRACE(traceAppMsg, 0, "Warning: dialog creation failed, so application is terminating unexpectedly.\n");
+    TRACE(traceAppMsg, 0, "Warning: if you are using MFC controls on the dialog, you cannot #define _AFX_NO_MFC_CONTROLS_IN_DIALOGS.\n");
+  }
+
+  // Deleting a null pointer is a no-op, so no explicit null check is needed.
+  delete shellManager;
+
+#if !defined(_AFXDLL) && !defined(_AFX_NO_MFC_CONTROLS_IN_DIALOGS)
+  ControlBarCleanUp();
+#endif
+
+  return FALSE;
+}
+
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.h b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.h
new file mode 100644
index 000000000..19b2ff365
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.h
@@ -0,0 +1,32 @@
+
+// NonStreamingTextToSpeech.h : main header file for the PROJECT_NAME application
+//
+
+#pragma once
+
+#ifndef __AFXWIN_H__
+ #error "include 'pch.h' before including this file for PCH"
+#endif
+
+#include "resource.h" // main symbols
+
+
+// CNonStreamingTextToSpeechApp:
+// See NonStreamingTextToSpeech.cpp for the implementation of this class
+//
+
+// Application class of the non-streaming text-to-speech example. Derives
+// from CWinApp and supplies its own InitInstance().
+class CNonStreamingTextToSpeechApp : public CWinApp
+{
+public:
+ CNonStreamingTextToSpeechApp();
+
+// Overrides
+public:
+ // Implemented in NonStreamingTextToSpeech.cpp.
+ virtual BOOL InitInstance();
+
+// Implementation
+
+ // Declares the message map that is defined in NonStreamingTextToSpeech.cpp.
+ DECLARE_MESSAGE_MAP()
+};
+
+extern CNonStreamingTextToSpeechApp theApp;
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc
new file mode 100644
index 000000000..087e9b310
Binary files /dev/null and b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc differ
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj
new file mode 100644
index 000000000..d7eb6fd1b
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj
@@ -0,0 +1,219 @@
+
+
+
+
+ Debug
+ Win32
+
+
+ Release
+ Win32
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+ 17.0
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}
+ MFCProj
+ NonStreamingTextToSpeech
+ 10.0
+
+
+
+ Application
+ true
+ v143
+ Unicode
+ Dynamic
+
+
+ Application
+ false
+ v143
+ true
+ Unicode
+ Dynamic
+
+
+ Application
+ true
+ v143
+ Unicode
+ Dynamic
+
+
+ Application
+ false
+ v143
+ true
+ Unicode
+ Static
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ false
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+
+ Use
+ Level3
+ true
+ true
+ true
+ _WINDOWS;NDEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+ true
+ true
+
+
+ false
+ true
+ NDEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ NDEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+ Use
+ Level3
+ true
+ WIN32;_WINDOWS;_DEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+
+
+ false
+ true
+ _DEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ _DEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+ Use
+ Level3
+ true
+ _WINDOWS;_DEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+
+
+ false
+ true
+ _DEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ _DEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+ Use
+ Level3
+ true
+ true
+ true
+ WIN32;_WINDOWS;NDEBUG;%(PreprocessorDefinitions)
+ pch.h
+
+
+ Windows
+ true
+ true
+
+
+ false
+ true
+ NDEBUG;%(PreprocessorDefinitions)
+
+
+ 0x0409
+ NDEBUG;%(PreprocessorDefinitions)
+ $(IntDir);%(AdditionalIncludeDirectories)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Create
+ Create
+ Create
+ Create
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj.filters b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj.filters
new file mode 100644
index 000000000..689493c4d
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj.filters
@@ -0,0 +1,63 @@
+
+
+
+
+ {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
+ cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
+
+
+ {93995380-89BD-4b04-88EB-625FBE52EBFB}
+ h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
+
+
+ {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
+ rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
+
+
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+ Header Files
+
+
+
+
+ Source Files
+
+
+ Source Files
+
+
+ Source Files
+
+
+
+
+ Resource Files
+
+
+
+
+ Resource Files
+
+
+
+
+ Resource Files
+
+
+
\ No newline at end of file
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
new file mode 100644
index 000000000..a6e6d3552
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp
@@ -0,0 +1,357 @@
+
+// NonStreamingTextToSpeechDlg.cpp : implementation file
+//
+
+#include "pch.h"
+#include "framework.h"
+#include "NonStreamingTextToSpeech.h"
+#include "NonStreamingTextToSpeechDlg.h"
+#include "afxdialogex.h"
+
+#include
+#include
+#include
+#include
+
+#ifdef _DEBUG
+#define new DEBUG_NEW
+#endif
+
+
+// CAboutDlg dialog used for App About
+
+// "About" dialog of the application. It is shown from
+// CNonStreamingTextToSpeechDlg::OnSysCommand when the About system-menu
+// entry is selected.
+class CAboutDlg : public CDialogEx
+{
+public:
+ CAboutDlg();
+
+// Dialog Data
+#ifdef AFX_DESIGN_TIME
+ enum { IDD = IDD_ABOUTBOX };
+#endif
+
+ protected:
+ virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support
+
+// Implementation
+protected:
+ // The map itself is empty; see BEGIN_MESSAGE_MAP(CAboutDlg, ...) below.
+ DECLARE_MESSAGE_MAP()
+};
+
+// Binds the dialog object to the IDD_ABOUTBOX dialog resource.
+CAboutDlg::CAboutDlg() : CDialogEx(IDD_ABOUTBOX) {}
+
+// The About dialog has no data-bound controls of its own, so the exchange
+// is simply forwarded to the base class.
+void CAboutDlg::DoDataExchange(CDataExchange* pDX) {
+  CDialogEx::DoDataExchange(pDX);
+}
+
+// Message map of the About dialog; it contains no entries.
+BEGIN_MESSAGE_MAP(CAboutDlg, CDialogEx)
+END_MESSAGE_MAP()
+
+
+// CNonStreamingTextToSpeechDlg dialog
+
+// see
+// https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
+// Converts a UTF-8 encoded string to UTF-16.
+//
+// Code points above U+FFFF are written as a surrogate pair, i.e. two
+// wchar_t elements.
+//
+// @param utf8  Input bytes, expected to be well-formed UTF-8.
+// @return      The UTF-16 form of the input; empty for empty input.
+// @throws std::logic_error if the input is not valid UTF-8: an unexpected
+//         continuation or lead byte, a truncated sequence, an overlong
+//         encoding, an encoded surrogate, or a value above U+10FFFF.
+static std::wstring Utf8ToUtf16(const std::string &utf8) {
+  std::wstring utf16;
+  utf16.reserve(utf8.size());
+
+  size_t i = 0;
+  while (i < utf8.size()) {
+    const unsigned char lead = utf8[i++];
+
+    unsigned long uni;        // code point being assembled
+    unsigned long min_value;  // smallest code point allowed for this length
+    size_t todo;              // number of continuation bytes still expected
+    if (lead <= 0x7F) {
+      uni = lead;
+      min_value = 0;
+      todo = 0;
+    } else if (lead <= 0xBF) {
+      // A continuation byte cannot start a sequence.
+      throw std::logic_error("not a UTF-8 string");
+    } else if (lead <= 0xDF) {
+      uni = lead & 0x1F;
+      min_value = 0x80;
+      todo = 1;
+    } else if (lead <= 0xEF) {
+      uni = lead & 0x0F;
+      min_value = 0x800;
+      todo = 2;
+    } else if (lead <= 0xF7) {
+      uni = lead & 0x07;
+      min_value = 0x10000;
+      todo = 3;
+    } else {
+      throw std::logic_error("not a UTF-8 string");
+    }
+
+    for (size_t j = 0; j < todo; ++j) {
+      if (i == utf8.size()) throw std::logic_error("not a UTF-8 string");
+      const unsigned char cont = utf8[i++];
+      if (cont < 0x80 || cont > 0xBF) {
+        throw std::logic_error("not a UTF-8 string");
+      }
+      uni = (uni << 6) + (cont & 0x3F);
+    }
+
+    // Overlong form: the value would have fit in a shorter sequence.
+    if (uni < min_value) throw std::logic_error("not a UTF-8 string");
+    // Surrogates must not be encoded in UTF-8.
+    if (uni >= 0xD800 && uni <= 0xDFFF) {
+      throw std::logic_error("not a UTF-8 string");
+    }
+    if (uni > 0x10FFFF) throw std::logic_error("not a UTF-8 string");
+
+    if (uni <= 0xFFFF) {
+      utf16 += static_cast<wchar_t>(uni);
+    } else {
+      // Split into a high/low surrogate pair.
+      uni -= 0x10000;
+      utf16 += static_cast<wchar_t>((uni >> 10) + 0xD800);
+      utf16 += static_cast<wchar_t>((uni & 0x3FF) + 0xDC00);
+    }
+  }
+  return utf16;
+}
+
+// The system calls this function to obtain the cursor to display while the user drags
+// the minimized window.
+// Returns the dialog icon as the cursor handle. The cast names its target
+// type explicitly because m_hIcon is an HICON while the function returns
+// HCURSOR.
+HCURSOR CNonStreamingTextToSpeechDlg::OnQueryDragIcon()
+{
+  return static_cast<HCURSOR>(m_hIcon);
+}
+
+
+// Appends text to the end of an edit control.
+//
+// @param e  Edit control that receives the text.
+// @param s  Text to append, UTF-8 encoded; it is converted with
+//           Utf8ToUtf16() before being handed to the control.
+void AppendTextToEditCtrl(CEdit& e, const std::string &s) {
+  // Put an empty selection at the very end of the current text ...
+  const int endOfText = e.GetWindowTextLength();
+  e.SetSel(endOfText, endOfText);
+
+  // ... and replace that selection with the new text.
+  e.ReplaceSel(Utf8ToUtf16(s).c_str());
+}
+
+// Appends `s` to the edit control on a new line, by prefixing it with
+// "\r\n" and delegating to AppendTextToEditCtrl().
+void AppendLineToMultilineEditCtrl(CEdit& e, const std::string &s) {
+  std::string line("\r\n");
+  line += s;
+  AppendTextToEditCtrl(e, line);
+}
+
+
+// Standard constructor.
+//
+// tts_ is explicitly set to nullptr here. Init() only assigns it after all
+// model files have been found and returns early otherwise, while the
+// destructor tests tts_ before destroying the engine. Without this
+// initializer the destructor would read an indeterminate pointer.
+//
+// @param pParent  Parent window; may be nullptr.
+CNonStreamingTextToSpeechDlg::CNonStreamingTextToSpeechDlg(CWnd* pParent /*=nullptr*/)
+ : CDialogEx(IDD_NONSTREAMINGTEXTTOSPEECH_DIALOG, pParent),
+   tts_(nullptr)
+ {
+ m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME);
+}
+
+// Connects the control member variables to their dialog resources after
+// letting the base class do its part of the exchange.
+void CNonStreamingTextToSpeechDlg::DoDataExchange(CDataExchange* pDX) {
+  CDialogEx::DoDataExchange(pDX);
+
+  DDX_Control(pDX, IDC_HINT, my_hint_);        // hint / status text box
+  DDX_Control(pDX, IDC_SPEAKER, speaker_id_);  // speaker ID input
+  DDX_Control(pDX, IDC_SPEED, speed_);         // speed input
+  DDX_Control(pDX, IDOK, generate_btn_);       // button handled by OnBnClickedOk
+  DDX_Control(pDX, IDC_TEXT, my_text_);        // text to synthesize
+}
+
+// Message map of the main dialog: system commands, painting, the drag-icon
+// query, and clicks on the IDOK button (routed to OnBnClickedOk).
+BEGIN_MESSAGE_MAP(CNonStreamingTextToSpeechDlg, CDialogEx)
+ ON_WM_SYSCOMMAND()
+ ON_WM_PAINT()
+ ON_WM_QUERYDRAGICON()
+ ON_BN_CLICKED(IDOK, &CNonStreamingTextToSpeechDlg::OnBnClickedOk)
+ END_MESSAGE_MAP()
+
+
+// CNonStreamingTextToSpeechDlg message handlers
+
+// Dialog initialization: adds the "About..." entry to the system menu,
+// sets the dialog icons and then runs the application-specific Init().
+//
+// @return TRUE, since the focus is not set to a control here.
+BOOL CNonStreamingTextToSpeechDlg::OnInitDialog()
+{
+  CDialogEx::OnInitDialog();
+
+  // IDM_ABOUTBOX must be in the system command range.
+  ASSERT((IDM_ABOUTBOX & 0xFFF0) == IDM_ABOUTBOX);
+  ASSERT(IDM_ABOUTBOX < 0xF000);
+
+  // Add the "About..." menu item to the system menu, if there is one.
+  if (CMenu* sysMenu = GetSystemMenu(FALSE))
+  {
+    CString aboutLabel;
+    BOOL labelLoaded = aboutLabel.LoadString(IDS_ABOUTBOX);
+    ASSERT(labelLoaded);
+    if (!aboutLabel.IsEmpty())
+    {
+      sysMenu->AppendMenu(MF_SEPARATOR);
+      sysMenu->AppendMenu(MF_STRING, IDM_ABOUTBOX, aboutLabel);
+    }
+  }
+
+  // The framework sets the icon automatically only when the main window
+  // is not a dialog, so it is done by hand here.
+  SetIcon(m_hIcon, TRUE);   // big icon
+  SetIcon(m_hIcon, FALSE);  // small icon
+
+  // Fill in the hints and default values and create the TTS engine.
+  Init();
+
+  return TRUE;
+}
+
+// Handles system-menu commands: the About entry opens the About dialog,
+// everything else goes to the base class.
+void CNonStreamingTextToSpeechDlg::OnSysCommand(UINT nID, LPARAM lParam)
+{
+  // The low four bits of nID are masked off before comparing.
+  const bool isAbout = (nID & 0xFFF0) == IDM_ABOUTBOX;
+  if (!isAbout)
+  {
+    CDialogEx::OnSysCommand(nID, lParam);
+    return;
+  }
+
+  CAboutDlg aboutDialog;
+  aboutDialog.DoModal();
+}
+
+// If you add a minimize button to your dialog, you will need the code below
+// to draw the icon. For MFC applications using the document/view model,
+// this is automatically done for you by the framework.
+
+// Paint handler. While the dialog is minimized it draws the application
+// icon centered in the client area; otherwise painting is left to the
+// base class.
+void CNonStreamingTextToSpeechDlg::OnPaint()
+{
+  if (IsIconic())
+  {
+    CPaintDC dc(this); // device context for painting
+
+    // The device context handle travels in the WPARAM slot of the
+    // message, so the cast has to name WPARAM as its target type.
+    SendMessage(WM_ICONERASEBKGND, reinterpret_cast<WPARAM>(dc.GetSafeHdc()), 0);
+
+    // Center icon in client rectangle
+    int cxIcon = GetSystemMetrics(SM_CXICON);
+    int cyIcon = GetSystemMetrics(SM_CYICON);
+    CRect rect;
+    GetClientRect(&rect);
+    int x = (rect.Width() - cxIcon + 1) / 2;
+    int y = (rect.Height() - cyIcon + 1) / 2;
+
+    // Draw the icon
+    dc.DrawIcon(x, y, m_hIcon);
+  }
+  else
+  {
+    CDialogEx::OnPaint();
+  }
+}
+
+// Reports whether `filename` can be opened for reading.
+//
+// @param filename  Path of the file to probe.
+// @return true if an input stream opened on the path is in a good state.
+bool Exists(const std::string &filename) {
+  return std::ifstream(filename).good();
+}
+
+// Writes the usage hints into the hint box, one appended line per entry.
+void CNonStreamingTextToSpeechDlg::InitHint() {
+  static const char *const kHintLines[] = {
+      "Speaker ID: Used only for multi-speaker models. Example value: 0",
+      "Speed: Larger -> Faster in speech speed. Example value: 1.0",
+      "\r\n\r\nPlease input your text and click the button Generate",
+  };
+
+  for (const char *hint : kHintLines) {
+    AppendLineToMultilineEditCtrl(my_hint_, hint);
+  }
+}
+
+// Application-specific dialog set-up.
+//
+// Shows the usage hints, fills in the default speaker ID and speed, and
+// checks that ./model.onnx, ./lexicon.txt and ./tokens.txt are readable.
+// If any of them is missing, the Generate button is disabled, download
+// instructions are appended to the hint box and the function returns
+// without creating the engine. Otherwise the VITS TTS engine is created
+// and stored in tts_.
+void CNonStreamingTextToSpeechDlg::Init() {
+  InitHint();
+  speaker_id_.SetWindowText(Utf8ToUtf16("0").c_str());
+  speed_.SetWindowText(Utf8ToUtf16("1.0").c_str());
+
+  // Required model files, each with the complaint shown when it is missing.
+  struct RequiredFile {
+    const char *path;
+    const char *complaint;
+  };
+  static const RequiredFile kRequiredFiles[] = {
+      {"./model.onnx", "Cannot find ./model.onnx\r\n"},
+      {"./lexicon.txt", "Cannot find ./lexicon.txt\r\n"},
+      {"./tokens.txt", "Cannot find ./tokens.txt\r\n"},
+  };
+
+  std::string error_message = "--------------------";
+  bool any_missing = false;
+  for (const RequiredFile &file : kRequiredFiles) {
+    if (!Exists(file.path)) {
+      error_message += file.complaint;
+      any_missing = true;
+    }
+  }
+
+  if (any_missing) {
+    generate_btn_.EnableWindow(FALSE);
+    error_message +=
+        "\r\nPlease refer to\r\n"
+        "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html"
+        "\r\nto download models.\r\n"
+        "\r\nWe given an example below\r\n\r\n"
+        "wget -O model.onnx "
+        "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
+        "vits-aishell3.onnx\r\n"
+        "wget "
+        "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
+        "lexicon.txt\r\n"
+        "wget "
+        "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/"
+        "tokens.txt\r\n";
+
+    AppendLineToMultilineEditCtrl(my_hint_, error_message);
+    return;
+  }
+
+  // All files are present: configure and create the TTS engine. Every
+  // field that is not set explicitly below stays zeroed.
+  SherpaOnnxOfflineTtsConfig config;
+  memset(&config, 0, sizeof(config));
+  config.model.vits.model = "./model.onnx";
+  config.model.vits.lexicon = "./lexicon.txt";
+  config.model.vits.tokens = "./tokens.txt";
+  config.model.provider = "cpu";
+  config.model.num_threads = 1;
+  config.model.debug = 0;
+
+  tts_ = SherpaOnnxCreateOfflineTts(&config);
+}
+
+ // Destroys the TTS engine, if tts_ holds one.
+ CNonStreamingTextToSpeechDlg::~CNonStreamingTextToSpeechDlg() {
+   if (!tts_) {
+     return;
+   }
+   SherpaOnnxDestroyOfflineTts(tts_);
+ }
+
+
+
+// Handler for the Generate (IDOK) button.
+//
+// Reads the speaker ID, the speed and the text from the dialog, runs the
+// TTS engine and writes the result to ./generated.wav. Every failure is
+// reported with a message box and the dialog stays open.
+void CNonStreamingTextToSpeechDlg::OnBnClickedOk() {
+  // Init() creates the engine only after all model files were found.
+  if (!tts_) {
+    AfxMessageBox(Utf8ToUtf16("The TTS engine is not initialized").c_str(), MB_OK);
+    return;
+  }
+
+  CString s;
+  speaker_id_.GetWindowText(s);
+  int speaker_id = _ttoi(s);
+  if (speaker_id < 0) {
+    AfxMessageBox(Utf8ToUtf16("Please input a valid speaker ID").c_str(), MB_OK);
+    return;
+  }
+
+  speed_.GetWindowText(s);
+  // _ttof yields 0 for text that is not a number, so rejecting 0 together
+  // with negative values also rejects unparsable input.
+  const float speed = static_cast<float>(_ttof(s));
+  if (speed <= 0) {
+    AfxMessageBox(Utf8ToUtf16("Please input a valid speed").c_str(), MB_OK);
+    return;
+  }
+
+  my_text_.GetWindowText(s);
+  // Convert to UTF-8 explicitly. Without a code page argument the
+  // conversion uses the ANSI code page, which cannot represent arbitrary
+  // text. NOTE(review): the engine is presumed to expect UTF-8, like the
+  // other strings in this file - confirm against the C API.
+  CT2CA utf8_text(s, CP_UTF8);
+  std::string ss(utf8_text);
+  if (ss.empty()) {
+    AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK);
+    return;
+  }
+
+  const SherpaOnnxGeneratedAudio *audio =
+      SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed);
+  // Generation yields a null pointer when no samples were produced, so
+  // the result must be checked before it is dereferenced.
+  if (!audio) {
+    AfxMessageBox(Utf8ToUtf16("Failed to generate audio").c_str(), MB_OK);
+    return;
+  }
+
+  std::string filename = "./generated.wav";
+  int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate,
+                               filename.c_str());
+
+  SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
+
+  if (ok) {
+    AfxMessageBox(Utf8ToUtf16("Saved to ./generated.wav successfully").c_str(), MB_OK);
+  } else {
+    AfxMessageBox(Utf8ToUtf16("Failed to save to ./generated.wav").c_str(), MB_OK);
+  }
+
+  // CDialogEx::OnOK() is intentionally not called, so the dialog stays
+  // open for further requests.
+}
diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h
new file mode 100644
index 000000000..29d8a82f6
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h
@@ -0,0 +1,48 @@
+
+// NonStreamingTextToSpeechDlg.h : header file
+//
+
+#pragma once
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+
+// CNonStreamingTextToSpeechDlg dialog
+// Main dialog of the example: collects a speaker ID, a speed and a text,
+// and generates speech from them through the sherpa-onnx C API.
+class CNonStreamingTextToSpeechDlg : public CDialogEx
+{
+// Construction
+public:
+ CNonStreamingTextToSpeechDlg(CWnd* pParent = nullptr); // standard constructor
+ // Destroys the TTS engine referenced by tts_, if any.
+ ~CNonStreamingTextToSpeechDlg();
+
+// Dialog Data
+#ifdef AFX_DESIGN_TIME
+ enum { IDD = IDD_NONSTREAMINGTEXTTOSPEECH_DIALOG };
+#endif
+
+ protected:
+ virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support
+
+
+// Implementation
+protected:
+ // Icon loaded from IDR_MAINFRAME in the constructor.
+ HICON m_hIcon;
+
+ // Generated message map functions
+ virtual BOOL OnInitDialog();
+ afx_msg void OnSysCommand(UINT nID, LPARAM lParam);
+ afx_msg void OnPaint();
+ afx_msg HCURSOR OnQueryDragIcon();
+ DECLARE_MESSAGE_MAP()
+ public:
+ // Controls, bound in DoDataExchange():
+ // my_hint_ <-> IDC_HINT, speaker_id_ <-> IDC_SPEAKER, speed_ <-> IDC_SPEED.
+ CEdit my_hint_;
+ CEdit speaker_id_;
+ CEdit speed_;
+ // Called from OnInitDialog(): fills in hints and defaults, checks the
+ // model files and creates the TTS engine.
+ void Init();
+ // Writes the usage hints into my_hint_.
+ void InitHint();
+ // Bound to IDOK; disabled by Init() when a model file is missing.
+ CButton generate_btn_;
+ // Click handler for generate_btn_.
+ afx_msg void OnBnClickedOk();
+
+ // Engine handle, assigned in Init() and destroyed in the destructor.
+ // It has no in-class initializer, so the constructor is responsible for
+ // giving it a defined value before the destructor tests it.
+ SherpaOnnxOfflineTts *tts_;
+ // Bound to IDC_TEXT: the text to synthesize.
+ CEdit my_text_;
+};
diff --git a/mfc-examples/NonStreamingTextToSpeech/Resource.h b/mfc-examples/NonStreamingTextToSpeech/Resource.h
new file mode 100644
index 000000000..20bd50962
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/Resource.h
@@ -0,0 +1,24 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by NonStreamingTextToSpeech.rc
+//
+#define IDM_ABOUTBOX 0x0010
+#define IDD_ABOUTBOX 100
+#define IDS_ABOUTBOX 101
+#define IDD_NONSTREAMINGTEXTTOSPEECH_DIALOG 102
+#define IDR_MAINFRAME 128
+#define IDC_SPEAKER 1000
+#define IDC_SPEED 1003
+#define IDC_TEXT 1004
+#define IDC_HINT 1005
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 130
+#define _APS_NEXT_COMMAND_VALUE 32771
+#define _APS_NEXT_CONTROL_VALUE 1006
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
diff --git a/mfc-examples/NonStreamingTextToSpeech/framework.h b/mfc-examples/NonStreamingTextToSpeech/framework.h
new file mode 100644
index 000000000..b25aff0c0
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/framework.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#ifndef VC_EXTRALEAN
+#define VC_EXTRALEAN // Exclude rarely-used stuff from Windows headers
+#endif
+
+#include "targetver.h"
+
+#define _ATL_CSTRING_EXPLICIT_CONSTRUCTORS // some CString constructors will be explicit
+
+// turns off MFC's hiding of some common and often safely ignored warning messages
+#define _AFX_ALL_WARNINGS
+
+#include // MFC core and standard components
+#include // MFC extensions
+
+
+
+
+
+#ifndef _AFX_NO_OLE_SUPPORT
+#include // MFC support for Internet Explorer 4 Common Controls
+#endif
+#ifndef _AFX_NO_AFXCMN_SUPPORT
+#include // MFC support for Windows Common Controls
+#endif // _AFX_NO_AFXCMN_SUPPORT
+
+#include // MFC support for ribbons and control bars
+
+
+
+
+
+
+
+
+
+
+
diff --git a/mfc-examples/NonStreamingTextToSpeech/pch.cpp b/mfc-examples/NonStreamingTextToSpeech/pch.cpp
new file mode 100644
index 000000000..64b7eef6d
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/pch.cpp
@@ -0,0 +1,5 @@
+// pch.cpp: source file corresponding to the pre-compiled header
+
+#include "pch.h"
+
+// When you are using pre-compiled headers, this source file is necessary for compilation to succeed.
diff --git a/mfc-examples/NonStreamingTextToSpeech/pch.h b/mfc-examples/NonStreamingTextToSpeech/pch.h
new file mode 100644
index 000000000..885d5d62e
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/pch.h
@@ -0,0 +1,13 @@
+// pch.h: This is a precompiled header file.
+// Files listed below are compiled only once, improving build performance for future builds.
+// This also affects IntelliSense performance, including code completion and many code browsing features.
+// However, files listed here are ALL re-compiled if any one of them is updated between builds.
+// Do not add files here that you will be updating frequently as this negates the performance advantage.
+
+#ifndef PCH_H
+#define PCH_H
+
+// add headers that you want to pre-compile here
+#include "framework.h"
+
+#endif //PCH_H
diff --git a/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.ico b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.ico
new file mode 100644
index 000000000..d56fbcdfd
Binary files /dev/null and b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.ico differ
diff --git a/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.rc2 b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.rc2
new file mode 100644
index 000000000..6702c7ec9
Binary files /dev/null and b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.rc2 differ
diff --git a/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props b/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props
new file mode 100644
index 000000000..4c144708a
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props
@@ -0,0 +1,53 @@
+
+
+
+
+
+ ..\..\build
+ ..\..\build\install
+
+ sherpa-onnx-portaudio_static.lib;
+ sherpa-onnx-c-api.lib;
+ sherpa-onnx-core.lib;
+ kaldi-decoder-core.lib;
+ sherpa-onnx-kaldifst-core.lib;
+ sherpa-onnx-fst.lib;
+ kaldi-native-fbank-core.lib;
+ absl_base.lib;
+ absl_city.lib;
+ absl_hash.lib;
+ absl_low_level_hash.lib;
+ absl_raw_hash_set.lib;
+ absl_raw_logging_internal.lib;
+ absl_throw_delegate.lib;
+ clog.lib;
+ cpuinfo.lib;
+ flatbuffers.lib;
+ libprotobuf-lite.lib;
+ onnx.lib;
+ onnx_proto.lib;
+ onnxruntime_common.lib;
+ onnxruntime_flatbuffers.lib;
+ onnxruntime_framework.lib;
+ onnxruntime_graph.lib;
+ onnxruntime_mlas.lib;
+ onnxruntime_optimizer.lib;
+ onnxruntime_providers.lib;
+ onnxruntime_session.lib;
+ onnxruntime_util.lib;
+ re2.lib;
+
+
+
+
+
+ $(SherpaOnnxBuildDirectory)\_deps\portaudio-src\include;
+ $(SherpaOnnxInstallDirectory)\include;%(AdditionalIncludeDirectories)
+
+
+ $(SherpaOnnxInstallDirectory)\lib;%(AdditionalLibraryDirectories)
+ $(SherpaOnnxLibraries);
+
+
+
+
diff --git a/mfc-examples/NonStreamingTextToSpeech/targetver.h b/mfc-examples/NonStreamingTextToSpeech/targetver.h
new file mode 100644
index 000000000..87c0086de
--- /dev/null
+++ b/mfc-examples/NonStreamingTextToSpeech/targetver.h
@@ -0,0 +1,8 @@
+#pragma once
+
+// Including SDKDDKVer.h defines the highest available Windows platform.
+
+// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
+// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
+
+#include
diff --git a/mfc-examples/mfc-examples.sln b/mfc-examples/mfc-examples.sln
index 807d8a5be..1a6c79c6d 100644
--- a/mfc-examples/mfc-examples.sln
+++ b/mfc-examples/mfc-examples.sln
@@ -7,6 +7,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "StreamingSpeechRecognition"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NonStreamingSpeechRecognition", "NonStreamingSpeechRecognition\NonStreamingSpeechRecognition.vcxproj", "{0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}"
EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NonStreamingTextToSpeech", "NonStreamingTextToSpeech\NonStreamingTextToSpeech.vcxproj", "{9A5F2CCC-1AAB-4F7F-A608-F0B512023405}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
@@ -31,6 +33,14 @@ Global
{0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x64.Build.0 = Release|x64
{0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.ActiveCfg = Release|Win32
{0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.Build.0 = Release|Win32
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x64.ActiveCfg = Debug|x64
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x64.Build.0 = Debug|x64
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x86.ActiveCfg = Debug|Win32
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x86.Build.0 = Debug|Win32
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x64.ActiveCfg = Release|x64
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x64.Build.0 = Release|x64
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x86.ActiveCfg = Release|Win32
+ {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go
index 8503fc6b2..923214601 100644
--- a/scripts/go/sherpa_onnx.go
+++ b/scripts/go/sherpa_onnx.go
@@ -572,11 +572,11 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts {
return tts
}
-func (tts *OfflineTts) Generate(text string, sid int) *GeneratedAudio {
+func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio {
s := C.CString(text)
defer C.free(unsafe.Pointer(s))
- audio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, s, C.int(sid))
+ audio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, s, C.int(sid), C.float(speed))
defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio)
ans := &GeneratedAudio{}
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index 9be6ff807..6a7a1dd61 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -568,8 +568,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts(
void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; }
const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
- const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid) {
- sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid);
+ const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
+ float speed) {
+ sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid, speed);
if (audio.samples.empty()) {
return nullptr;
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index aab342390..19def531d 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -639,7 +639,8 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts);
// The user has to use DestroyOfflineTtsGeneratedAudio() to free the returned
// pointer to avoid memory leak.
SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate(
- const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid);
+ const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid,
+ float speed);
SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio(
const SherpaOnnxGeneratedAudio *p);
diff --git a/sherpa-onnx/csrc/offline-tts-impl.h b/sherpa-onnx/csrc/offline-tts-impl.h
index 1de5590ea..41835a99d 100644
--- a/sherpa-onnx/csrc/offline-tts-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-impl.h
@@ -18,8 +18,8 @@ class OfflineTtsImpl {
static std::unique_ptr Create(const OfflineTtsConfig &config);
- virtual GeneratedAudio Generate(const std::string &text,
- int64_t sid = 0) const = 0;
+ virtual GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
+ float speed = 1.0) const = 0;
};
} // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h
index 6b19024e0..142035689 100644
--- a/sherpa-onnx/csrc/offline-tts-vits-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h
@@ -24,8 +24,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
model_->Punctuations(), model_->Language(),
config.model.debug) {}
- GeneratedAudio Generate(const std::string &text,
- int64_t sid = 0) const override {
+ GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
+ float speed = 1.0) const override {
int32_t num_speakers = model_->NumSpeakers();
if (num_speakers == 0 && sid != 0) {
SHERPA_ONNX_LOGE(
@@ -66,7 +66,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl {
Ort::Value x_tensor = Ort::Value::CreateTensor(
memory_info, x.data(), x.size(), x_shape.data(), x_shape.size());
- Ort::Value audio = model_->Run(std::move(x_tensor), sid);
+ Ort::Value audio = model_->Run(std::move(x_tensor), sid, speed);
std::vector audio_shape =
audio.GetTensorTypeAndShapeInfo().GetShape();
diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc
index dcb90079a..71c19e570 100644
--- a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc
+++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc
@@ -17,7 +17,7 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) {
po->Register("vits-noise-scale-w", &noise_scale_w,
"noise_scale_w for VITS models");
po->Register("vits-length-scale", &length_scale,
- "length_scale for VITS models");
+ "Speech speed. Larger->Slower; Smaller->faster.");
}
bool OfflineTtsVitsModelConfig::Validate() const {
diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.cc b/sherpa-onnx/csrc/offline-tts-vits-model.cc
index 4f2ae9ec0..060aa8b94 100644
--- a/sherpa-onnx/csrc/offline-tts-vits-model.cc
+++ b/sherpa-onnx/csrc/offline-tts-vits-model.cc
@@ -26,7 +26,7 @@ class OfflineTtsVitsModel::Impl {
Init(buf.data(), buf.size());
}
- Ort::Value Run(Ort::Value x, int64_t sid) {
+ Ort::Value Run(Ort::Value x, int64_t sid, float speed) {
auto memory_info =
Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
@@ -48,6 +48,10 @@ class OfflineTtsVitsModel::Impl {
float length_scale = config_.vits.length_scale;
float noise_scale_w = config_.vits.noise_scale_w;
+ if (speed != 1 && speed > 0) {
+ length_scale = 1. / speed;
+ }
+
Ort::Value noise_scale_tensor =
Ort::Value::CreateTensor(memory_info, &noise_scale, 1, &scale_shape, 1);
@@ -139,8 +143,9 @@ OfflineTtsVitsModel::OfflineTtsVitsModel(const OfflineTtsModelConfig &config)
OfflineTtsVitsModel::~OfflineTtsVitsModel() = default;
-Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/) {
- return impl_->Run(std::move(x), sid);
+Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/,
+ float speed /*= 1.0*/) {
+ return impl_->Run(std::move(x), sid, speed);
}
int32_t OfflineTtsVitsModel::SampleRate() const { return impl_->SampleRate(); }
diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.h b/sherpa-onnx/csrc/offline-tts-vits-model.h
index a3870fbd7..31addfdfe 100644
--- a/sherpa-onnx/csrc/offline-tts-vits-model.h
+++ b/sherpa-onnx/csrc/offline-tts-vits-model.h
@@ -29,7 +29,7 @@ class OfflineTtsVitsModel {
* @return Return a float32 tensor containing audio samples. You can flatten
* it to a 1-D tensor.
*/
- Ort::Value Run(Ort::Value x, int64_t sid = 0);
+ Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0);
// Sample rate of the generated audio
int32_t SampleRate() const;
diff --git a/sherpa-onnx/csrc/offline-tts.cc b/sherpa-onnx/csrc/offline-tts.cc
index 36ec9beee..94a288354 100644
--- a/sherpa-onnx/csrc/offline-tts.cc
+++ b/sherpa-onnx/csrc/offline-tts.cc
@@ -28,9 +28,9 @@ OfflineTts::OfflineTts(const OfflineTtsConfig &config)
OfflineTts::~OfflineTts() = default;
-GeneratedAudio OfflineTts::Generate(const std::string &text,
- int64_t sid /*=0*/) const {
- return impl_->Generate(text, sid);
+GeneratedAudio OfflineTts::Generate(const std::string &text, int64_t sid /*=0*/,
+ float speed /*= 1.0*/) const {
+ return impl_->Generate(text, sid, speed);
}
} // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/offline-tts.h b/sherpa-onnx/csrc/offline-tts.h
index 6e0b1402d..c2d87461b 100644
--- a/sherpa-onnx/csrc/offline-tts.h
+++ b/sherpa-onnx/csrc/offline-tts.h
@@ -43,7 +43,8 @@ class OfflineTts {
// trained using the VCTK dataset. It is not used for
// single-speaker models, e.g., models trained using the ljspeech
// dataset.
- GeneratedAudio Generate(const std::string &text, int64_t sid = 0) const;
+ GeneratedAudio Generate(const std::string &text, int64_t sid = 0,
+ float speed = 1.0) const;
private:
std::unique_ptr impl_;
diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc
index 81af16917..11f91c07d 100644
--- a/sherpa-onnx/python/csrc/offline-tts.cc
+++ b/sherpa-onnx/python/csrc/offline-tts.cc
@@ -40,7 +40,8 @@ void PybindOfflineTts(py::module *m) {
using PyClass = OfflineTts;
py::class_(*m, "OfflineTts")
.def(py::init(), py::arg("config"))
- .def("generate", &PyClass::Generate, py::arg("text"), py::arg("sid") = 0);
+ .def("generate", &PyClass::Generate, py::arg("text"), py::arg("sid") = 0,
+ py::arg("speed") = 1.0);
}
} // namespace sherpa_onnx