diff --git a/c-api-examples/offline-tts-c-api.c b/c-api-examples/offline-tts-c-api.c index 2b11eb096..c4e9be62b 100644 --- a/c-api-examples/offline-tts-c-api.c +++ b/c-api-examples/offline-tts-c-api.c @@ -186,7 +186,7 @@ int32_t main(int32_t argc, char *argv[]) { SherpaOnnxOfflineTts *tts = SherpaOnnxCreateOfflineTts(&config); const SherpaOnnxGeneratedAudio *audio = - SherpaOnnxOfflineTtsGenerate(tts, text, sid); + SherpaOnnxOfflineTtsGenerate(tts, text, sid, 1.0); SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, filename); diff --git a/go-api-examples/non-streaming-tts/main.go b/go-api-examples/non-streaming-tts/main.go index a263b42b2..1728d8e1f 100644 --- a/go-api-examples/non-streaming-tts/main.go +++ b/go-api-examples/non-streaming-tts/main.go @@ -49,7 +49,7 @@ func main() { log.Println("Start generating!") - audio := tts.Generate(text, sid) + audio := tts.Generate(text, sid, 1.0) log.Println("Done!") diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.cpp b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.cpp new file mode 100644 index 000000000..2c87b1600 --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.cpp @@ -0,0 +1,92 @@ + +// NonStreamingTextToSpeech.cpp : Defines the class behaviors for the application. 
+// + +#include "pch.h" +#include "framework.h" +#include "NonStreamingTextToSpeech.h" +#include "NonStreamingTextToSpeechDlg.h" + +#ifdef _DEBUG +#define new DEBUG_NEW +#endif + + +// CNonStreamingTextToSpeechApp + +BEGIN_MESSAGE_MAP(CNonStreamingTextToSpeechApp, CWinApp) + ON_COMMAND(ID_HELP, &CWinApp::OnHelp) +END_MESSAGE_MAP() + + +// CNonStreamingTextToSpeechApp construction + +CNonStreamingTextToSpeechApp::CNonStreamingTextToSpeechApp() +{ + // TODO: add construction code here, + // Place all significant initialization in InitInstance +} + + +// The one and only CNonStreamingTextToSpeechApp object + +CNonStreamingTextToSpeechApp theApp; + + +// CNonStreamingTextToSpeechApp initialization + +BOOL CNonStreamingTextToSpeechApp::InitInstance() +{ + CWinApp::InitInstance(); + + + // Create the shell manager, in case the dialog contains + // any shell tree view or shell list view controls. + CShellManager *pShellManager = new CShellManager; + + // Activate "Windows Native" visual manager for enabling themes in MFC controls + CMFCVisualManager::SetDefaultManager(RUNTIME_CLASS(CMFCVisualManagerWindows)); + + // Standard initialization + // If you are not using these features and wish to reduce the size + // of your final executable, you should remove from the following + // the specific initialization routines you do not need + // Change the registry key under which our settings are stored + // TODO: You should modify this string to be something appropriate + // such as the name of your company or organization + SetRegistryKey(_T("Local AppWizard-Generated Applications")); + + CNonStreamingTextToSpeechDlg dlg; + m_pMainWnd = &dlg; + INT_PTR nResponse = dlg.DoModal(); + if (nResponse == IDOK) + { + // TODO: Place code here to handle when the dialog is + // dismissed with OK + } + else if (nResponse == IDCANCEL) + { + // TODO: Place code here to handle when the dialog is + // dismissed with Cancel + } + else if (nResponse == -1) + { + TRACE(traceAppMsg, 0, "Warning: 
dialog creation failed, so application is terminating unexpectedly.\n"); + TRACE(traceAppMsg, 0, "Warning: if you are using MFC controls on the dialog, you cannot #define _AFX_NO_MFC_CONTROLS_IN_DIALOGS.\n"); + } + + // Delete the shell manager created above. + if (pShellManager != nullptr) + { + delete pShellManager; + } + +#if !defined(_AFXDLL) && !defined(_AFX_NO_MFC_CONTROLS_IN_DIALOGS) + ControlBarCleanUp(); +#endif + + // Since the dialog has been closed, return FALSE so that we exit the + // application, rather than start the application's message pump. + return FALSE; +} + diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.h b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.h new file mode 100644 index 000000000..19b2ff365 --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.h @@ -0,0 +1,32 @@ + +// NonStreamingTextToSpeech.h : main header file for the PROJECT_NAME application +// + +#pragma once + +#ifndef __AFXWIN_H__ + #error "include 'pch.h' before including this file for PCH" +#endif + +#include "resource.h" // main symbols + + +// CNonStreamingTextToSpeechApp: +// See NonStreamingTextToSpeech.cpp for the implementation of this class +// + +class CNonStreamingTextToSpeechApp : public CWinApp +{ +public: + CNonStreamingTextToSpeechApp(); + +// Overrides +public: + virtual BOOL InitInstance(); + +// Implementation + + DECLARE_MESSAGE_MAP() +}; + +extern CNonStreamingTextToSpeechApp theApp; diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc new file mode 100644 index 000000000..087e9b310 Binary files /dev/null and b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.rc differ diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj new file mode 100644 index 000000000..d7eb6fd1b --- 
/dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj @@ -0,0 +1,219 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405} + MFCProj + NonStreamingTextToSpeech + 10.0 + + + + Application + true + v143 + Unicode + Dynamic + + + Application + false + v143 + true + Unicode + Dynamic + + + Application + true + v143 + Unicode + Dynamic + + + Application + false + v143 + true + Unicode + Static + + + + + + + + + + + + + + + + + + + + + + + + + false + + + true + + + true + + + false + + + + Use + Level3 + true + true + true + _WINDOWS;NDEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + true + true + + + false + true + NDEBUG;%(PreprocessorDefinitions) + + + 0x0409 + NDEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + WIN32;_WINDOWS;_DEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + + + false + true + _DEBUG;%(PreprocessorDefinitions) + + + 0x0409 + _DEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + _WINDOWS;_DEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + + + false + true + _DEBUG;%(PreprocessorDefinitions) + + + 0x0409 + _DEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + Use + Level3 + true + true + true + WIN32;_WINDOWS;NDEBUG;%(PreprocessorDefinitions) + pch.h + + + Windows + true + true + + + false + true + NDEBUG;%(PreprocessorDefinitions) + + + 0x0409 + NDEBUG;%(PreprocessorDefinitions) + $(IntDir);%(AdditionalIncludeDirectories) + + + + + + + + + + + + + + + Create + Create + Create + Create + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj.filters b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj.filters new file mode 100644 index 000000000..689493c4d --- 
/dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeech.vcxproj.filters @@ -0,0 +1,63 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + Header Files + + + + + Source Files + + + Source Files + + + Source Files + + + + + Resource Files + + + + + Resource Files + + + + + Resource Files + + + \ No newline at end of file diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp new file mode 100644 index 000000000..a6e6d3552 --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.cpp @@ -0,0 +1,357 @@ + +// NonStreamingTextToSpeechDlg.cpp : implementation file +// + +#include "pch.h" +#include "framework.h" +#include "NonStreamingTextToSpeech.h" +#include "NonStreamingTextToSpeechDlg.h" +#include "afxdialogex.h" + +#include +#include +#include +#include + +#ifdef _DEBUG +#define new DEBUG_NEW +#endif + + +// CAboutDlg dialog used for App About + +class CAboutDlg : public CDialogEx +{ +public: + CAboutDlg(); + +// Dialog Data +#ifdef AFX_DESIGN_TIME + enum { IDD = IDD_ABOUTBOX }; +#endif + + protected: + virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support + +// Implementation +protected: + DECLARE_MESSAGE_MAP() +}; + +CAboutDlg::CAboutDlg() : CDialogEx(IDD_ABOUTBOX) +{ +} + +void CAboutDlg::DoDataExchange(CDataExchange* pDX) +{ + CDialogEx::DoDataExchange(pDX); +} + +BEGIN_MESSAGE_MAP(CAboutDlg, CDialogEx) +END_MESSAGE_MAP() + + +// CNonStreamingTextToSpeechDlg dialog + +// see +// 
// https://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
//
// Convert a UTF-8 encoded byte string to UTF-16.
//
// @param utf8  Input bytes, interpreted as UTF-8.
// @return The same text as a sequence of UTF-16 code units. Code points
//         above U+FFFF are emitted as a surrogate pair (two wchar_t values).
// @throws std::logic_error("not a UTF-8 string") if the input contains a
//         stray continuation byte, an invalid lead byte (0xF8..0xFF), a
//         truncated or malformed multi-byte sequence, an encoded surrogate
//         (U+D800..U+DFFF) or a value above U+10FFFF.
//
// Note: overlong encodings (e.g. "\xC0\x80") are not rejected; they decode
// to the code point they spell out.
static std::wstring Utf8ToUtf16(const std::string &utf8) {
  std::wstring utf16;
  utf16.reserve(utf8.size());  // upper bound: one code unit per input byte

  size_t i = 0;
  while (i < utf8.size()) {
    unsigned long uni = 0;  // code point being assembled
    size_t todo = 0;        // number of continuation bytes still expected

    const unsigned char lead = static_cast<unsigned char>(utf8[i++]);
    if (lead <= 0x7F) {
      uni = lead;
      todo = 0;
    } else if (lead <= 0xBF) {
      // 0x80..0xBF is a continuation byte; it cannot start a sequence.
      throw std::logic_error("not a UTF-8 string");
    } else if (lead <= 0xDF) {
      uni = lead & 0x1F;
      todo = 1;
    } else if (lead <= 0xEF) {
      uni = lead & 0x0F;
      todo = 2;
    } else if (lead <= 0xF7) {
      uni = lead & 0x07;
      todo = 3;
    } else {
      throw std::logic_error("not a UTF-8 string");
    }

    for (size_t j = 0; j < todo; ++j) {
      // The sequence must not run past the end of the input.
      if (i == utf8.size()) throw std::logic_error("not a UTF-8 string");

      const unsigned char cont = static_cast<unsigned char>(utf8[i++]);
      if (cont < 0x80 || cont > 0xBF) {
        throw std::logic_error("not a UTF-8 string");
      }

      uni <<= 6;
      uni += cont & 0x3F;
    }

    // Surrogates are reserved for UTF-16 and must not appear in UTF-8.
    if (uni >= 0xD800 && uni <= 0xDFFF) {
      throw std::logic_error("not a UTF-8 string");
    }

    if (uni > 0x10FFFF) throw std::logic_error("not a UTF-8 string");

    if (uni <= 0xFFFF) {
      utf16 += static_cast<wchar_t>(uni);
    } else {
      // Split into a high/low surrogate pair.
      uni -= 0x10000;
      utf16 += static_cast<wchar_t>((uni >> 10) + 0xD800);
      utf16 += static_cast<wchar_t>((uni & 0x3FF) + 0xDC00);
    }
  }

  return utf16;
}

// The system calls this function to obtain the cursor to display while the user drags
// the minimized window.
+HCURSOR CNonStreamingTextToSpeechDlg::OnQueryDragIcon() +{ + return static_cast(m_hIcon); +} + + +void AppendTextToEditCtrl(CEdit& e, const std::string &s) { + // get the initial text length + int nLength = e.GetWindowTextLength(); + // put the selection at the end of text + e.SetSel(nLength, nLength); + // replace the selection + + std::wstring wstr = Utf8ToUtf16(s); + + // my_text_.ReplaceSel(wstr.c_str()); + e.ReplaceSel(wstr.c_str()); +} + +void AppendLineToMultilineEditCtrl(CEdit& e, const std::string &s) { + AppendTextToEditCtrl(e, "\r\n" + s); +} + + +CNonStreamingTextToSpeechDlg::CNonStreamingTextToSpeechDlg(CWnd* pParent /*=nullptr*/) + : CDialogEx(IDD_NONSTREAMINGTEXTTOSPEECH_DIALOG, pParent) + { + m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME); +} + +void CNonStreamingTextToSpeechDlg::DoDataExchange(CDataExchange* pDX) +{ + CDialogEx::DoDataExchange(pDX); + DDX_Control(pDX, IDC_HINT, my_hint_); + DDX_Control(pDX, IDC_SPEAKER, speaker_id_); + DDX_Control(pDX, IDC_SPEED, speed_); + DDX_Control(pDX, IDOK, generate_btn_); + DDX_Control(pDX, IDC_TEXT, my_text_); +} + +BEGIN_MESSAGE_MAP(CNonStreamingTextToSpeechDlg, CDialogEx) + ON_WM_SYSCOMMAND() + ON_WM_PAINT() + ON_WM_QUERYDRAGICON() + ON_BN_CLICKED(IDOK, &CNonStreamingTextToSpeechDlg::OnBnClickedOk) + END_MESSAGE_MAP() + + +// CNonStreamingTextToSpeechDlg message handlers + +BOOL CNonStreamingTextToSpeechDlg::OnInitDialog() +{ + CDialogEx::OnInitDialog(); + + // Add "About..." menu item to system menu. + + // IDM_ABOUTBOX must be in the system command range. 
+ ASSERT((IDM_ABOUTBOX & 0xFFF0) == IDM_ABOUTBOX); + ASSERT(IDM_ABOUTBOX < 0xF000); + + CMenu* pSysMenu = GetSystemMenu(FALSE); + if (pSysMenu != nullptr) + { + BOOL bNameValid; + CString strAboutMenu; + bNameValid = strAboutMenu.LoadString(IDS_ABOUTBOX); + ASSERT(bNameValid); + if (!strAboutMenu.IsEmpty()) + { + pSysMenu->AppendMenu(MF_SEPARATOR); + pSysMenu->AppendMenu(MF_STRING, IDM_ABOUTBOX, strAboutMenu); + } + } + + // Set the icon for this dialog. The framework does this automatically + // when the application's main window is not a dialog + SetIcon(m_hIcon, TRUE); // Set big icon + SetIcon(m_hIcon, FALSE); // Set small icon + + // TODO: Add extra initialization here + Init(); + + return TRUE; // return TRUE unless you set the focus to a control +} + +void CNonStreamingTextToSpeechDlg::OnSysCommand(UINT nID, LPARAM lParam) +{ + if ((nID & 0xFFF0) == IDM_ABOUTBOX) + { + CAboutDlg dlgAbout; + dlgAbout.DoModal(); + } + else + { + CDialogEx::OnSysCommand(nID, lParam); + } +} + +// If you add a minimize button to your dialog, you will need the code below +// to draw the icon . For MFC applications using the document/view model, +// this is automatically done for you by the framework. + +void CNonStreamingTextToSpeechDlg::OnPaint() +{ + if (IsIconic()) + { + CPaintDC dc(this); // device context for painting + + SendMessage(WM_ICONERASEBKGND, reinterpret_cast(dc.GetSafeHdc()), 0); + + // Center icon in client rectangle + int cxIcon = GetSystemMetrics(SM_CXICON); + int cyIcon = GetSystemMetrics(SM_CYICON); + CRect rect; + GetClientRect(&rect); + int x = (rect.Width() - cxIcon + 1) / 2; + int y = (rect.Height() - cyIcon + 1) / 2; + + // Draw the icon + dc.DrawIcon(x, y, m_hIcon); + } + else + { + CDialogEx::OnPaint(); + } +} + +bool Exists(const std::string &filename) { + std::ifstream is(filename); + return is.good(); +} + +void CNonStreamingTextToSpeechDlg::InitHint() { + AppendLineToMultilineEditCtrl(my_hint_, "Speaker ID: Used only for multi-speaker models. 
Example value: 0"); + AppendLineToMultilineEditCtrl(my_hint_, "Speed: Larger -> Faster in speech speed. Example value: 1.0"); + AppendLineToMultilineEditCtrl(my_hint_, "\r\n\r\nPlease input your text and click the button Generate"); + +} + +void CNonStreamingTextToSpeechDlg::Init() { + InitHint(); + speaker_id_.SetWindowText(Utf8ToUtf16("0").c_str()); + speed_.SetWindowText(Utf8ToUtf16("1.0").c_str()); + + bool ok = true; + std::string error_message = "--------------------"; + if (!Exists("./model.onnx")) { + error_message += "Cannot find ./model.onnx\r\n"; + ok = false; + } + + if (!Exists("./lexicon.txt")) { + error_message += "Cannot find ./lexicon.txt\r\n"; + ok = false; + } + + if (!Exists("./tokens.txt")) { + error_message += "Cannot find ./tokens.txt\r\n"; + ok = false; + } + + if (!ok) { + generate_btn_.EnableWindow(FALSE); + error_message += + "\r\nPlease refer to\r\n" + "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html"; + error_message += "\r\nto download models.\r\n"; + error_message += "\r\nWe given an example below\r\n\r\n"; + error_message += + "wget -O model.onnx " + "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" + "vits-aishell3.onnx\r\n"; + error_message += + "wget " + "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" + "lexicon.txt\r\n"; + error_message += + "wget " + "https://huggingface.co/csukuangfj/vits-zh-aishell3/resolve/main/" + "tokens.txt\r\n"; + + AppendLineToMultilineEditCtrl(my_hint_, error_message); + return; + } + + // Now init tts + SherpaOnnxOfflineTtsConfig config; + memset(&config, 0, sizeof(config)); + config.model.debug = 0; + config.model.num_threads = 1; + config.model.provider = "cpu"; + config.model.vits.model = "./model.onnx"; + config.model.vits.lexicon = "./lexicon.txt"; + config.model.vits.tokens = "./tokens.txt"; + + tts_ = SherpaOnnxCreateOfflineTts(&config); +} + + CNonStreamingTextToSpeechDlg::~CNonStreamingTextToSpeechDlg() { + if (tts_) { + 
SherpaOnnxDestroyOfflineTts(tts_); + } + } + + + +void CNonStreamingTextToSpeechDlg::OnBnClickedOk() { + // TODO: Add your control notification handler code here + CString s; + speaker_id_.GetWindowText(s); + int speaker_id = _ttoi(s); + if (speaker_id < 0) { + AfxMessageBox(Utf8ToUtf16("Please input a valid speaker ID").c_str(), MB_OK); + return; + } + + speed_.GetWindowText(s); + float speed = _ttof(s); + if (speed < 0) { + AfxMessageBox(Utf8ToUtf16("Please input a valid speed").c_str(), MB_OK); + return; + } + + my_text_.GetWindowText(s); + CT2CA pszConvertedAnsiString(s); + std::string ss(pszConvertedAnsiString); + if (ss.empty()) { + AfxMessageBox(Utf8ToUtf16("Please input your text").c_str(), MB_OK); + return; + } + +const SherpaOnnxGeneratedAudio *audio = + SherpaOnnxOfflineTtsGenerate(tts_, ss.c_str(), speaker_id, speed); + std::string filename = "./generated.wav"; +int ok = SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, + filename.c_str()); + + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); + + if (ok) { + AfxMessageBox(Utf8ToUtf16("Saved to ./generated.wav successfully").c_str(), MB_OK); + } else { + AfxMessageBox(Utf8ToUtf16("Failed to save to ./generated.wav").c_str(), MB_OK); + } + + //CDialogEx::OnOK(); +} diff --git a/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h new file mode 100644 index 000000000..29d8a82f6 --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/NonStreamingTextToSpeechDlg.h @@ -0,0 +1,48 @@ + +// NonStreamingTextToSpeechDlg.h : header file +// + +#pragma once + +#include "sherpa-onnx/c-api/c-api.h" + + +// CNonStreamingTextToSpeechDlg dialog +class CNonStreamingTextToSpeechDlg : public CDialogEx +{ +// Construction +public: + CNonStreamingTextToSpeechDlg(CWnd* pParent = nullptr); // standard constructor + ~CNonStreamingTextToSpeechDlg(); + +// Dialog Data +#ifdef AFX_DESIGN_TIME + enum { IDD = 
IDD_NONSTREAMINGTEXTTOSPEECH_DIALOG }; +#endif + + protected: + virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support + + +// Implementation +protected: + HICON m_hIcon; + + // Generated message map functions + virtual BOOL OnInitDialog(); + afx_msg void OnSysCommand(UINT nID, LPARAM lParam); + afx_msg void OnPaint(); + afx_msg HCURSOR OnQueryDragIcon(); + DECLARE_MESSAGE_MAP() + public: + CEdit my_hint_; + CEdit speaker_id_; + CEdit speed_; + void Init(); + void InitHint(); + CButton generate_btn_; + afx_msg void OnBnClickedOk(); + + SherpaOnnxOfflineTts *tts_; + CEdit my_text_; +}; diff --git a/mfc-examples/NonStreamingTextToSpeech/Resource.h b/mfc-examples/NonStreamingTextToSpeech/Resource.h new file mode 100644 index 000000000..20bd50962 --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/Resource.h @@ -0,0 +1,24 @@ +//{{NO_DEPENDENCIES}} +// Microsoft Visual C++ generated include file. +// Used by NonStreamingTextToSpeech.rc +// +#define IDM_ABOUTBOX 0x0010 +#define IDD_ABOUTBOX 100 +#define IDS_ABOUTBOX 101 +#define IDD_NONSTREAMINGTEXTTOSPEECH_DIALOG 102 +#define IDR_MAINFRAME 128 +#define IDC_SPEAKER 1000 +#define IDC_SPEED 1003 +#define IDC_TEXT 1004 +#define IDC_HINT 1005 + +// Next default values for new objects +// +#ifdef APSTUDIO_INVOKED +#ifndef APSTUDIO_READONLY_SYMBOLS +#define _APS_NEXT_RESOURCE_VALUE 130 +#define _APS_NEXT_COMMAND_VALUE 32771 +#define _APS_NEXT_CONTROL_VALUE 1006 +#define _APS_NEXT_SYMED_VALUE 101 +#endif +#endif diff --git a/mfc-examples/NonStreamingTextToSpeech/framework.h b/mfc-examples/NonStreamingTextToSpeech/framework.h new file mode 100644 index 000000000..b25aff0c0 --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/framework.h @@ -0,0 +1,39 @@ +#pragma once + +#ifndef VC_EXTRALEAN +#define VC_EXTRALEAN // Exclude rarely-used stuff from Windows headers +#endif + +#include "targetver.h" + +#define _ATL_CSTRING_EXPLICIT_CONSTRUCTORS // some CString constructors will be explicit + +// turns off 
MFC's hiding of some common and often safely ignored warning messages +#define _AFX_ALL_WARNINGS + +#include // MFC core and standard components +#include // MFC extensions + + + + + +#ifndef _AFX_NO_OLE_SUPPORT +#include // MFC support for Internet Explorer 4 Common Controls +#endif +#ifndef _AFX_NO_AFXCMN_SUPPORT +#include // MFC support for Windows Common Controls +#endif // _AFX_NO_AFXCMN_SUPPORT + +#include // MFC support for ribbons and control bars + + + + + + + + + + + diff --git a/mfc-examples/NonStreamingTextToSpeech/pch.cpp b/mfc-examples/NonStreamingTextToSpeech/pch.cpp new file mode 100644 index 000000000..64b7eef6d --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/pch.cpp @@ -0,0 +1,5 @@ +// pch.cpp: source file corresponding to the pre-compiled header + +#include "pch.h" + +// When you are using pre-compiled headers, this source file is necessary for compilation to succeed. diff --git a/mfc-examples/NonStreamingTextToSpeech/pch.h b/mfc-examples/NonStreamingTextToSpeech/pch.h new file mode 100644 index 000000000..885d5d62e --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/pch.h @@ -0,0 +1,13 @@ +// pch.h: This is a precompiled header file. +// Files listed below are compiled only once, improving build performance for future builds. +// This also affects IntelliSense performance, including code completion and many code browsing features. +// However, files listed here are ALL re-compiled if any one of them is updated between builds. +// Do not add files here that you will be updating frequently as this negates the performance advantage. 
+ +#ifndef PCH_H +#define PCH_H + +// add headers that you want to pre-compile here +#include "framework.h" + +#endif //PCH_H diff --git a/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.ico b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.ico new file mode 100644 index 000000000..d56fbcdfd Binary files /dev/null and b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.ico differ diff --git a/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.rc2 b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.rc2 new file mode 100644 index 000000000..6702c7ec9 Binary files /dev/null and b/mfc-examples/NonStreamingTextToSpeech/res/NonStreamingTextToSpeech.rc2 differ diff --git a/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props b/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props new file mode 100644 index 000000000..4c144708a --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/sherpa-onnx-deps.props @@ -0,0 +1,53 @@ + + + + + + ..\..\build + ..\..\build\install + + sherpa-onnx-portaudio_static.lib; + sherpa-onnx-c-api.lib; + sherpa-onnx-core.lib; + kaldi-decoder-core.lib; + sherpa-onnx-kaldifst-core.lib; + sherpa-onnx-fst.lib; + kaldi-native-fbank-core.lib; + absl_base.lib; + absl_city.lib; + absl_hash.lib; + absl_low_level_hash.lib; + absl_raw_hash_set.lib; + absl_raw_logging_internal.lib; + absl_throw_delegate.lib; + clog.lib; + cpuinfo.lib; + flatbuffers.lib; + libprotobuf-lite.lib; + onnx.lib; + onnx_proto.lib; + onnxruntime_common.lib; + onnxruntime_flatbuffers.lib; + onnxruntime_framework.lib; + onnxruntime_graph.lib; + onnxruntime_mlas.lib; + onnxruntime_optimizer.lib; + onnxruntime_providers.lib; + onnxruntime_session.lib; + onnxruntime_util.lib; + re2.lib; + + + + + + $(SherpaOnnxBuildDirectory)\_deps\portaudio-src\include; + $(SherpaOnnxInstallDirectory)\include;%(AdditionalIncludeDirectories) + + + 
$(SherpaOnnxInstallDirectory)\lib;%(AdditionalLibraryDirectories) + $(SherpaOnnxLibraries); + + + + diff --git a/mfc-examples/NonStreamingTextToSpeech/targetver.h b/mfc-examples/NonStreamingTextToSpeech/targetver.h new file mode 100644 index 000000000..87c0086de --- /dev/null +++ b/mfc-examples/NonStreamingTextToSpeech/targetver.h @@ -0,0 +1,8 @@ +#pragma once + +// Including SDKDDKVer.h defines the highest available Windows platform. + +// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and +// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. + +#include diff --git a/mfc-examples/mfc-examples.sln b/mfc-examples/mfc-examples.sln index 807d8a5be..1a6c79c6d 100644 --- a/mfc-examples/mfc-examples.sln +++ b/mfc-examples/mfc-examples.sln @@ -7,6 +7,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "StreamingSpeechRecognition" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NonStreamingSpeechRecognition", "NonStreamingSpeechRecognition\NonStreamingSpeechRecognition.vcxproj", "{0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NonStreamingTextToSpeech", "NonStreamingTextToSpeech\NonStreamingTextToSpeech.vcxproj", "{9A5F2CCC-1AAB-4F7F-A608-F0B512023405}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -31,6 +33,14 @@ Global {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x64.Build.0 = Release|x64 {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.ActiveCfg = Release|Win32 {0298EE00-7AF2-4A66-9D5F-AA0D92AC871D}.Release|x86.Build.0 = Release|Win32 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x64.ActiveCfg = Debug|x64 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x64.Build.0 = Debug|x64 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x86.ActiveCfg = Debug|Win32 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Debug|x86.Build.0 = Debug|Win32 + 
{9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x64.ActiveCfg = Release|x64 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x64.Build.0 = Release|x64 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x86.ActiveCfg = Release|Win32 + {9A5F2CCC-1AAB-4F7F-A608-F0B512023405}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 8503fc6b2..923214601 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -572,11 +572,11 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { return tts } -func (tts *OfflineTts) Generate(text string, sid int) *GeneratedAudio { +func (tts *OfflineTts) Generate(text string, sid int, speed float32) *GeneratedAudio { s := C.CString(text) defer C.free(unsafe.Pointer(s)) - audio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, s, C.int(sid)) + audio := C.SherpaOnnxOfflineTtsGenerate(tts.impl, s, C.int(sid), C.float(speed)) defer C.SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio) ans := &GeneratedAudio{} diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 9be6ff807..6a7a1dd61 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -568,8 +568,9 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts) { delete tts; } const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( - const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid) { - sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid); + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, + float speed) { + sherpa_onnx::GeneratedAudio audio = tts->impl->Generate(text, sid, speed); if (audio.samples.empty()) { return nullptr; diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index aab342390..19def531d 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -639,7 
+639,8 @@ SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTts(SherpaOnnxOfflineTts *tts); // The user has to use DestroyOfflineTtsGeneratedAudio() to free the returned // pointer to avoid memory leak. SHERPA_ONNX_API const SherpaOnnxGeneratedAudio *SherpaOnnxOfflineTtsGenerate( - const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid); + const SherpaOnnxOfflineTts *tts, const char *text, int32_t sid, + float speed); SHERPA_ONNX_API void SherpaOnnxDestroyOfflineTtsGeneratedAudio( const SherpaOnnxGeneratedAudio *p); diff --git a/sherpa-onnx/csrc/offline-tts-impl.h b/sherpa-onnx/csrc/offline-tts-impl.h index 1de5590ea..41835a99d 100644 --- a/sherpa-onnx/csrc/offline-tts-impl.h +++ b/sherpa-onnx/csrc/offline-tts-impl.h @@ -18,8 +18,8 @@ class OfflineTtsImpl { static std::unique_ptr Create(const OfflineTtsConfig &config); - virtual GeneratedAudio Generate(const std::string &text, - int64_t sid = 0) const = 0; + virtual GeneratedAudio Generate(const std::string &text, int64_t sid = 0, + float speed = 1.0) const = 0; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 6b19024e0..142035689 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -24,8 +24,8 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { model_->Punctuations(), model_->Language(), config.model.debug) {} - GeneratedAudio Generate(const std::string &text, - int64_t sid = 0) const override { + GeneratedAudio Generate(const std::string &text, int64_t sid = 0, + float speed = 1.0) const override { int32_t num_speakers = model_->NumSpeakers(); if (num_speakers == 0 && sid != 0) { SHERPA_ONNX_LOGE( @@ -66,7 +66,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { Ort::Value x_tensor = Ort::Value::CreateTensor( memory_info, x.data(), x.size(), x_shape.data(), x_shape.size()); - Ort::Value audio = model_->Run(std::move(x_tensor), sid); + Ort::Value audio = 
model_->Run(std::move(x_tensor), sid, speed); std::vector audio_shape = audio.GetTensorTypeAndShapeInfo().GetShape(); diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc index dcb90079a..71c19e570 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc @@ -17,7 +17,7 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { po->Register("vits-noise-scale-w", &noise_scale_w, "noise_scale_w for VITS models"); po->Register("vits-length-scale", &length_scale, - "length_scale for VITS models"); + "Speech speed. Larger->Slower; Smaller->faster."); } bool OfflineTtsVitsModelConfig::Validate() const { diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.cc b/sherpa-onnx/csrc/offline-tts-vits-model.cc index 4f2ae9ec0..060aa8b94 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model.cc @@ -26,7 +26,7 @@ class OfflineTtsVitsModel::Impl { Init(buf.data(), buf.size()); } - Ort::Value Run(Ort::Value x, int64_t sid) { + Ort::Value Run(Ort::Value x, int64_t sid, float speed) { auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); @@ -48,6 +48,10 @@ class OfflineTtsVitsModel::Impl { float length_scale = config_.vits.length_scale; float noise_scale_w = config_.vits.noise_scale_w; + if (speed != 1 && speed > 0) { + length_scale = 1. 
/ speed; + } + Ort::Value noise_scale_tensor = Ort::Value::CreateTensor(memory_info, &noise_scale, 1, &scale_shape, 1); @@ -139,8 +143,9 @@ OfflineTtsVitsModel::OfflineTtsVitsModel(const OfflineTtsModelConfig &config) OfflineTtsVitsModel::~OfflineTtsVitsModel() = default; -Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/) { - return impl_->Run(std::move(x), sid); +Ort::Value OfflineTtsVitsModel::Run(Ort::Value x, int64_t sid /*=0*/, + float speed /*= 1.0*/) { + return impl_->Run(std::move(x), sid, speed); } int32_t OfflineTtsVitsModel::SampleRate() const { return impl_->SampleRate(); } diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.h b/sherpa-onnx/csrc/offline-tts-vits-model.h index a3870fbd7..31addfdfe 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model.h @@ -29,7 +29,7 @@ class OfflineTtsVitsModel { * @return Return a float32 tensor containing audio samples. You can flatten * it to a 1-D tensor. */ - Ort::Value Run(Ort::Value x, int64_t sid = 0); + Ort::Value Run(Ort::Value x, int64_t sid = 0, float speed = 1.0); // Sample rate of the generated audio int32_t SampleRate() const; diff --git a/sherpa-onnx/csrc/offline-tts.cc b/sherpa-onnx/csrc/offline-tts.cc index 36ec9beee..94a288354 100644 --- a/sherpa-onnx/csrc/offline-tts.cc +++ b/sherpa-onnx/csrc/offline-tts.cc @@ -28,9 +28,9 @@ OfflineTts::OfflineTts(const OfflineTtsConfig &config) OfflineTts::~OfflineTts() = default; -GeneratedAudio OfflineTts::Generate(const std::string &text, - int64_t sid /*=0*/) const { - return impl_->Generate(text, sid); +GeneratedAudio OfflineTts::Generate(const std::string &text, int64_t sid /*=0*/, + float speed /*= 1.0*/) const { + return impl_->Generate(text, sid, speed); } } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts.h b/sherpa-onnx/csrc/offline-tts.h index 6e0b1402d..c2d87461b 100644 --- a/sherpa-onnx/csrc/offline-tts.h +++ b/sherpa-onnx/csrc/offline-tts.h @@ -43,7 +43,8 @@ class 
OfflineTts { // trained using the VCTK dataset. It is not used for // single-speaker models, e.g., models trained using the ljspeech // dataset. - GeneratedAudio Generate(const std::string &text, int64_t sid = 0) const; + GeneratedAudio Generate(const std::string &text, int64_t sid = 0, + float speed = 1.0) const; private: std::unique_ptr impl_; diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc index 81af16917..11f91c07d 100644 --- a/sherpa-onnx/python/csrc/offline-tts.cc +++ b/sherpa-onnx/python/csrc/offline-tts.cc @@ -40,7 +40,8 @@ void PybindOfflineTts(py::module *m) { using PyClass = OfflineTts; py::class_(*m, "OfflineTts") .def(py::init(), py::arg("config")) - .def("generate", &PyClass::Generate, py::arg("text"), py::arg("sid") = 0); + .def("generate", &PyClass::Generate, py::arg("text"), py::arg("sid") = 0, + py::arg("speed") = 1.0); } } // namespace sherpa_onnx