VOICEVOX · Hiroshiba · Jan 7, 2024 · Jan 4, 2024 · Jan 7, 2024 · Jan 7, 2024
@@ -1,7 +1,7 @@
 import threading
 
-import numpy
-from numpy import ndarray
+import numpy as np
+from numpy.typing import NDArray
 
 from .core_wrapper import CoreWrapper, OldCoreError
 from .metas.Metas import StyleId
@@ -67,55 +67,58 @@ def is_initialized_style_id_synthesis(self, style_id: StyleId) -> bool:
             return True  # コアが古い場合はどうしようもないのでTrueを返す
 
     def safe_yukarin_s_forward(
-        self, phoneme_list_s: ndarray, style_id: StyleId
-    ) -> ndarray:
+        self, phoneme_list_s: NDArray[np.int64], style_id: StyleId
+    ) -> NDArray[np.float32]:
         # 「指定スタイルを初期化」「mutexによる安全性」「系列長・データ型に関するアダプター」を提供する
         self.initialize_style_id_synthesis(style_id, skip_reinit=True)
         with self.mutex:
             phoneme_length = self.core.yukarin_s_forward(
                 length=len(phoneme_list_s),
                 phoneme_list=phoneme_list_s,
-                style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
+                style_id=np.array(style_id, dtype=np.int64).reshape(-1),
             )
         return phoneme_length
 
     def safe_yukarin_sa_forward(
         self,
-        vowel_phoneme_list: ndarray,
-        consonant_phoneme_list: ndarray,
-        start_accent_list: ndarray,
-        end_accent_list: ndarray,
-        start_accent_phrase_list: ndarray,
-        end_accent_phrase_list: ndarray,
+        vowel_phoneme_list: NDArray[np.int64],
+        consonant_phoneme_list: NDArray[np.int64],
+        start_accent_list: NDArray[np.int64],
+        end_accent_list: NDArray[np.int64],
+        start_accent_phrase_list: NDArray[np.int64],
+        end_accent_phrase_list: NDArray[np.int64],
         style_id: StyleId,
-    ) -> ndarray:
+    ) -> NDArray[np.float32]:
         # 「指定スタイルを初期化」「mutexによる安全性」「系列長・データ型に関するアダプター」を提供する
         self.initialize_style_id_synthesis(style_id, skip_reinit=True)
         with self.mutex:
             f0_list = self.core.yukarin_sa_forward(
                 length=vowel_phoneme_list.shape[0],
-                vowel_phoneme_list=vowel_phoneme_list[numpy.newaxis],
-                consonant_phoneme_list=consonant_phoneme_list[numpy.newaxis],
-                start_accent_list=start_accent_list[numpy.newaxis],
-                end_accent_list=end_accent_list[numpy.newaxis],
-                start_accent_phrase_list=start_accent_phrase_list[numpy.newaxis],
-                end_accent_phrase_list=end_accent_phrase_list[numpy.newaxis],
-                style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
+                vowel_phoneme_list=vowel_phoneme_list[np.newaxis],
+                consonant_phoneme_list=consonant_phoneme_list[np.newaxis],
+                start_accent_list=start_accent_list[np.newaxis],
+                end_accent_list=end_accent_list[np.newaxis],
+                start_accent_phrase_list=start_accent_phrase_list[np.newaxis],
+                end_accent_phrase_list=end_accent_phrase_list[np.newaxis],
+                style_id=np.array(style_id, dtype=np.int64).reshape(-1),
             )[0]
         return f0_list
 
     def safe_decode_forward(
-        self, phoneme: ndarray, f0: ndarray, style_id: StyleId
-    ) -> tuple[ndarray, int]:
+        self,
+        phoneme: NDArray[np.float32],
+        f0: NDArray[np.float32],
+        style_id: StyleId,
+    ) -> tuple[NDArray[np.float32], int]:
         # 「指定スタイルを初期化」「mutexによる安全性」「系列長・データ型に関するアダプター」を提供する
         self.initialize_style_id_synthesis(style_id, skip_reinit=True)
         with self.mutex:
             wave = self.core.decode_forward(
                 length=phoneme.shape[0],
                 phoneme_size=phoneme.shape[1],
-                f0=f0[:, numpy.newaxis],
+                f0=f0[:, np.newaxis],
                 phoneme=phoneme,
-                style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
+                style_id=np.array(style_id, dtype=np.int64).reshape(-1),
             )
         sr_wave = self.default_sampling_rate
         return wave, sr_wave
@@ -8,6 +8,7 @@
 from typing import Literal
 
 import numpy as np
+from numpy.typing import NDArray
 
 
 class OldCoreError(Exception):
@@ -525,21 +526,24 @@ def metas(self) -> str:
         return self.core.metas().decode("utf-8")
 
     def yukarin_s_forward(
-        self, length: int, phoneme_list: np.ndarray, style_id: np.ndarray
-    ) -> np.ndarray:
+        self,
+        length: int,
+        phoneme_list: NDArray[np.int64],
+        style_id: NDArray[np.int64],
+    ) -> NDArray[np.float32]:
         """
         音素列から、音素ごとの長さを求める関数
         Parameters
         ----------
         length : int
             音素列の長さ
-        phoneme_list : np.ndarray
+        phoneme_list : NDArray[np.int64]
             音素列
-        style_id : np.ndarray
+        style_id : NDArray[np.int64]
             スタイル番号
         Returns
         -------
-        output : np.ndarray
+        output : NDArray[np.float32]
             音素ごとの長さ
         """
         output = np.zeros((length,), dtype=np.float32)
@@ -556,37 +560,37 @@ def yukarin_s_forward(
     def yukarin_sa_forward(
         self,
         length: int,
-        vowel_phoneme_list: np.ndarray,
-        consonant_phoneme_list: np.ndarray,
-        start_accent_list: np.ndarray,
-        end_accent_list: np.ndarray,
-        start_accent_phrase_list: np.ndarray,
-        end_accent_phrase_list: np.ndarray,
-        style_id: np.ndarray,
-    ) -> np.ndarray:
+        vowel_phoneme_list: NDArray[np.int64],
+        consonant_phoneme_list: NDArray[np.int64],
+        start_accent_list: NDArray[np.int64],
+        end_accent_list: NDArray[np.int64],
+        start_accent_phrase_list: NDArray[np.int64],
+        end_accent_phrase_list: NDArray[np.int64],
+        style_id: NDArray[np.int64],
+    ) -> NDArray[np.float32]:
         """
         モーラごとの音素列とアクセント情報から、モーラごとの音高を求める関数
         Parameters
         ----------
         length : int
             モーラ列の長さ
-        vowel_phoneme_list : np.ndarray
+        vowel_phoneme_list : NDArray[np.int64]
             母音の音素列
-        consonant_phoneme_list : np.ndarray
+        consonant_phoneme_list : NDArray[np.int64]
             子音の音素列
-        start_accent_list : np.ndarray
+        start_accent_list : NDArray[np.int64]
         アクセントの開始位置
-        end_accent_list : np.ndarray
+        end_accent_list : NDArray[np.int64]
             アクセントの終了位置
-        start_accent_phrase_list : np.ndarray
+        start_accent_phrase_list : NDArray[np.int64]
             アクセント句の開始位置
-        end_accent_phrase_list : np.ndarray
+        end_accent_phrase_list : NDArray[np.int64]
             アクセント句の終了位置
-        style_id : np.ndarray
+        style_id : NDArray[np.int64]
             スタイル番号
         Returns
         -------
-        output : np.ndarray
+        output : NDArray[np.float32]
             モーラごとの音高
         """
         output = np.empty(
@@ -615,10 +619,10 @@ def decode_forward(
         self,
         length: int,
         phoneme_size: int,
-        f0: np.ndarray,
-        phoneme: np.ndarray,
-        style_id: np.ndarray,
-    ) -> np.ndarray:
+        f0: NDArray[np.float32],
+        phoneme: NDArray[np.float32],
+        style_id: NDArray[np.int64],
+    ) -> NDArray[np.float32]:
         """
         フレームごとの音素と音高から波形を求める関数
         Parameters
@@ -627,15 +631,15 @@ def decode_forward(
             フレームの長さ
         phoneme_size : int
             音素の種類数
-        f0 : np.ndarray
+        f0 : NDArray[np.float32]
             フレームごとの音高
-        phoneme : np.ndarray
+        phoneme : NDArray[np.float32]
             フレームごとの音素
-        style_id : np.ndarray
+        style_id : NDArray[np.int64]
             スタイル番号
         Returns
         -------
-        output : np.ndarray
+        output : NDArray[np.float32]
             音声波形
         """
 

@@ -1,8 +1,9 @@
 import json
 from pathlib import Path
 
-import numpy
+import numpy as np
 from numpy import ndarray
+from numpy.typing import NDArray
 
 from ...core_wrapper import CoreWrapper
 
@@ -65,13 +66,13 @@ def metas(self) -> str:
 
     def yukarin_s_forward(
         self, length: int, phoneme_list: ndarray, style_id: ndarray
-    ) -> ndarray:
+    ) -> NDArray[np.floating]:
         """音素系列サイズ・音素ID系列・スタイルIDから音素長系列を生成する"""
         result = []
         # mockとしての適当な処理、特に意味はない
         for i in range(length):
             result.append(round((phoneme_list[i] * 0.0625 + style_id).item(), 2))
-        return numpy.array(result)
+        return np.array(result)
 
     def yukarin_sa_forward(
         self,
@@ -83,7 +84,7 @@ def yukarin_sa_forward(
         start_accent_phrase_list: ndarray,
         end_accent_phrase_list: ndarray,
         style_id: ndarray,
-    ) -> ndarray:
+    ) -> NDArray[np.floating]:
         """モーラ系列サイズ・母音系列・子音系列・アクセント位置・アクセント句区切り・スタイルIDからモーラ音高系列を生成する"""
         assert length > 1, "前後無音を必ず付与しなければならない"
 
@@ -107,7 +108,7 @@ def yukarin_sa_forward(
                     2,
                 )
             )
-        return numpy.array(result)[numpy.newaxis]
+        return np.array(result)[np.newaxis]
 
     def decode_forward(
         self,
@@ -116,15 +117,15 @@ def decode_forward(
         f0: ndarray,
         phoneme: ndarray,
         style_id: ndarray,
-    ) -> ndarray:
+    ) -> NDArray[np.floating]:
         """フレーム長・音素種類数・フレーム音高・フレーム音素onehot・スタイルIDからダミー音声波形を生成する"""
         # 入力値を反映し、長さが 256 倍であるダミー配列を出力する
         result: list[ndarray] = []
         for i in range(length):
             result += [
-                (f0[i, 0] * (numpy.where(phoneme[i] == 1)[0] / phoneme_size) + style_id)
+                (f0[i, 0] * (np.where(phoneme[i] == 1)[0] / phoneme_size) + style_id)
             ] * 256
-        return numpy.array(result)
+        return np.array(result)
 
     def supported_devices(self):
         return json.dumps(

@@ -1,8 +1,9 @@
 import copy
 from logging import getLogger
-from typing import Any, Dict
+from typing import Any
 
 import numpy as np
+from numpy.typing import NDArray
 from pyopenjtalk import tts
 from soxr import resample
 
@@ -24,7 +25,7 @@ def synthesize_wave(
         query: AudioQuery,
         style_id: StyleId,
         enable_interrogative_upspeak: bool = True,
-    ) -> np.ndarray:
+    ) -> NDArray[np.float32]:
         """音声合成用のクエリに含まれる読み仮名に基づいてOpenJTalkで音声波形を生成する"""
         # モーフィング時などに同一参照のqueryで複数回呼ばれる可能性があるので、元の引数のqueryに破壊的変更を行わない
         query = copy.deepcopy(query)
@@ -38,9 +39,9 @@ def synthesize_wave(
         # volume
         wave *= query.volumeScale
 
-        return wave.astype("int16")
+        return wave
 
-    def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
+    def forward(self, text: str, **kwargs: dict[str, Any]) -> NDArray[np.float32]:
         """
         forward tts via pyopenjtalk.tts()
         参照→TTSEngine のdocstring [Mock]
@@ -52,7 +53,7 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
 
         Returns
         -------
-        wave [npt.NDArray[np.int16]]
+        wave [NDArray[np.float32]]
             音声波形データをNumPy配列で返します
 
         Note
@@ -63,10 +64,11 @@ def forward(self, text: str, **kwargs: Dict[str, Any]) -> np.ndarray:
         dtype=np.float64, 16 bit, mono 48000 Hz
 
         # resampleの説明
-        非モック実装（decode_forward）と合わせるために、出力を24kHzに変換した。
+        非モック実装（decode_forward）と合わせるために、出力を24kHz、32bit浮動小数に変換した。
         """
         logger = getLogger("uvicorn")  # FastAPI / Uvicorn 内からの利用のため
         logger.info("[Mock] input text: %s" % text)
         wave, sr = tts(text)
+        wave /= 2**15
         wave = resample(wave, 48000, 24000)
-        return wave
+        return wave.astype(np.float32)