ailia_voice  1.5.0.0
APIの使用方法

ailia AI VoiceのAPIの概要

基本的な使用方法

C++を使用して音声合成を行う例です。AILIAVoiceのインスタンスを作成し、ailiaVoiceOpenModelFileでモデルを開き、ailiaVoiceGraphemeToPhonemeで音素を取得し、ailiaVoiceInferenceで音声合成を行い、ailiaVoiceGetWaveで波形を取得します。GPT-SoVITSを使用する場合は、ailiaVoiceInferenceの前に、リファレンスとなる音声ファイルをailiaVoiceSetReferenceで与えます。GPT-SoVITS V3を使用する場合は、ailiaVoiceOpenGPTSoVITSV3ModelFileAでモデルを開きます。GPT-SoVITS V2-Proを使用する場合は、ailiaVoiceOpenGPTSoVITSV2ProModelFileAでモデルを開きます。ailiaVoiceSetSampleStepsでCFMのサンプリングステップ数を変更することもできます(V3のみ)。中国語を使用する場合、V2/V3/V2-ProではG2P_CNとG2PWの両方の辞書を読み込む必要があります。

#include "ailia_voice.h"
#include "ailia_voice_util.h"
#include <stdio.h>
#include <vector>
#include <string>
#include <string.h>
#include "wave_reader.h"
#include "wave_writer.h"
int main(int argc, char *argv[]){
AILIAVoiceApiCallback callback = ailiaVoiceUtilGetCallback();
printf("Usage : ailia_voice_sample [tacotron2/gpt-sovits/gpt-sovits-en/gpt-sovits-zh/gpt-sovits-v2/gpt-sovits-v2-en/gpt-sovits-v2-zh/gpt-sovits-v3/gpt-sovits-v3-en/gpt-sovits-v3-zh/gpt-sovits-v2-pro/gpt-sovits-v2-pro-en/gpt-sovits-v2-pro-zh] [input_text]\n");
const char * input_text = "";
const char * lang = "";
const char * model = "tacotron2";
if (argc >= 2){
model = argv[1];
if (!(strcmp(model, "tacotron2") == 0 || strcmp(model, "gpt-sovits") == 0 || strcmp(model, "gpt-sovits-en") == 0 || strcmp(model, "gpt-sovits-zh") == 0 || strcmp(model, "gpt-sovits-v2") == 0 || strcmp(model, "gpt-sovits-v2-en") == 0 || strcmp(model, "gpt-sovits-v2-zh") == 0 || strcmp(model, "gpt-sovits-v3") == 0 || strcmp(model, "gpt-sovits-v3-en") == 0 || strcmp(model, "gpt-sovits-v3-zh") == 0 || strcmp(model, "gpt-sovits-v2-pro") == 0 || strcmp(model, "gpt-sovits-v2-pro-en") == 0 || strcmp(model, "gpt-sovits-v2-pro-zh") == 0)){
printf("model must be tacotron2, gpt-sovits, gpt-sovits-en, gpt-sovits-zh, gpt-sovits-v2, gpt-sovits-v2-en, gpt-sovits-v2-zh, gpt-sovits-v3, gpt-sovits-v3-en, gpt-sovits-v3-zh, gpt-sovits-v2-pro, gpt-sovits-v2-pro-en or gpt-sovits-v2-pro-zh\n");
return -1;
}
}
if (argc >= 3){
input_text = argv[2];
}
if (strcmp(model, "tacotron2") == 0 || strcmp(model, "gpt-sovits-en") == 0 || strcmp(model, "gpt-sovits-v2-en") == 0 || strcmp(model, "gpt-sovits-v3-en") == 0 || strcmp(model, "gpt-sovits-v2-pro-en") == 0){
if (strlen(input_text) == 0){
input_text = u8"Hello world.";
}
lang = "en";
}else if (strcmp(model, "gpt-sovits-zh") == 0 || strcmp(model, "gpt-sovits-v2-zh") == 0 || strcmp(model, "gpt-sovits-v3-zh") == 0 || strcmp(model, "gpt-sovits-v2-pro-zh") == 0){
if (strlen(input_text) == 0){
input_text = u8"你好,世界。今天天气真好。";
}
lang = "zh";
}else{
if (strlen(input_text) == 0){
input_text = u8"こんにちは。今日は新しいAIエンジンであるアイリアSDKを紹介します。";
}
lang = "ja";
}
printf("Model : %s\n", model);
printf("Input text : %s\n", input_text);
printf("Language : %s\n", lang);
AILIAVoice *net;
int env_id = AILIA_ENVIRONMENT_ID_AUTO;
int num_thread = AILIA_MULTITHREAD_AUTO;
int memory_mode = AILIA_MEMORY_REDUCE_CONSTANT | AILIA_MEMORY_REDUCE_CONSTANT_WITH_INPUT_INITIALIZER | AILIA_MEMORY_REUSE_INTERSTAGE;
bool enable_user_dictionary = true;
int status = ailiaVoiceCreate(&net, env_id, num_thread, memory_mode, AILIA_VOICE_FLAG_NONE, callback, AILIA_VOICE_API_CALLBACK_VERSION);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceCreate error %d\n", status);
return -1;
}
if (strcmp(model, "gpt-sovits") == 0 || strcmp(model, "gpt-sovits-en") == 0 || strcmp(model, "gpt-sovits-v2") == 0 || strcmp(model, "gpt-sovits-v2-en") == 0 || strcmp(model, "gpt-sovits-v3") == 0 || strcmp(model, "gpt-sovits-v3-en") == 0 || strcmp(model, "gpt-sovits-v2-pro") == 0 || strcmp(model, "gpt-sovits-v2-pro-en") == 0){
if (enable_user_dictionary){
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceSetUserDictionaryFileA error %d\n", status);
return -1;
}
}
status = ailiaVoiceOpenDictionaryFileA(net, "./open_jtalk_dic_utf_8-1.11", AILIA_VOICE_DICTIONARY_TYPE_OPEN_JTALK);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenDictionaryFileA error %d\n", status);
return -1;
}
}
if (strcmp(model, "gpt-sovits-en") == 0 || strcmp(model, "gpt-sovits-v2-en") == 0 || strcmp(model, "gpt-sovits-v3-en") == 0 || strcmp(model, "gpt-sovits-v2-pro-en") == 0){
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenDictionaryFileA g2p_en error %d\n", status);
return -1;
}
}
if (strcmp(model, "gpt-sovits-zh") == 0){
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenDictionaryFileA g2p_cn error %d\n", status);
return -1;
}
}
if (strcmp(model, "gpt-sovits-v2-zh") == 0 || strcmp(model, "gpt-sovits-v3-zh") == 0 || strcmp(model, "gpt-sovits-v2-pro-zh") == 0){
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenDictionaryFileA g2p_cn error %d\n", status);
return -1;
}
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenDictionaryFileA g2pw error %d\n", status);
return -1;
}
}
if (strcmp(model, "tacotron2") == 0){
status = ailiaVoiceOpenTacotron2ModelFileA(net, "../onnx/nvidia/encoder.onnx", "../onnx/nvidia/decoder_iter.onnx", "../onnx/nvidia/postnet.onnx", "../onnx/nvidia/waveglow.onnx", AILIA_VOICE_CLEANER_TYPE_BASIC);
}else if (strcmp(model, "gpt-sovits-v3") == 0 || strcmp(model, "gpt-sovits-v3-en") == 0 || strcmp(model, "gpt-sovits-v3-zh") == 0){
status = ailiaVoiceOpenGPTSoVITSV3ModelFileA(net, "../onnx/gpt-sovits-v3/t2s_encoder.onnx", "../onnx/gpt-sovits-v3/t2s_fsdec.onnx", "../onnx/gpt-sovits-v3/t2s_sdec.onnx", "../onnx/gpt-sovits-v3/cnhubert.onnx", "../onnx/gpt-sovits-v3/vq_model.onnx", "../onnx/gpt-sovits-v3/vq_cfm.onnx", "../onnx/gpt-sovits-v3/bigvgan_model.onnx", "../onnx/gpt-sovits-v3/chinese-roberta.onnx", "../onnx/gpt-sovits-v3/vocab.txt");
}else if (strcmp(model, "gpt-sovits-v2-pro") == 0 || strcmp(model, "gpt-sovits-v2-pro-en") == 0 || strcmp(model, "gpt-sovits-v2-pro-zh") == 0){
status = ailiaVoiceOpenGPTSoVITSV2ProModelFileA(net, "../onnx/gpt-sovits-v3/t2s_encoder.onnx", "../onnx/gpt-sovits-v3/t2s_fsdec.onnx", "../onnx/gpt-sovits-v3/t2s_sdec.opt.onnx", "../onnx/gpt-sovits-v3/cnhubert.onnx", "../onnx/gpt-sovits-v2-pro/vits.onnx", "../onnx/gpt-sovits-v2-pro/sv.onnx", "../onnx/gpt-sovits-v2-pro/chinese-roberta.onnx", "../onnx/gpt-sovits-v2-pro/vocab.txt");
}else if (strcmp(model, "gpt-sovits-zh") == 0){
status = ailiaVoiceOpenGPTSoVITSV1ModelFileA(net, "../onnx/gpt-sovits-zh/t2s_encoder.onnx", "../onnx/gpt-sovits-zh/t2s_fsdec.onnx", "../onnx/gpt-sovits-zh/t2s_sdec.opt3.onnx", "../onnx/gpt-sovits-zh/vits.onnx", "../onnx/gpt-sovits-zh/cnhubert.onnx");
}else if (strcmp(model, "gpt-sovits-v2") == 0 || strcmp(model, "gpt-sovits-v2-en") == 0 || strcmp(model, "gpt-sovits-v2-zh") == 0){
status = ailiaVoiceOpenGPTSoVITSV2ModelFileA(net, "../onnx/gpt-sovits-v2/t2s_encoder.onnx", "../onnx/gpt-sovits-v2/t2s_fsdec.onnx", "../onnx/gpt-sovits-v2/t2s_sdec.onnx", "../onnx/gpt-sovits-v2/vits.onnx", "../onnx/gpt-sovits-v2/cnhubert.onnx", "../onnx/gpt-sovits-v2/chinese-roberta.onnx", "../onnx/gpt-sovits-v2/vocab.txt");
}else{
status = ailiaVoiceOpenGPTSoVITSV1ModelFileA(net, "../onnx/gpt-sovits/t2s_encoder.onnx", "../onnx/gpt-sovits/t2s_fsdec.onnx", "../onnx/gpt-sovits/t2s_sdec.opt3.onnx", "../onnx/gpt-sovits/vits.onnx", "../onnx/gpt-sovits/cnhubert.onnx");
}
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceOpenModelFileA error %d\n", status);
return -1;
}
if (strcmp(model, "gpt-sovits") == 0 || strcmp(model, "gpt-sovits-en") == 0 || strcmp(model, "gpt-sovits-zh") == 0 || strcmp(model, "gpt-sovits-v2") == 0 || strcmp(model, "gpt-sovits-v2-en") == 0 || strcmp(model, "gpt-sovits-v2-zh") == 0 || strcmp(model, "gpt-sovits-v3") == 0 || strcmp(model, "gpt-sovits-v3-en") == 0 || strcmp(model, "gpt-sovits-v3-zh") == 0 || strcmp(model, "gpt-sovits-v2-pro") == 0 || strcmp(model, "gpt-sovits-v2-pro-en") == 0 || strcmp(model, "gpt-sovits-v2-pro-zh") == 0){
int sampleRate, nChannels, nSamples;
const char *ref_audio = "../onnx/gpt-sovits/reference_audio_girl.wav";
const char *ref_text;
int ref_g2p_type;
ref_text = u8"水をマレーシアから買わなくてはならない。";
std::vector<float> wave = read_wave_file(ref_audio, &sampleRate, &nChannels, &nSamples);
status = ailiaVoiceGraphemeToPhoneme(net, ref_text, ref_g2p_type);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGraphemeToPhoneme error %d\n", status);
return -1;
}
unsigned int len = 0;
status = ailiaVoiceGetFeatureLength(net, &len);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatureLength error %d\n", status);
return -1;
}
std::vector<char> ref_features;
ref_features.resize(len);
status = ailiaVoiceGetFeatures(net, &ref_features[0], len);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatures error %d\n", status);
return -1;
}
printf("Reference Features : %s\n", &ref_features[0]);
status = ailiaVoiceSetReference(net, &wave[0], wave.size() * sizeof(float), nChannels, sampleRate, &ref_features[0]);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceSetReference error %d\n", status);
return -1;
}
}
std::vector<char> features;
if (strcmp(model, "tacotron2") == 0){
status = ailiaVoiceInference(net, input_text);
}else{
if (strcmp(model, "gpt-sovits") == 0 || strcmp(model, "gpt-sovits-v2") == 0 || strcmp(model, "gpt-sovits-v3") == 0 || strcmp(model, "gpt-sovits-v2-pro") == 0){
}else if (strcmp(model, "gpt-sovits-zh") == 0 || strcmp(model, "gpt-sovits-v2-zh") == 0 || strcmp(model, "gpt-sovits-v3-zh") == 0 || strcmp(model, "gpt-sovits-v2-pro-zh") == 0){
}else{
}
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGraphemeToPhoneme error %d\n", status);
return -1;
}
unsigned int len = 0;
status = ailiaVoiceGetFeatureLength(net, &len);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatureLength error %d\n", status);
return -1;
}
features.resize(len);
status = ailiaVoiceGetFeatures(net, &features[0], len);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetFeatures error %d\n", status);
return -1;
}
printf("Features : %s\n", &features[0]);
status = ailiaVoiceInference(net, &features[0]);
}
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceInference error %d\n", status);
return -1;
}
unsigned int samples, channels, sampling_rate;
status = ailiaVoiceGetWaveInfo(net, &samples, &channels, &sampling_rate);
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetWaveInfo error %d\n", status);
return -1;
}
std::vector<float> buf(samples * channels);
status = ailiaVoiceGetWave(net, &buf[0], buf.size() * sizeof(float));
if (status != AILIA_STATUS_SUCCESS){
printf("ailiaVoiceGetWave error %d\n", status);
return -1;
}
printf("Wave samples : %d\nWave channles : %d\nWave sampling rate : %d\n", samples, channels, sampling_rate);
write_wave_file("output.wav", buf, sampling_rate);
return 0;
}
ailia Voice ライブラリ
#define AILIA_VOICE_G2P_TYPE_GPT_SOVITS_JA
GPT_SOVITSの日本語向けの処理
Definition: ailia_voice.h:198
int AILIA_API ailiaVoiceGraphemeToPhoneme(struct AILIAVoice *net, const char *utf8, int g2p_type)
G2Pを行います。
int AILIA_API ailiaVoiceSetReference(struct AILIAVoice *net, float *buf, unsigned int buf_size, unsigned int channels, unsigned int sampling_rate, const char *features)
0ショット音声合成のリファレンスとなる波形とテキストを設定します。
#define AILIA_VOICE_DICTIONARY_TYPE_OPEN_JTALK
OpenJtalk形式
Definition: ailia_voice.h:43
#define AILIA_VOICE_G2P_TYPE_GPT_SOVITS_EN
GPT_SOVITSの英語向けの処理
Definition: ailia_voice.h:187
int AILIA_API ailiaVoiceSetUserDictionaryFileA(struct AILIAVoice *net, const char *dictionary_path, int dictionary_type)
ユーザ辞書を指定します。(MBCS)
int AILIA_API ailiaVoiceOpenGPTSoVITSV1ModelFileA(struct AILIAVoice *net, const char *encoder, const char *decoder1, const char *decoder2, const char *wave, const char *ssl)
GPT-SoVITS V1向けのモデルを指定します。(MBCS)
int AILIA_API ailiaVoiceInference(struct AILIAVoice *net, const char *utf8)
推論を行います。
int AILIA_API ailiaVoiceOpenGPTSoVITSV2ProModelFileA(struct AILIAVoice *net, const char *encoder, const char *decoder1, const char *decoder2, const char *ssl, const char *vits, const char *sv, const char *chinese_bert, const char *vocab)
GPT-SoVITS V2-Pro向けのモデルを指定します。(MBCS)
int AILIA_API ailiaVoiceGetFeatureLength(struct AILIAVoice *net, unsigned int *len)
フィーチャーの長さを取得します。(NULL文字含む)
int AILIA_API ailiaVoiceGetWave(struct AILIAVoice *net, float *buf, unsigned int buf_size)
波形を取得します。
int AILIA_API ailiaVoiceGetWaveInfo(struct AILIAVoice *net, unsigned int *samples, unsigned int *channels, unsigned int *sampling_rate)
波形の情報を取得します。
#define AILIA_VOICE_G2P_TYPE_GPT_SOVITS_ZH
GPT_SOVITSの中国語向けの処理
Definition: ailia_voice.h:209
int AILIA_API ailiaVoiceCreate(struct AILIAVoice **net, int env_id, int num_thread, int memory_mode, int flags, AILIAVoiceApiCallback callback, int version)
ボイスオブジェクトを作成します。
#define AILIA_VOICE_FLAG_NONE
フラグを設定しません
Definition: ailia_voice.h:172
#define AILIA_VOICE_DICTIONARY_TYPE_G2P_CN
G2P_CN形式
Definition: ailia_voice.h:65
int AILIA_API ailiaVoiceOpenGPTSoVITSV2ModelFileA(struct AILIAVoice *net, const char *encoder, const char *decoder1, const char *decoder2, const char *wave, const char *ssl, const char *chinese_bert, const char *vocab)
GPT-SoVITS V2向けのモデルを指定します。(MBCS)
int AILIA_API ailiaVoiceGetFeatures(struct AILIAVoice *net, char *features, unsigned int len)
フィーチャーを取得します。
int AILIA_API ailiaVoiceOpenTacotron2ModelFileA(struct AILIAVoice *net, const char *encoder, const char *decoder1, const char *decoder2, const char *wave, int cleaner_type)
Tacotron2向けのモデルを指定します。(MBCS)
int AILIA_API ailiaVoiceOpenDictionaryFileA(struct AILIAVoice *net, const char *dictionary_path, int dictionary_type)
辞書を指定します。(MBCS)
#define AILIA_VOICE_DICTIONARY_TYPE_G2P_EN
G2P_EN形式
Definition: ailia_voice.h:54
int AILIA_API ailiaVoiceOpenGPTSoVITSV3ModelFileA(struct AILIAVoice *net, const char *encoder, const char *decoder1, const char *decoder2, const char *ssl, const char *vq, const char *cfm, const char *bigvgan, const char *chinese_bert, const char *vocab)
GPT-SoVITS V3向けのモデルを指定します。(MBCS)
#define AILIA_VOICE_API_CALLBACK_VERSION
構造体バージョン
Definition: ailia_voice.h:260
#define AILIA_VOICE_DICTIONARY_TYPE_G2PW
G2PW形式(中国語多音字対応)
Definition: ailia_voice.h:76
void AILIA_API ailiaVoiceDestroy(struct AILIAVoice *net)
ボイスオブジェクトを破棄します。
#define AILIA_VOICE_CLEANER_TYPE_BASIC
BasicCleaner
Definition: ailia_voice.h:146
Definition: ailia_voice.h:263

ユーザ辞書の使用

pyopenjtalkで作成したuserdic.dicは、ailiaVoiceOpenDictionaryFile APIの前にailiaVoiceSetUserDictionaryFile APIを実行することで読み込ませることが可能です。

// Register the user dictionary (userdic.dic built with pyopenjtalk) first:
// it is only picked up when set before the main OpenJTalk dictionary is opened.
ailiaVoiceSetUserDictionaryFileA(net, "./userdic/userdic.dic", AILIA_VOICE_DICTIONARY_TYPE_OPEN_JTALK);
ailiaVoiceOpenDictionaryFileA(net, "./open_jtalk_dic_utf_8-1.11", AILIA_VOICE_DICTIONARY_TYPE_OPEN_JTALK);

GPUの使用

GPUを使用するには、ailiaVoiceCreateのenv_id引数にGPUのenv_idを指定します。デフォルトでは、AILIA_ENVIRONMENT_ID_AUTOが指定されており、CPUで推論が行われます。GPUのenv_idを取得する方法は、ailia_voice_sample.cppを参照してください。