diff options
author | bruvzg <7645683+bruvzg@users.noreply.github.com> | 2021-11-04 14:33:37 +0200 |
---|---|---|
committer | bruvzg <7645683+bruvzg@users.noreply.github.com> | 2022-04-28 14:35:41 +0300 |
commit | 6ab672d1ef7ece5c3019d46aeb98df3686f37e26 (patch) | |
tree | be10d088e90c6a9e60efef823f54f9aa0d70aa07 /platform/windows | |
parent | 3e1b824c050b765095285c67b3e4c8092e1f88c6 (diff) | |
download | redot-engine-6ab672d1ef7ece5c3019d46aeb98df3686f37e26.tar.gz |
Implement text-to-speech support on Android, iOS, HTML5, Linux, macOS and Windows.
Implement TextServer word break method.
Diffstat (limited to 'platform/windows')
-rw-r--r-- | platform/windows/SCsub | 1 | ||||
-rw-r--r-- | platform/windows/detect.py | 2 | ||||
-rw-r--r-- | platform/windows/display_server_windows.cpp | 43 | ||||
-rw-r--r-- | platform/windows/display_server_windows.h | 12 | ||||
-rw-r--r-- | platform/windows/tts_windows.cpp | 269 | ||||
-rw-r--r-- | platform/windows/tts_windows.h | 80 |
6 files changed, 407 insertions, 0 deletions
diff --git a/platform/windows/SCsub b/platform/windows/SCsub index 76234c3065..7e412b140f 100644 --- a/platform/windows/SCsub +++ b/platform/windows/SCsub @@ -13,6 +13,7 @@ common_win = [ "display_server_windows.cpp", "key_mapping_windows.cpp", "joypad_windows.cpp", + "tts_windows.cpp", "windows_terminal_logger.cpp", "vulkan_context_win.cpp", "gl_manager_windows.cpp", diff --git a/platform/windows/detect.py b/platform/windows/detect.py index 249a0d2e79..0b18fb74fb 100644 --- a/platform/windows/detect.py +++ b/platform/windows/detect.py @@ -252,6 +252,7 @@ def configure_msvc(env, manual_msvc_config): "kernel32", "ole32", "oleaut32", + "sapi", "user32", "gdi32", "IPHLPAPI", @@ -426,6 +427,7 @@ def configure_mingw(env): "ws2_32", "kernel32", "oleaut32", + "sapi", "dinput8", "dxguid", "ksuser", diff --git a/platform/windows/display_server_windows.cpp b/platform/windows/display_server_windows.cpp index 31bad0f053..0412eb2d9c 100644 --- a/platform/windows/display_server_windows.cpp +++ b/platform/windows/display_server_windows.cpp @@ -84,6 +84,7 @@ bool DisplayServerWindows::has_feature(Feature p_feature) const { case FEATURE_NATIVE_ICON: case FEATURE_SWAP_BUFFERS: case FEATURE_KEEP_SCREEN_ON: + case FEATURE_TEXT_TO_SPEECH: return true; default: return false; @@ -133,6 +134,41 @@ void DisplayServerWindows::_set_mouse_mode_impl(MouseMode p_mode) { } } +bool DisplayServerWindows::tts_is_speaking() const { + ERR_FAIL_COND_V(!tts, false); + return tts->is_speaking(); +} + +bool DisplayServerWindows::tts_is_paused() const { + ERR_FAIL_COND_V(!tts, false); + return tts->is_paused(); +} + +Array DisplayServerWindows::tts_get_voices() const { + ERR_FAIL_COND_V(!tts, Array()); + return tts->get_voices(); +} + +void DisplayServerWindows::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) { + ERR_FAIL_COND(!tts); + tts->speak(p_text, p_voice, p_volume, p_pitch, p_rate, p_utterance_id, p_interrupt); 
+} + +void DisplayServerWindows::tts_pause() { + ERR_FAIL_COND(!tts); + tts->pause(); +} + +void DisplayServerWindows::tts_resume() { + ERR_FAIL_COND(!tts); + tts->resume(); +} + +void DisplayServerWindows::tts_stop() { + ERR_FAIL_COND(!tts); + tts->stop(); +} + void DisplayServerWindows::mouse_set_mode(MouseMode p_mode) { _THREAD_SAFE_METHOD_ @@ -3497,6 +3533,9 @@ DisplayServerWindows::DisplayServerWindows(const String &p_rendering_driver, Win rendering_driver = p_rendering_driver; + // Init TTS + tts = memnew(TTS_Windows); + // Note: Wacom WinTab driver API for pen input, for devices incompatible with Windows Ink. HMODULE wintab_lib = LoadLibraryW(L"wintab32.dll"); if (wintab_lib) { @@ -3739,4 +3778,8 @@ DisplayServerWindows::~DisplayServerWindows() { gl_manager = nullptr; } #endif + if (tts) { + memdelete(tts); + } + CoUninitialize(); } diff --git a/platform/windows/display_server_windows.h b/platform/windows/display_server_windows.h index fcf4b5a728..80faf71bd4 100644 --- a/platform/windows/display_server_windows.h +++ b/platform/windows/display_server_windows.h @@ -46,6 +46,7 @@ #include "servers/rendering/renderer_compositor.h" #include "servers/rendering/renderer_rd/renderer_compositor_rd.h" #include "servers/rendering_server.h" +#include "tts_windows.h" #ifdef XAUDIO2_ENABLED #include "drivers/xaudio2/audio_driver_xaudio2.h" @@ -320,6 +321,8 @@ class DisplayServerWindows : public DisplayServer { String rendering_driver; bool app_focused = false; + TTS_Windows *tts = nullptr; + struct WindowData { HWND hWnd; //layered window @@ -454,6 +457,15 @@ public: virtual bool has_feature(Feature p_feature) const override; virtual String get_name() const override; + virtual bool tts_is_speaking() const override; + virtual bool tts_is_paused() const override; + virtual Array tts_get_voices() const override; + + virtual void tts_speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int p_utterance_id = 0, bool 
p_interrupt = false) override; + virtual void tts_pause() override; + virtual void tts_resume() override; + virtual void tts_stop() override; + virtual void mouse_set_mode(MouseMode p_mode) override; virtual MouseMode mouse_get_mode() const override; diff --git a/platform/windows/tts_windows.cpp b/platform/windows/tts_windows.cpp new file mode 100644 index 0000000000..05249934ba --- /dev/null +++ b/platform/windows/tts_windows.cpp @@ -0,0 +1,269 @@ +/*************************************************************************/ +/* tts_windows.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. 
*/ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#include "tts_windows.h" + +TTS_Windows *TTS_Windows::singleton = nullptr; + +void __stdcall TTS_Windows::speech_event_callback(WPARAM wParam, LPARAM lParam) { + TTS_Windows *tts = TTS_Windows::get_singleton(); + SPEVENT event; + while (tts->synth->GetEvents(1, &event, NULL) == S_OK) { + if (tts->ids.has(event.ulStreamNum)) { + if (event.eEventId == SPEI_START_INPUT_STREAM) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_STARTED, tts->ids[event.ulStreamNum].id); + } else if (event.eEventId == SPEI_END_INPUT_STREAM) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_ENDED, tts->ids[event.ulStreamNum].id); + tts->ids.erase(event.ulStreamNum); + tts->_update_tts(); + } else if (event.eEventId == SPEI_WORD_BOUNDARY) { + const Char16String &string = tts->ids[event.ulStreamNum].string; + int pos = 0; + for (int i = 0; i < MIN(event.lParam, string.length()); i++) { + char16_t c = string[i]; + if ((c & 0xfffffc00) == 0xd800) { + i++; + } + pos++; + } + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_BOUNDARY, tts->ids[event.ulStreamNum].id, pos - tts->ids[event.ulStreamNum].offset); + } + } + } +} + +void TTS_Windows::_update_tts() { + if (!is_speaking() && !paused && queue.size() > 0) { + DisplayServer::TTSUtterance &message = queue.front()->get(); + + String text; + 
DWORD flags = SPF_ASYNC | SPF_PURGEBEFORESPEAK | SPF_IS_XML; + String pitch_tag = String("<pitch absmiddle=\"") + String::num_int64(message.pitch * 10 - 10, 10) + String("\">"); + text = pitch_tag + message.text + String("</pitch>"); + + IEnumSpObjectTokens *cpEnum; + ISpObjectToken *cpVoiceToken; + ULONG ulCount = 0; + ULONG stream_number = 0; + ISpObjectTokenCategory *cpCategory; + HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory); + if (SUCCEEDED(hr)) { + hr = cpCategory->SetId(SPCAT_VOICES, false); + if (SUCCEEDED(hr)) { + hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum); + if (SUCCEEDED(hr)) { + hr = cpEnum->GetCount(&ulCount); + while (SUCCEEDED(hr) && ulCount--) { + wchar_t *w_id = 0L; + hr = cpEnum->Next(1, &cpVoiceToken, nullptr); + cpVoiceToken->GetId(&w_id); + if (String::utf16((const char16_t *)w_id) == message.voice) { + synth->SetVoice(cpVoiceToken); + cpVoiceToken->Release(); + break; + } + cpVoiceToken->Release(); + } + cpEnum->Release(); + } + } + cpCategory->Release(); + } + + UTData ut; + ut.string = text.utf16(); + ut.offset = pitch_tag.length(); // Subtract injected <pitch> tag offset. 
+ ut.id = message.id; + + synth->SetVolume(message.volume); + synth->SetRate(10.f * log10(message.rate) / log10(3.f)); + synth->Speak((LPCWSTR)ut.string.get_data(), flags, &stream_number); + + ids[stream_number] = ut; + + queue.pop_front(); + } +} + +bool TTS_Windows::is_speaking() const { + ERR_FAIL_COND_V(!synth, false); + + SPVOICESTATUS status; + synth->GetStatus(&status, nullptr); + return (status.dwRunningState == SPRS_IS_SPEAKING); +} + +bool TTS_Windows::is_paused() const { + ERR_FAIL_COND_V(!synth, false); + return paused; +} + +Array TTS_Windows::get_voices() const { + Array list; + IEnumSpObjectTokens *cpEnum; + ISpObjectToken *cpVoiceToken; + ISpDataKey *cpDataKeyAttribs; + ULONG ulCount = 0; + ISpObjectTokenCategory *cpCategory; + HRESULT hr = CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_INPROC_SERVER, IID_ISpObjectTokenCategory, (void **)&cpCategory); + if (SUCCEEDED(hr)) { + hr = cpCategory->SetId(SPCAT_VOICES, false); + if (SUCCEEDED(hr)) { + hr = cpCategory->EnumTokens(nullptr, nullptr, &cpEnum); + if (SUCCEEDED(hr)) { + hr = cpEnum->GetCount(&ulCount); + while (SUCCEEDED(hr) && ulCount--) { + hr = cpEnum->Next(1, &cpVoiceToken, nullptr); + HRESULT hr_attr = cpVoiceToken->OpenKey(SPTOKENKEY_ATTRIBUTES, &cpDataKeyAttribs); + if (SUCCEEDED(hr_attr)) { + wchar_t *w_id = nullptr; + wchar_t *w_lang = nullptr; + wchar_t *w_name = nullptr; + cpVoiceToken->GetId(&w_id); + cpDataKeyAttribs->GetStringValue(L"Language", &w_lang); + cpDataKeyAttribs->GetStringValue(nullptr, &w_name); + LCID locale = wcstol(w_lang, nullptr, 16); + + int locale_chars = GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, nullptr, 0); + int region_chars = GetLocaleInfoW(locale, LOCALE_SISO3166CTRYNAME, nullptr, 0); + wchar_t *w_lang_code = new wchar_t[locale_chars]; + wchar_t *w_reg_code = new wchar_t[region_chars]; + GetLocaleInfoW(locale, LOCALE_SISO639LANGNAME, w_lang_code, locale_chars); + GetLocaleInfoW(locale, LOCALE_SISO3166CTRYNAME, w_reg_code, region_chars); + 
+ Dictionary voice_d; + voice_d["id"] = String::utf16((const char16_t *)w_id); + if (w_name) { + voice_d["name"] = String::utf16((const char16_t *)w_name); + } else { + voice_d["name"] = voice_d["id"].operator String().replace("HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech\\Voices\\Tokens\\", ""); + } + voice_d["language"] = String::utf16((const char16_t *)w_lang_code) + "_" + String::utf16((const char16_t *)w_reg_code); + list.push_back(voice_d); + + delete[] w_lang_code; + delete[] w_reg_code; + + cpDataKeyAttribs->Release(); + } + cpVoiceToken->Release(); + } + cpEnum->Release(); + } + } + cpCategory->Release(); + } + return list; +} + +void TTS_Windows::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) { + ERR_FAIL_COND(!synth); + if (p_interrupt) { + stop(); + } + + if (p_text.is_empty()) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, p_utterance_id); + return; + } + + DisplayServer::TTSUtterance message; + message.text = p_text; + message.voice = p_voice; + message.volume = CLAMP(p_volume, 0, 100); + message.pitch = CLAMP(p_pitch, 0.f, 2.f); + message.rate = CLAMP(p_rate, 0.1f, 10.f); + message.id = p_utterance_id; + queue.push_back(message); + + if (is_paused()) { + resume(); + } else { + _update_tts(); + } +} + +void TTS_Windows::pause() { + ERR_FAIL_COND(!synth); + if (!paused) { + if (synth->Pause() == S_OK) { + paused = true; + } + } +} + +void TTS_Windows::resume() { + ERR_FAIL_COND(!synth); + synth->Resume(); + paused = false; +} + +void TTS_Windows::stop() { + ERR_FAIL_COND(!synth); + + SPVOICESTATUS status; + synth->GetStatus(&status, nullptr); + if (ids.has(status.ulCurrentStream)) { + DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, ids[status.ulCurrentStream].id); + ids.erase(status.ulCurrentStream); + } + for (DisplayServer::TTSUtterance &message : queue) { + 
DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, message.id); + } + queue.clear(); + synth->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr); + synth->Resume(); + paused = false; +} + +TTS_Windows *TTS_Windows::get_singleton() { + return singleton; +} + +TTS_Windows::TTS_Windows() { + singleton = this; + CoInitialize(nullptr); + + if (SUCCEEDED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, (void **)&synth))) { + ULONGLONG event_mask = SPFEI(SPEI_END_INPUT_STREAM) | SPFEI(SPEI_START_INPUT_STREAM) | SPFEI(SPEI_WORD_BOUNDARY); + synth->SetInterest(event_mask, event_mask); + synth->SetNotifyCallbackFunction(&speech_event_callback, (WPARAM)(this), 0); + print_verbose("Text-to-Speech: SAPI initialized."); + } else { + print_verbose("Text-to-Speech: Cannot initialize ISpVoice!"); + } +} + +TTS_Windows::~TTS_Windows() { + if (synth) { + synth->Release(); + } + singleton = nullptr; +} diff --git a/platform/windows/tts_windows.h b/platform/windows/tts_windows.h new file mode 100644 index 0000000000..5da404baf9 --- /dev/null +++ b/platform/windows/tts_windows.h @@ -0,0 +1,80 @@ +/*************************************************************************/ +/* tts_windows.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md). 
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/*************************************************************************/ + +#ifndef TTS_WINDOWS_H +#define TTS_WINDOWS_H + +#include "core/string/ustring.h" +#include "core/templates/list.h" +#include "core/templates/map.h" +#include "core/variant/array.h" +#include "servers/display_server.h" + +#include <objbase.h> +#include <sapi.h> +#include <wchar.h> +#include <winnls.h> + +#define WIN32_LEAN_AND_MEAN +#include <windows.h> + +class TTS_Windows { + List<DisplayServer::TTSUtterance> queue; + ISpVoice *synth = nullptr; + bool paused = false; + struct UTData { + Char16String string; + int offset; + int id; + }; + Map<ULONG, UTData> ids; + + static void __stdcall speech_event_callback(WPARAM wParam, LPARAM lParam); + void _update_tts(); + + static TTS_Windows *singleton; + +public: + static TTS_Windows *get_singleton(); + + bool is_speaking() const; + bool is_paused() const; + Array get_voices() const; + + void speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int p_utterance_id = 0, bool p_interrupt = false); + void pause(); + void resume(); + void stop(); + + TTS_Windows(); + ~TTS_Windows(); +}; + +#endif // TTS_WINDOWS_H |