From 4638ade13f7cdc449656e9ff985420607a1fe299 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pa=CC=84vels=20Nadtoc=CC=8Cajevs?=
 <7645683+bruvzg@users.noreply.github.com>
Date: Tue, 1 Apr 2025 11:59:04 +0300
Subject: [PATCH] Enable TTS on demand, instead of fully disabling it when
 project setting is not set.

---
 doc/classes/DisplayServer.xml                 |  9 --
 doc/classes/ProjectSettings.xml               |  2 +-
 platform/android/tts_android.cpp              | 97 +++++++++++++------
 platform/android/tts_android.h                |  2 +
 platform/ios/display_server_ios.h             |  2 +
 platform/ios/display_server_ios.mm            | 40 ++++++--
 .../wayland/display_server_wayland.cpp        | 30 +++++-
 .../linuxbsd/wayland/display_server_wayland.h |  2 +
 platform/linuxbsd/x11/display_server_x11.cpp  | 41 ++++++--
 platform/linuxbsd/x11/display_server_x11.h    |  2 +
 platform/macos/display_server_macos.h         |  2 +
 platform/macos/display_server_macos.mm        | 41 ++++++--
 platform/web/display_server_web.cpp           | 10 +-
 platform/web/display_server_web.h             |  1 -
 platform/windows/display_server_windows.cpp   | 41 ++++++--
 platform/windows/display_server_windows.h     |  2 +
 16 files changed, 240 insertions(+), 84 deletions(-)
diff --git a/doc/classes/DisplayServer.xml b/doc/classes/DisplayServer.xml
index 7e6e579114e..2922ebe3d2b 100644
--- a/doc/classes/DisplayServer.xml
+++ b/doc/classes/DisplayServer.xml
@@ -1325,7 +1325,6 @@
 				- [code]language[/code] is language code in [code]lang_Variant[/code] format. The [code]lang[/code] part is a 2 or 3-letter code based on the ISO-639 standard, in lowercase. The [code skip-lint]Variant[/code] part is an engine-dependent string describing country, region or/and dialect.
 				Note that Godot depends on system libraries for text-to-speech functionality. These libraries are installed by default on Windows and macOS, but not on all Linux distributions. If they are not present, this method will return an empty list. This applies to both Godot users on Linux, as well as end-users on Linux running Godot games that use text-to-speech.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_get_voices_for_language" qualifiers="const">
@@ -1334,7 +1333,6 @@
 			<description>
 				Returns an [PackedStringArray] of voice identifiers for the [param language].
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_is_paused" qualifiers="const">
@@ -1342,7 +1340,6 @@
 			<description>
 				Returns [code]true[/code] if the synthesizer is in a paused state.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_is_speaking" qualifiers="const">
@@ -1350,7 +1347,6 @@
 			<description>
 				Returns [code]true[/code] if the synthesizer is generating speech, or have utterance waiting in the queue.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_pause">
@@ -1358,7 +1354,6 @@
 			<description>
 				Puts the synthesizer into a paused state.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_resume">
@@ -1366,7 +1361,6 @@
 			<description>
 				Resumes the synthesizer if it was paused.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_set_utterance_callback">
@@ -1379,7 +1373,6 @@
 				- [constant TTS_UTTERANCE_BOUNDARY] callable's method should take two [int] parameters, the index of the character and the utterance ID.
 				[b]Note:[/b] The granularity of the boundary callbacks is engine dependent.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_speak">
@@ -1401,7 +1394,6 @@
 				[b]Note:[/b] On Windows and Linux (X11/Wayland), utterance [param text] can use SSML markup. SSML support is engine and voice dependent. If the engine does not support SSML, you should strip out all XML markup before calling [method tts_speak].
 				[b]Note:[/b] The granularity of pitch, rate, and volume is engine and voice dependent. Values may be truncated.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Wayland), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="tts_stop">
@@ -1409,7 +1401,6 @@
 			<description>
 				Stops synthesis in progress and removes all utterances from the queue.
 				[b]Note:[/b] This method is implemented on Android, iOS, Web, Linux (X11/Linux), macOS, and Windows.
-				[b]Note:[/b] [member ProjectSettings.audio/general/text_to_speech] should be [code]true[/code] to use text-to-speech.
 			</description>
 		</method>
 		<method name="unregister_additional_output">
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index 5efb6682284..6f40c59c193 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -427,7 +427,7 @@
 			Sets the [url=https://developer.apple.com/documentation/avfaudio/avaudiosessioncategory]AVAudioSessionCategory[/url] on iOS. Use the [code]Playback[/code] category to get sound output, even if the phone is in silent mode.
 		</member>
 		<member name="audio/general/text_to_speech" type="bool" setter="" getter="" default="false">
-			If [code]true[/code], text-to-speech support is enabled, see [method DisplayServer.tts_get_voices] and [method DisplayServer.tts_speak].
+			If [code]true[/code], text-to-speech support is enabled on startup, otherwise it is enabled first time TTS method is used, see [method DisplayServer.tts_get_voices] and [method DisplayServer.tts_speak].
 			[b]Note:[/b] Enabling TTS can cause addition idle CPU usage and interfere with the sleep mode, so consider disabling it if TTS is not used.
 		</member>
 		<member name="audio/video/video_delay_compensation_ms" type="int" setter="" getter="" default="0">
diff --git a/platform/android/tts_android.cpp b/platform/android/tts_android.cpp
index d560f853f82..3dd9ff7b302 100644
--- a/platform/android/tts_android.cpp
+++ b/platform/android/tts_android.cpp
@@ -49,30 +49,37 @@ jmethodID TTS_Android::_stop_speaking = nullptr;
 
 HashMap<int, Char16String> TTS_Android::ids;
 
+void TTS_Android::initialize_tts() {
+	JNIEnv *env = get_jni_env();
+	ERR_FAIL_NULL(env);
+
+	if (_init) {
+		env->CallVoidMethod(tts, _init);
+		initialized = true;
+	}
+}
+
 void TTS_Android::setup(jobject p_tts) {
+	JNIEnv *env = get_jni_env();
+	ERR_FAIL_NULL(env);
+
+	tts = env->NewGlobalRef(p_tts);
+
+	jclass c = env->GetObjectClass(tts);
+	cls = (jclass)env->NewGlobalRef(c);
+
+	_init = env->GetMethodID(cls, "init", "()V");
+	_is_speaking = env->GetMethodID(cls, "isSpeaking", "()Z");
+	_is_paused = env->GetMethodID(cls, "isPaused", "()Z");
+	_get_voices = env->GetMethodID(cls, "getVoices", "()[Ljava/lang/String;");
+	_speak = env->GetMethodID(cls, "speak", "(Ljava/lang/String;Ljava/lang/String;IFFIZ)V");
+	_pause_speaking = env->GetMethodID(cls, "pauseSpeaking", "()V");
+	_resume_speaking = env->GetMethodID(cls, "resumeSpeaking", "()V");
+	_stop_speaking = env->GetMethodID(cls, "stopSpeaking", "()V");
+
 	bool tts_enabled = GLOBAL_GET("audio/general/text_to_speech");
 	if (tts_enabled) {
-		JNIEnv *env = get_jni_env();
-		ERR_FAIL_NULL(env);
-
-		tts = env->NewGlobalRef(p_tts);
-
-		jclass c = env->GetObjectClass(tts);
-		cls = (jclass)env->NewGlobalRef(c);
-
-		_init = env->GetMethodID(cls, "init", "()V");
-		_is_speaking = env->GetMethodID(cls, "isSpeaking", "()Z");
-		_is_paused = env->GetMethodID(cls, "isPaused", "()Z");
-		_get_voices = env->GetMethodID(cls, "getVoices", "()[Ljava/lang/String;");
-		_speak = env->GetMethodID(cls, "speak", "(Ljava/lang/String;Ljava/lang/String;IFFIZ)V");
-		_pause_speaking = env->GetMethodID(cls, "pauseSpeaking", "()V");
-		_resume_speaking = env->GetMethodID(cls, "resumeSpeaking", "()V");
-		_stop_speaking = env->GetMethodID(cls, "stopSpeaking", "()V");
-
-		if (_init) {
-			env->CallVoidMethod(tts, _init);
-			initialized = true;
-		}
+		initialize_tts();
 	}
 }
 
@@ -80,12 +87,19 @@ void TTS_Android::terminate() {
 	JNIEnv *env = get_jni_env();
 	ERR_FAIL_NULL(env);
 
-	env->DeleteGlobalRef(cls);
-	env->DeleteGlobalRef(tts);
+	if (cls) {
+		env->DeleteGlobalRef(cls);
+	}
+	if (tts) {
+		env->DeleteGlobalRef(tts);
+	}
 }
 
 void TTS_Android::_java_utterance_callback(int p_event, int p_id, int p_pos) {
-	ERR_FAIL_COND_MSG(!initialized, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	if (ids.has(p_id)) {
 		int pos = 0;
 		if ((DisplayServer::TTSUtteranceEvent)p_event == DisplayServer::TTS_UTTERANCE_BOUNDARY) {
@@ -106,7 +120,10 @@ void TTS_Android::_java_utterance_callback(int p_event, int p_id, int p_pos) {
 }
 
 bool TTS_Android::is_speaking() {
-	ERR_FAIL_COND_V_MSG(!initialized, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	if (_is_speaking) {
 		JNIEnv *env = get_jni_env();
 
@@ -118,7 +135,10 @@ bool TTS_Android::is_speaking() {
 }
 
 bool TTS_Android::is_paused() {
-	ERR_FAIL_COND_V_MSG(!initialized, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	if (_is_paused) {
 		JNIEnv *env = get_jni_env();
 
@@ -130,7 +150,10 @@ bool TTS_Android::is_paused() {
 }
 
 Array TTS_Android::get_voices() {
-	ERR_FAIL_COND_V_MSG(!initialized, Array(), "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, Array());
 	Array list;
 	if (_get_voices) {
 		JNIEnv *env = get_jni_env();
@@ -158,7 +181,10 @@ Array TTS_Android::get_voices() {
 }
 
 void TTS_Android::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
-	ERR_FAIL_COND_MSG(!initialized, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	if (p_interrupt) {
 		stop();
 	}
@@ -183,7 +209,10 @@ void TTS_Android::speak(const String &p_text, const String &p_voice, int p_volum
 }
 
 void TTS_Android::pause() {
-	ERR_FAIL_COND_MSG(!initialized, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	if (_pause_speaking) {
 		JNIEnv *env = get_jni_env();
 
@@ -193,7 +222,10 @@ void TTS_Android::pause() {
 }
 
 void TTS_Android::resume() {
-	ERR_FAIL_COND_MSG(!initialized, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	if (_resume_speaking) {
 		JNIEnv *env = get_jni_env();
 
@@ -203,7 +235,10 @@ void TTS_Android::resume() {
 }
 
 void TTS_Android::stop() {
-	ERR_FAIL_COND_MSG(!initialized, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!initialized)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	for (const KeyValue<int, Char16String> &E : ids) {
 		DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, E.key);
 	}
diff --git a/platform/android/tts_android.h b/platform/android/tts_android.h
index 86068e20c4f..063af55271e 100644
--- a/platform/android/tts_android.h
+++ b/platform/android/tts_android.h
@@ -54,6 +54,8 @@ class TTS_Android {
 
 	static HashMap<int, Char16String> ids;
 
+	static void initialize_tts();
+
 public:
 	static void setup(jobject p_tts);
 	static void terminate();
diff --git a/platform/ios/display_server_ios.h b/platform/ios/display_server_ios.h
index b88e4115610..f0cd2fceda6 100644
--- a/platform/ios/display_server_ios.h
+++ b/platform/ios/display_server_ios.h
@@ -83,6 +83,8 @@ class DisplayServerIOS : public DisplayServer {
 
 	void perform_event(const Ref<InputEvent> &p_event);
 
+	void initialize_tts() const;
+
 	DisplayServerIOS(const String &p_rendering_driver, DisplayServer::WindowMode p_mode, DisplayServer::VSyncMode p_vsync_mode, uint32_t p_flags, const Vector2i *p_position, const Vector2i &p_resolution, int p_screen, Context p_context, int64_t p_parent_window, Error &r_error);
 	~DisplayServerIOS();
 
diff --git a/platform/ios/display_server_ios.mm b/platform/ios/display_server_ios.mm
index 3fbc7f467d5..4a5a05d0ece 100644
--- a/platform/ios/display_server_ios.mm
+++ b/platform/ios/display_server_ios.mm
@@ -61,7 +61,7 @@ DisplayServerIOS::DisplayServerIOS(const String &p_rendering_driver, WindowMode
 	// Init TTS
 	bool tts_enabled = GLOBAL_GET("audio/general/text_to_speech");
 	if (tts_enabled) {
-		tts = [[TTS_IOS alloc] init];
+		initialize_tts();
 	}
 	native_menu = memnew(NativeMenu);
 
@@ -389,39 +389,63 @@ bool DisplayServerIOS::has_feature(Feature p_feature) const {
 String DisplayServerIOS::get_name() const {
 	return "iOS";
 }
+void DisplayServerIOS::initialize_tts() const {
+	const_cast<DisplayServerIOS *>(this)->tts = [[TTS_IOS alloc] init];
+}
 
 bool DisplayServerIOS::tts_is_speaking() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return [tts isSpeaking];
 }
 
 bool DisplayServerIOS::tts_is_paused() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return [tts isPaused];
 }
 
 TypedArray<Dictionary> DisplayServerIOS::tts_get_voices() const {
-	ERR_FAIL_NULL_V_MSG(tts, TypedArray<Dictionary>(), "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, TypedArray<Dictionary>());
 	return [tts getVoices];
 }
 
 void DisplayServerIOS::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts speak:p_text voice:p_voice volume:p_volume pitch:p_pitch rate:p_rate utterance_id:p_utterance_id interrupt:p_interrupt];
 }
 
 void DisplayServerIOS::tts_pause() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts pauseSpeaking];
 }
 
 void DisplayServerIOS::tts_resume() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts resumeSpeaking];
 }
 
 void DisplayServerIOS::tts_stop() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts stopSpeaking];
 }
 
diff --git a/platform/linuxbsd/wayland/display_server_wayland.cpp b/platform/linuxbsd/wayland/display_server_wayland.cpp
index 160b1a364b4..1fe5df1d770 100644
--- a/platform/linuxbsd/wayland/display_server_wayland.cpp
+++ b/platform/linuxbsd/wayland/display_server_wayland.cpp
@@ -245,37 +245,62 @@ String DisplayServerWayland::get_name() const {
 
 #ifdef SPEECHD_ENABLED
 
+void DisplayServerWayland::initialize_tts() const {
+	const_cast<DisplayServerWayland *>(this)->tts = memnew(TTS_Linux);
+}
+
 bool DisplayServerWayland::tts_is_speaking() const {
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
 	ERR_FAIL_NULL_V(tts, false);
 	return tts->is_speaking();
 }
 
 bool DisplayServerWayland::tts_is_paused() const {
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
 	ERR_FAIL_NULL_V(tts, false);
 	return tts->is_paused();
 }
 
 TypedArray<Dictionary> DisplayServerWayland::tts_get_voices() const {
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
 	ERR_FAIL_NULL_V(tts, TypedArray<Dictionary>());
 	return tts->get_voices();
 }
 
 void DisplayServerWayland::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
 	ERR_FAIL_NULL(tts);
 	tts->speak(p_text, p_voice, p_volume, p_pitch, p_rate, p_utterance_id, p_interrupt);
 }
 
 void DisplayServerWayland::tts_pause() {
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
 	ERR_FAIL_NULL(tts);
 	tts->pause();
 }
 
 void DisplayServerWayland::tts_resume() {
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
 	ERR_FAIL_NULL(tts);
 	tts->resume();
 }
 
 void DisplayServerWayland::tts_stop() {
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
 	ERR_FAIL_NULL(tts);
 	tts->stop();
 }
@@ -1439,7 +1464,10 @@ DisplayServerWayland::DisplayServerWayland(const String &p_rendering_driver, Win
 
 #ifdef SPEECHD_ENABLED
 	// Init TTS
-	tts = memnew(TTS_Linux);
+	bool tts_enabled = GLOBAL_GET("audio/general/text_to_speech");
+	if (tts_enabled) {
+		initialize_tts();
+	}
 #endif
 
 	rendering_driver = p_rendering_driver;
diff --git a/platform/linuxbsd/wayland/display_server_wayland.h b/platform/linuxbsd/wayland/display_server_wayland.h
index 1b32cb75f37..c5f7e0548d2 100644
--- a/platform/linuxbsd/wayland/display_server_wayland.h
+++ b/platform/linuxbsd/wayland/display_server_wayland.h
@@ -165,6 +165,8 @@ class DisplayServerWayland : public DisplayServer {
 
 	void try_suspend();
 
+	void initialize_tts() const;
+
 public:
 	virtual bool has_feature(Feature p_feature) const override;
 
diff --git a/platform/linuxbsd/x11/display_server_x11.cpp b/platform/linuxbsd/x11/display_server_x11.cpp
index 577e2d19593..edc6f848809 100644
--- a/platform/linuxbsd/x11/display_server_x11.cpp
+++ b/platform/linuxbsd/x11/display_server_x11.cpp
@@ -340,38 +340,63 @@ void DisplayServerX11::_flush_mouse_motion() {
 
 #ifdef SPEECHD_ENABLED
 
+void DisplayServerX11::initialize_tts() const {
+	const_cast<DisplayServerX11 *>(this)->tts = memnew(TTS_Linux);
+}
+
 bool DisplayServerX11::tts_is_speaking() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return tts->is_speaking();
 }
 
 bool DisplayServerX11::tts_is_paused() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return tts->is_paused();
 }
 
 TypedArray<Dictionary> DisplayServerX11::tts_get_voices() const {
-	ERR_FAIL_NULL_V_MSG(tts, TypedArray<Dictionary>(), "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, TypedArray<Dictionary>());
 	return tts->get_voices();
 }
 
 void DisplayServerX11::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->speak(p_text, p_voice, p_volume, p_pitch, p_rate, p_utterance_id, p_interrupt);
 }
 
 void DisplayServerX11::tts_pause() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->pause();
 }
 
 void DisplayServerX11::tts_resume() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->resume();
 }
 
 void DisplayServerX11::tts_stop() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->stop();
 }
 
@@ -6781,7 +6806,7 @@ DisplayServerX11::DisplayServerX11(const String &p_rendering_driver, WindowMode
 	// Init TTS
 	bool tts_enabled = GLOBAL_GET("audio/general/text_to_speech");
 	if (tts_enabled) {
-		tts = memnew(TTS_Linux);
+		initialize_tts();
 	}
 #endif
 
diff --git a/platform/linuxbsd/x11/display_server_x11.h b/platform/linuxbsd/x11/display_server_x11.h
index 56c9b0d952f..fe9322ae565 100644
--- a/platform/linuxbsd/x11/display_server_x11.h
+++ b/platform/linuxbsd/x11/display_server_x11.h
@@ -394,6 +394,8 @@ class DisplayServerX11 : public DisplayServer {
 	void _set_window_taskbar_pager_enabled(Window p_window, bool p_enabled);
 	Rect2i _screens_get_full_rect() const;
 
+	void initialize_tts() const;
+
 protected:
 	void _window_changed(XEvent *event);
 
diff --git a/platform/macos/display_server_macos.h b/platform/macos/display_server_macos.h
index 8119d07a534..e6cfd976ae4 100644
--- a/platform/macos/display_server_macos.h
+++ b/platform/macos/display_server_macos.h
@@ -237,6 +237,8 @@ private:
 
 	Error _file_dialog_with_options_show(const String &p_title, const String &p_current_directory, const String &p_root, const String &p_filename, bool p_show_hidden, FileDialogMode p_mode, const Vector<String> &p_filters, const TypedArray<Dictionary> &p_options, const Callable &p_callback, bool p_options_in_cb, WindowID p_window_id);
 
+	void initialize_tts() const;
+
 public:
 	void menu_callback(id p_sender);
 
diff --git a/platform/macos/display_server_macos.mm b/platform/macos/display_server_macos.mm
index 610b2cdebb7..942d3a27e3e 100644
--- a/platform/macos/display_server_macos.mm
+++ b/platform/macos/display_server_macos.mm
@@ -858,38 +858,63 @@ Callable DisplayServerMacOS::_help_get_action_callback() const {
 	return help_action_callback;
 }
 
+void DisplayServerMacOS::initialize_tts() const {
+	const_cast<DisplayServerMacOS *>(this)->tts = [[TTS_MacOS alloc] init];
+}
+
 bool DisplayServerMacOS::tts_is_speaking() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return [tts isSpeaking];
 }
 
 bool DisplayServerMacOS::tts_is_paused() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return [tts isPaused];
 }
 
 TypedArray<Dictionary> DisplayServerMacOS::tts_get_voices() const {
-	ERR_FAIL_NULL_V_MSG(tts, Array(), "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, TypedArray<Dictionary>());
 	return [tts getVoices];
 }
 
 void DisplayServerMacOS::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts speak:p_text voice:p_voice volume:p_volume pitch:p_pitch rate:p_rate utterance_id:p_utterance_id interrupt:p_interrupt];
 }
 
 void DisplayServerMacOS::tts_pause() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts pauseSpeaking];
 }
 
 void DisplayServerMacOS::tts_resume() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts resumeSpeaking];
 }
 
 void DisplayServerMacOS::tts_stop() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	[tts stopSpeaking];
 }
 
@@ -3753,7 +3778,7 @@ DisplayServerMacOS::DisplayServerMacOS(const String &p_rendering_driver, WindowM
 	// Init TTS
 	bool tts_enabled = GLOBAL_GET("audio/general/text_to_speech");
 	if (tts_enabled) {
-		tts = [[TTS_MacOS alloc] init];
+		initialize_tts();
 	}
 
 	native_menu = memnew(NativeMenuMacOS);
diff --git a/platform/web/display_server_web.cpp b/platform/web/display_server_web.cpp
index 180c05ef411..8e99ee27d81 100644
--- a/platform/web/display_server_web.cpp
+++ b/platform/web/display_server_web.cpp
@@ -384,12 +384,10 @@ const char *DisplayServerWeb::godot2dom_cursor(DisplayServer::CursorShape p_shap
 }
 
 bool DisplayServerWeb::tts_is_speaking() const {
-	ERR_FAIL_COND_V_MSG(!tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
 	return godot_js_tts_is_speaking();
 }
 
 bool DisplayServerWeb::tts_is_paused() const {
-	ERR_FAIL_COND_V_MSG(!tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
 	return godot_js_tts_is_paused();
 }
 
@@ -424,13 +422,11 @@ void DisplayServerWeb::_update_voices_callback(const Vector<String> &p_voices) {
 }
 
 TypedArray<Dictionary> DisplayServerWeb::tts_get_voices() const {
-	ERR_FAIL_COND_V_MSG(!tts, TypedArray<Dictionary>(), "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
 	godot_js_tts_get_voices(update_voices_callback);
 	return voices;
 }
 
 void DisplayServerWeb::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
-	ERR_FAIL_COND_MSG(!tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
 	if (p_interrupt) {
 		tts_stop();
 	}
@@ -447,17 +443,14 @@ void DisplayServerWeb::tts_speak(const String &p_text, const String &p_voice, in
 }
 
 void DisplayServerWeb::tts_pause() {
-	ERR_FAIL_COND_MSG(!tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
 	godot_js_tts_pause();
 }
 
 void DisplayServerWeb::tts_resume() {
-	ERR_FAIL_COND_MSG(!tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
 	godot_js_tts_resume();
 }
 
 void DisplayServerWeb::tts_stop() {
-	ERR_FAIL_COND_MSG(!tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
 	for (const KeyValue<int, CharString> &E : utterance_ids) {
 		tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, E.key);
 	}
@@ -1086,7 +1079,6 @@ DisplayServer *DisplayServerWeb::create_func(const String &p_rendering_driver, W
 DisplayServerWeb::DisplayServerWeb(const String &p_rendering_driver, WindowMode p_window_mode, VSyncMode p_vsync_mode, uint32_t p_flags, const Point2i *p_position, const Size2i &p_resolution, int p_screen, Context p_context, int64_t p_parent_window, Error &r_error) {
 	r_error = OK; // Always succeeds for now.
 
-	tts = GLOBAL_GET("audio/general/text_to_speech");
 	native_menu = memnew(NativeMenu); // Dummy native menu.
 
 	// Ensure the canvas ID.
@@ -1199,7 +1191,7 @@ bool DisplayServerWeb::has_feature(Feature p_feature) const {
 		case FEATURE_VIRTUAL_KEYBOARD:
 			return godot_js_display_vk_available() != 0;
 		case FEATURE_TEXT_TO_SPEECH:
-			return tts && (godot_js_display_tts_available() != 0);
+			return godot_js_display_tts_available() != 0;
 		default:
 			return false;
 	}
diff --git a/platform/web/display_server_web.h b/platform/web/display_server_web.h
index 8eadda8539c..858e35bfc33 100644
--- a/platform/web/display_server_web.h
+++ b/platform/web/display_server_web.h
@@ -102,7 +102,6 @@ private:
 	int key_event_pos = 0;
 
 	bool swap_cancel_ok = false;
-	bool tts = false;
 	NativeMenu *native_menu = nullptr;
 
 	MouseMode mouse_mode_base = MOUSE_MODE_VISIBLE;
diff --git a/platform/windows/display_server_windows.cpp b/platform/windows/display_server_windows.cpp
index 9a3f884aeff..79de8c4b135 100644
--- a/platform/windows/display_server_windows.cpp
+++ b/platform/windows/display_server_windows.cpp
@@ -254,38 +254,63 @@ void DisplayServerWindows::_register_raw_input_devices(WindowID p_target_window)
 	}
 }
 
+void DisplayServerWindows::initialize_tts() const {
+	const_cast<DisplayServerWindows *>(this)->tts = memnew(TTS_Windows);
+}
+
 bool DisplayServerWindows::tts_is_speaking() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return tts->is_speaking();
 }
 
 bool DisplayServerWindows::tts_is_paused() const {
-	ERR_FAIL_NULL_V_MSG(tts, false, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, false);
 	return tts->is_paused();
 }
 
 TypedArray<Dictionary> DisplayServerWindows::tts_get_voices() const {
-	ERR_FAIL_NULL_V_MSG(tts, TypedArray<Dictionary>(), "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL_V(tts, TypedArray<Dictionary>());
 	return tts->get_voices();
 }
 
 void DisplayServerWindows::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->speak(p_text, p_voice, p_volume, p_pitch, p_rate, p_utterance_id, p_interrupt);
 }
 
 void DisplayServerWindows::tts_pause() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->pause();
 }
 
 void DisplayServerWindows::tts_resume() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->resume();
 }
 
 void DisplayServerWindows::tts_stop() {
-	ERR_FAIL_NULL_MSG(tts, "Enable the \"audio/general/text_to_speech\" project setting to use text-to-speech.");
+	if (unlikely(!tts)) {
+		initialize_tts();
+	}
+	ERR_FAIL_NULL(tts);
 	tts->stop();
 }
 
@@ -6626,7 +6651,7 @@ DisplayServerWindows::DisplayServerWindows(const String &p_rendering_driver, Win
 	// Init TTS
 	bool tts_enabled = GLOBAL_GET("audio/general/text_to_speech");
 	if (tts_enabled) {
-		tts = memnew(TTS_Windows);
+		initialize_tts();
 	}
 	native_menu = memnew(NativeMenuWindows);
 
diff --git a/platform/windows/display_server_windows.h b/platform/windows/display_server_windows.h
index 432a293d2f0..cf50408f760 100644
--- a/platform/windows/display_server_windows.h
+++ b/platform/windows/display_server_windows.h
@@ -687,6 +687,8 @@ class DisplayServerWindows : public DisplayServer {
 
 	HWND _find_window_from_process_id(OS::ProcessID p_pid, HWND p_current_hwnd);
 
+	void initialize_tts() const;
+
 public:
 	LRESULT WndProcFileDialog(HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lParam);
 	LRESULT WndProc(HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lParam);