support audio ducking for SAPI5 on Windows 11 (#13118)

michaelDCurran · web-flow · commit 20d5a25dced4 · 2022-01-12T13:34:36.000+10:00
Fixes #12913

Summary of the issue:
NVDA's SAPI5 synthDriver instructed Windows to duck and unduck background audio via hooked winmm waveOut functions. However, on Windows 11, it seems that these functions are no longer used by SAPI5, and therefore audio ducking no longer worked for SAPI5.

Description of how this pull request fixes the issue:
Rather than hooking winmm functions, make use of SAPI5's own events and other SynthDriver methods to enable and disable ducking. Specifically:
* On SapiSink.StartStream: enable ducking.
* On SapiSink.EndStream: disable ducking.
* SynthDriver.cancel: disable ducking.
* SynthDriver.pause: disable ducking if pausing, and enable ducking if unpausing.
* SynthDriver.speak: temporarily enable audio ducking around the call to speak, so that audio ducking can enforce its initial delay before speaking (as StartStream and EndStream are asynchronous).
diff --git a/source/synthDrivers/sapi5.py b/source/synthDrivers/sapi5.py
@@ -4,6 +4,7 @@
 # This file is covered by the GNU General Public License.
 # See the file COPYING for more details.
 
+from typing import Optional
 from enum import IntEnum
 import locale
 from collections import OrderedDict
@@ -13,7 +14,6 @@
 from comtypes import COMError
 import winreg
 import audioDucking
-import NVDAHelper
 from synthDriverHandler import SynthDriver, VoiceInfo, synthIndexReached, synthDoneSpeaking
 import config
 import nvwave
@@ -50,85 +50,11 @@ class SpeechVoiceSpeakFlags(IntEnum):
 
 class SpeechVoiceEvents(IntEnum):
 	# https://msdn.microsoft.com/en-us/previous-versions/windows/desktop/ms720886(v=vs.85)
+	StartInputStream = 2
 	EndInputStream = 4
 	Bookmark = 16
 
 
-class FunctionHooker(object):
-	def __init__(
-		self,
-		targetDll: str,
-		importDll: str,
-		funcName: str,
-		newFunction # result of ctypes.WINFUNCTYPE
-	):
-		# dllImportTableHooks_hookSingle expects byte strings.
-		try:
-			self._hook=NVDAHelper.localLib.dllImportTableHooks_hookSingle(
-				targetDll.encode("mbcs"),
-				importDll.encode("mbcs"),
-				funcName.encode("mbcs"),
-				newFunction
-			)
-		except UnicodeEncodeError:
-			log.error("Error encoding FunctionHooker input parameters", exc_info=True)
-			self._hook = None
-		if self._hook:
-			log.debug(f"Hooked {funcName}")
-		else:
-			log.error(f"Could not hook {funcName}")
-			raise RuntimeError(f"Could not hook {funcName}")
-
-	def __del__(self):
-		if self._hook:
-			NVDAHelper.localLib.dllImportTableHooks_unhookSingle(self._hook)
-
-
-_duckersByHandle={}
-
-
-@WINFUNCTYPE(windll.winmm.waveOutOpen.restype,*windll.winmm.waveOutOpen.argtypes,use_errno=False,use_last_error=False)
-def waveOutOpen(pWaveOutHandle,deviceID,wfx,callback,callbackInstance,flags):
-	if audioDucking._isDebug():
-		log.debugWarning("Ducking audio requested for SAPI5 synthdriver")
-	try:
-		res=windll.winmm.waveOutOpen(pWaveOutHandle,deviceID,wfx,callback,callbackInstance,flags) or 0
-	except WindowsError as e:
-		res=e.winerror
-	if res==0 and pWaveOutHandle:
-		h=pWaveOutHandle.contents.value
-		d=audioDucking.AudioDucker()
-		if not d.enable():
-			log.warning("Ducking audio failed for SAPI5 synthdriver")
-		_duckersByHandle[h]=d
-	else:
-		log.warning("Opening wave out failed for SAPI5 synthdriver")
-		log.debugWarning(f"Win Error: {res}\n WaveOutHandle: {pWaveOutHandle}")
-	return res
-
-@WINFUNCTYPE(c_long,c_long)
-def waveOutClose(waveOutHandle):
-	if audioDucking._isDebug():
-		log.debugWarning("End ducking audio requested for SAPI5 synthdriver")
-	try:
-		res=windll.winmm.waveOutClose(waveOutHandle) or 0
-	except WindowsError as e:
-		res=e.winerror
-	if res==0 and waveOutHandle:
-		_duckersByHandle.pop(waveOutHandle,None)
-	else:
-		log.warning("Closing wave out failed for SAPI5 synthdriver")
-		log.debugWarning(f"Res: {res}\n waveOutHandle: {waveOutHandle}")
-	return res
-
-_waveOutHooks=[]
-def ensureWaveOutHooks():
-	if not _waveOutHooks and audioDucking.isAudioDuckingSupported():
-		sapiPath=os.path.join(os.path.expandvars("$SYSTEMROOT"),"system32","speech","common","sapi.dll")
-		_waveOutHooks.append(FunctionHooker(sapiPath,"WINMM.dll","waveOutOpen",waveOutOpen))
-		_waveOutHooks.append(FunctionHooker(sapiPath,"WINMM.dll","waveOutClose",waveOutClose))
-
-
 class SapiSink(object):
 	"""Handles SAPI event notifications.
 	See https://msdn.microsoft.com/en-us/library/ms723587(v=vs.85).aspx
@@ -137,6 +63,16 @@ class SapiSink(object):
 	def __init__(self, synthRef: weakref.ReferenceType):
 		self.synthRef = synthRef
 
+	def StartStream(self, streamNum, pos):
+		synth = self.synthRef()
+		if synth is None:
+			log.debugWarning("Called StartStream method on SapiSink while driver is dead")
+			return
+		if synth._audioDucker:
+			if audioDucking._isDebug():
+				log.debug("Enabling audio ducking due to starting speech stream")
+			synth._audioDucker.enable()
+
 	def Bookmark(self, streamNum, pos, bookmark, bookmarkId):
 		synth = self.synthRef()
 		if synth is None:
@@ -150,6 +86,10 @@ def EndStream(self, streamNum, pos):
 			log.debugWarning("Called Bookmark method on EndStream while driver is dead")
 			return
 		synthDoneSpeaking.notify(synth=synth)
+		if synth._audioDucker:
+			if audioDucking._isDebug():
+				log.debug("Disabling audio ducking due to speech stream end")
+			synth._audioDucker.disable()
 
 
 class SynthDriver(SynthDriver):
@@ -181,13 +121,15 @@ def check(cls):
 			return False
 
 	ttsAudioStream=None #: Holds the ISPAudio interface for the current voice, to aid in stopping and pausing audio
+	_audioDucker: Optional[audioDucking.AudioDucker] = None
 
 	def __init__(self,_defaultVoiceToken=None):
 		"""
 		@param _defaultVoiceToken: an optional sapi voice token which should be used as the default voice (only useful for subclasses)
 		@type _defaultVoiceToken: ISpeechObjectToken
 		"""
-		ensureWaveOutHooks()
+		if audioDucking.isAudioDuckingSupported():
+			self._audioDucker = audioDucking.AudioDucker()
 		self._pitch=50
 		self._initTts(_defaultVoiceToken)
 
@@ -261,7 +203,9 @@ def _initTts(self, voice=None):
 		if outputDeviceID>=0:
 			self.tts.audioOutput=self.tts.getAudioOutputs()[outputDeviceID]
 		self._eventsConnection = comtypes.client.GetEvents(self.tts, SapiSink(weakref.ref(self)))
-		self.tts.EventInterests = SpeechVoiceEvents.Bookmark | SpeechVoiceEvents.EndInputStream
+		self.tts.EventInterests = (
+			SpeechVoiceEvents.StartInputStream | SpeechVoiceEvents.Bookmark | SpeechVoiceEvents.EndInputStream
+		)
 		from comInterfaces.SpeechLib import ISpAudio
 		try:
 			self.ttsAudioStream=self.tts.audioOutputStream.QueryInterface(ISpAudio)
@@ -396,18 +340,74 @@ def outputTags():
 
 		text = "".join(textList)
 		flags = SpeechVoiceSpeakFlags.IsXML | SpeechVoiceSpeakFlags.Async
-		self.tts.Speak(text, flags)
+		# Ducking should be complete before the synth starts producing audio.
+		# For this to happen, the speech method must block until ducking is complete.
+		# Ducking should be disabled when the synth is finished producing audio.
+		# Note that there may be calls to speak with a string that results in no audio,
+		# it is important that in this case the audio does not get stuck ducked.
+		# When there is no audio produced, the StartStream and EndStream handlers are not called.
+		# To prevent audio getting stuck ducked, the temporary ducking is released when this call to speak returns.
+		# There are some known issues:
+		# - When there is no audio produced by the synth, a user may notice volume lowering (ducking) temporarily.
+		# - If the call to the StartStream handler is delayed significantly, users may notice a variation in volume
+		# (as ducking is disabled at the end of speak, and re-enabled when the StartStream handler is called).
+		
+		# A note on the synchronicity of components of this approach:
+		# SapiSink.StartStream event handler (callback):
+		# the synth speech is not blocked by this event callback.
+		# SapiSink.EndStream event handler (callback):
+		# assumed also to be async but not confirmed. Synchronicity is irrelevant to the current approach.
+		# AudioDucker.disable() returns before the audio is completely unducked.
+		# AudioDucker.enable() does not return until ducking is complete.
+		# It is not possible to "double duck" the audio; calling enable twice yields the same result as calling once.
+		# AudioDucker class instances count the number of enables/disables,
+		# in order to unduck there must be no remaining enabled audio ducker instances.
+		# Due to this a temporary audio ducker is used around the call to speak.
+		# SapiSink.StartStream: relying on ducking in this handler alone may allow early speech to start before ducking is completed.
+		if audioDucking.isAudioDuckingSupported():
+			tempAudioDucker = audioDucking.AudioDucker()
+		else:
+			tempAudioDucker = None
+		if tempAudioDucker:
+			if audioDucking._isDebug():
+				log.debug("Enabling audio ducking due to speak call")
+			tempAudioDucker.enable()
+		try:
+			self.tts.Speak(text, flags)
+		finally:
+			if tempAudioDucker:
+				if audioDucking._isDebug():
+					log.debug("Disabling audio ducking  after speak call")
+				tempAudioDucker.disable()
 
 	def cancel(self):
 		# SAPI5's default means of stopping speech can sometimes lag at end of speech, especially with Win8 / Win 10 Microsoft Voices.
 		# Therefore  instruct the underlying audio interface to stop first, before interupting and purging any remaining speech.
 		if self.ttsAudioStream:
 			self.ttsAudioStream.setState(SPAudioState.STOP, 0)
 		self.tts.Speak(None, SpeechVoiceSpeakFlags.Async | SpeechVoiceSpeakFlags.PurgeBeforeSpeak)
+		if self._audioDucker:
+			if audioDucking._isDebug():
+				log.debug("Disabling audio ducking due to setting output audio state to stop")
+			self._audioDucker.disable()
 
 	def pause(self, switch: bool):
 		# SAPI5's default means of pausing in most cases is either extremely slow
 		# (e.g. takes more than half a second) or does not work at all.
 		# Therefore instruct the underlying audio interface to pause instead.
 		if self.ttsAudioStream:
-			self.ttsAudioStream.setState(SPAudioState.PAUSE if switch else SPAudioState.RUN, 0)
+			oldState = self.ttsAudioStream.GetStatus().State
+			if switch and oldState == SPAudioState.RUN:
+				# pausing
+				if self._audioDucker:
+					if audioDucking._isDebug():
+						log.debug("Disabling audio ducking due to setting output audio state to pause")
+					self._audioDucker.disable()
+				self.ttsAudioStream.setState(SPAudioState.PAUSE, 0)
+			elif not switch and oldState == SPAudioState.PAUSE:
+				# unpausing
+				if self._audioDucker:
+					if audioDucking._isDebug():
+						log.debug("Enabling audio ducking due to setting output audio state to run")
+					self._audioDucker.enable()
+				self.ttsAudioStream.setState(SPAudioState.RUN, 0)
diff --git a/user_docs/en/changes.t2t b/user_docs/en/changes.t2t
@@ -39,6 +39,7 @@ What's New in NVDA
 - MS word with UIA: heading quick nav in browse mode no longer gets stuck on the final heading of a document, nor is this heading shown twice in the NVDA elements list. (#9540)
 - In Windows 8 and later , the File Explorer status bar can now be retrieved using the standard gesture NVDA+end (desktop) / NVDA+shift+end (laptop). (#12845)
 - Incoming messages in the chat of Skype for Business are reported again. (#9295)
+- NVDA can again duck audio when using the SAPI5 synthesizer on Windows 11. (#12913)
 -
 
 == Changes for Developers ==

Original file line number	Diff line number	Diff line change
`@@ -39,6 +39,7 @@ What's New in NVDA`
`39`	`39`	`- MS word with UIA: heading quick nav in browse mode no longer gets stuck on the final heading of a document, nor is this heading shown twice in the NVDA elements list. (#9540)`
`40`	`40`	`- In Windows 8 and later , the File Explorer status bar can now be retrieved using the standard gesture NVDA+end (desktop) / NVDA+shift+end (laptop). (#12845)`
`41`	`41`	`- Incoming messages in the chat of Skype for Business are reported again. (#9295)`
	`42`	`+- NVDA can again duck audio when using the SAPI5 synthesizer on Windows 11. (#12913)`
`42`	`43`	`-`
`43`	`44`
`44`	`45`	`== Changes for Developers ==`