WIP - To bytestream by willwade · Pull Request #333 · nateshmbhat/pyttsx3 · GitHub

WIP - To bytestream #333

Open · willwade wants to merge 2 commits into master
Changes from all commits
10 changes: 10 additions & 0 deletions pyttsx3/driver.py
@@ -224,3 +224,13 @@ def iterate(self):
next(self._iterator)
except StopIteration:
pass

def bytestream(self, text, byte_stream, name=None):
"""
Capture the spoken text as a byte stream instead of playing it aloud.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
:param name: An optional name for the utterance
"""
self._push(self._driver.to_bytestream, (text, byte_stream), name)
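Because `bytestream` goes through `self._push`, the driver-level call is only executed once the engine loop runs (for example inside `runAndWait()`), just like `say`. The only contract a driver has to meet is a `to_bytestream(text, byte_stream)` method that writes its audio bytes into the supplied buffer. A minimal sketch of that contract (the `DummyDriver` name and the silence payload are illustrative, not part of this PR):

from io import BytesIO

class DummyDriver:
    def to_bytestream(self, text, byte_stream):
        # A real driver synthesizes `text`; this stand-in just writes
        # one second of 16-bit mono silence at 22050 Hz.
        byte_stream.write(b'\x00\x00' * 22050)

buf = BytesIO()
DummyDriver().to_bytestream("Hello World!", buf)
assert len(buf.getvalue()) == 44100  # 22050 samples * 2 bytes each
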
120 changes: 79 additions & 41 deletions pyttsx3/drivers/espeak.py
@@ -36,7 +36,7 @@ def __init__(self, proxy):
        self._looping = False
        self._stopping = False
        self._speaking = False
-        self._text_to_say = None
+        self._current_text = None
        self._data_buffer = b''
        self._numerise_buffer = []
@@ -138,57 +138,62 @@ def _start_synthesis(self, text):
            self._proxy.notify('error', exception=e)
            raise

    def _onSynth(self, wav, numsamples, events):
        i = 0
        while True:
            event = events[i]
            if event.type == _espeak.EVENT_LIST_TERMINATED:
                break
            if event.type == _espeak.EVENT_WORD:

-                if self._text_to_say:
-                    start_index = event.text_position-1
+                if self._current_text:
+                    start_index = event.text_position - 1
                    end_index = start_index + event.length
-                    word = self._text_to_say[start_index:end_index]
+                    word = self._current_text[start_index:end_index]
                else:
                    word = "Unknown"

                self._proxy.notify('started-word', name=word, location=event.text_position, length=event.length)

            elif event.type == _espeak.EVENT_END:
-                stream = NamedTemporaryFile(delete=False, suffix='.wav')
-
-                try:
-                    with wave.open(stream, 'wb') as f:
-                        f.setnchannels(1)
-                        f.setsampwidth(2)
-                        f.setframerate(22050.0)
-                        f.writeframes(self._data_buffer)
-                    self._data_buffer = b''
-
-                    if event.user_data:
-                        os.system(f'ffmpeg -y -i {stream.name} {self.decode_numeric(event.user_data)} -loglevel quiet')
-                    else:
-                        if platform.system() == 'Darwin':  # macOS
-                            try:
-                                result = subprocess.run(['afplay', stream.name], check=True, capture_output=True, text=True)
-                            except subprocess.CalledProcessError as e:
-                                raise RuntimeError(f"[EspeakDriver._onSynth] Mac afplay failed with error: {e}")
-                        elif platform.system() == 'Linux':
-                            os.system(f'aplay {stream.name} -q')
-                        elif platform.system() == 'Windows':
-                            winsound.PlaySound(stream.name, winsound.SND_FILENAME)  # Blocking playback
-
-                except Exception as e:
-                    raise RuntimeError(f"Error during playback: {e}")
-
-                finally:
-                    try:
-                        stream.close()  # Ensure the file is closed
-                        os.remove(stream.name)
-                    except Exception as e:
-                        raise RuntimeError(f"Error deleting temporary WAV file: {e}")
+                if hasattr(self, '_byte_stream'):
+                    # If using the byte stream method, stop speaking
+                    self._speaking = False
+                else:
+                    # File-based playback and processing
+                    stream = NamedTemporaryFile(delete=False, suffix='.wav')
+
+                    try:
+                        with wave.open(stream, 'wb') as f:
+                            f.setnchannels(1)
+                            f.setsampwidth(2)
+                            f.setframerate(22050.0)
+                            f.writeframes(self._data_buffer)
+                        self._data_buffer = b''
+
+                        if event.user_data:
+                            # Use ffmpeg to convert the file if user_data exists
+                            os.system(f'ffmpeg -y -i {stream.name} {self.decode_numeric(event.user_data)} -loglevel quiet')
+                        else:
+                            # Platform-specific playback
+                            if platform.system() == 'Darwin':  # macOS
+                                try:
+                                    subprocess.run(['afplay', stream.name], check=True, capture_output=True, text=True)
+                                except subprocess.CalledProcessError as e:
+                                    raise RuntimeError(f"[EspeakDriver._onSynth] Mac afplay failed with error: {e}")
+                            elif platform.system() == 'Linux':
+                                os.system(f'aplay {stream.name} -q')
+                            elif platform.system() == 'Windows':
+                                winsound.PlaySound(stream.name, winsound.SND_FILENAME)  # Blocking playback
+
+                    except Exception as e:
+                        raise RuntimeError(f"Error during playback: {e}")
+
+                    finally:
+                        try:
+                            stream.close()  # Ensure the file is closed
+                            os.remove(stream.name)
+                        except Exception as e:
+                            raise RuntimeError(f"Error deleting temporary WAV file: {e}")

                self._proxy.notify('finished-utterance', completed=True)
                self._proxy.setBusy(False)
@@ -198,7 +203,9 @@ def _onSynth(self, wav, numsamples, events):
i += 1

if numsamples > 0:
# Append the audio data (PCM samples) to the buffer for both methods
self._data_buffer += ctypes.string_at(wav, numsamples * ctypes.sizeof(ctypes.c_short))

return 0


@@ -214,8 +221,8 @@ def startLoop(self):
            if first:
                self._proxy.setBusy(False)
                first = False
-            if self._text_to_say:
-                self._start_synthesis(self._text_to_say)
+            if self._current_text:
+                self._start_synthesis(self._current_text)
            self.iterate()
            time.sleep(0.01)

@@ -230,4 +237,35 @@ def iterate(self):
            self.endLoop()

    def say(self, text):
-        self._text_to_say = text
+        self._current_text = text

def to_bytestream(self, text, byte_stream):
"""
Capture the spoken text as a byte stream using espeak.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
"""
self._byte_stream = byte_stream
self._data_buffer = b'' # Clear the data buffer before starting
self._proxy.setBusy(True)
self._proxy.notify('started-utterance')
self._speaking = True

# Store the text to be spoken
self._current_text = text

# Set up the synthesis process and capture audio data in real-time
try:
_espeak.Synth(toUtf8(text), flags=_espeak.ENDPAUSE | _espeak.CHARS_UTF8)
self.startLoop()
except Exception as e:
self._proxy.setBusy(False)
self._proxy.notify('error', exception=e)
raise
finally:
# Write the captured data buffer to the provided BytesIO object
byte_stream.write(self._data_buffer)
del self._byte_stream
self._proxy.notify('finished-utterance', completed=True)
self._proxy.setBusy(False)
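Note that on this code path the bytes written to `byte_stream` are the raw PCM samples accumulated in `self._data_buffer`; there is no WAV header. If a playable file is wanted, the caller can wrap them with the `wave` module using the same parameters this driver uses for file playback (16-bit mono at 22050 Hz). A small sketch, assuming the buffer really came from this eSpeak driver:

import wave
from io import BytesIO

def pcm_to_wav(pcm_bytes, path, rate=22050, channels=1, sample_width=2):
    # Wrap header-less PCM (as produced by EspeakDriver.to_bytestream)
    # in a standard WAV container so normal players can open it.
    with wave.open(path, 'wb') as f:
        f.setnchannels(channels)
        f.setsampwidth(sample_width)
        f.setframerate(rate)
        f.writeframes(pcm_bytes)

captured = BytesIO()
# ... engine.bytestream("Hello World!", captured); engine.runAndWait() ...
pcm_to_wav(captured.getvalue(), 'hello.wav')
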
47 changes: 47 additions & 0 deletions pyttsx3/drivers/nsss.py
@@ -1,5 +1,7 @@
# noinspection PyUnresolvedReferences
import objc
import AVFoundation
from io import BytesIO
from AppKit import NSSpeechSynthesizer
from Foundation import *
from PyObjCTools import AppHelper
@@ -38,6 +40,10 @@ def __init__(self):
self._tts = None
self._completed = False
self._current_text = ''
self._audio_engine = AVAudioEngine.alloc().init()
self._audio_format = AVAudioFormat.alloc().initStandardFormatWithSampleRate_channels_(44100, 1)
self._buffer = BytesIO()


@objc.python_method
def initWithProxy(self, proxy):
@@ -163,3 +169,44 @@ def speechSynthesizer_willSpeakWord_ofString_(self, tts, rng, text):

self._proxy.notify('started-word', name=current_word, location=rng.location,
length=rng.length)

def to_bytestream(self, text, byte_stream):
"""
Capture the spoken text as a byte stream using NSSpeechSynthesizer and AVAudioEngine.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
"""
self._tts.setDelegate_(self)

# Set up AVAudioEngine
main_mixer = self._audio_engine.mainMixerNode()
bus = 0

# Define the buffer format
format = self._audio_engine.outputNode().inputFormatForBus_(bus)

def capture_handler(buffer, when):
"""Capture audio data as bytes in real-time."""
# Get the audio buffer and extract its data
audio_data = buffer.audioBufferList().mBuffers[0].mData
audio_data_bytes = objc.objc_object_as_bytes(audio_data, buffer.frameLength)
byte_stream.write(audio_data_bytes)

# Tap the main mixer node to capture audio
main_mixer.installTapOnBus_bufferSize_format_block_(bus, 1024, format, capture_handler)

# Start the audio engine
self._audio_engine.prepare()
self._audio_engine.startAndReturnError_(None)

# Start speaking the text
self._tts.startSpeakingString_(text)

# Wait for speech to finish
while self._tts.isSpeaking():
pass

# Stop the audio engine
self._audio_engine.stop()
main_mixer.removeTapOnBus_(bus)
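One thing to be aware of: an AVAudioEngine tap delivers buffers in the node's own format, and the "standard" format requested in __init__ above is non-interleaved 32-bit float rather than the 16-bit integer PCM the eSpeak and SAPI5 drivers produce. If downstream code expects 16-bit samples, the captured bytes would need converting first. A rough sketch of that conversion with numpy, assuming the captured data really is mono float32:

import numpy as np

def float32_to_int16(raw_bytes):
    # Convert float32 PCM in [-1.0, 1.0] (AVAudioEngine's standard format)
    # to the 16-bit signed PCM used by the other drivers in this PR.
    samples = np.frombuffer(raw_bytes, dtype=np.float32)
    samples = np.clip(samples, -1.0, 1.0)
    return (samples * 32767).astype(np.int16).tobytes()
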
26 changes: 26 additions & 0 deletions pyttsx3/drivers/sapi5.py
@@ -153,6 +153,31 @@ def iterate(self):
while 1:
pythoncom.PumpWaitingMessages()
yield

def to_bytestream(self, text, byte_stream):
"""
Capture the spoken text as a byte stream in SAPI5.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
"""
# Set up the memory stream
stream = comtypes.client.CreateObject('SAPI.SpMemoryStream')
stream.Format.Type = SpeechLib.SAFT16kHz16BitMono # Set appropriate format
temp_stream = self._tts.AudioOutputStream

# Set the TTS output to the memory stream
self._tts.AudioOutputStream = stream
self._current_text = text # Set the current text for word events
self._tts.Speak(fromUtf8(toUtf8(text)))

# Capture the audio data from the memory stream
data = stream.GetData()
byte_stream.write(bytes(data)) # Write the byte data to the provided byte_stream

# Restore the original output stream
self._tts.AudioOutputStream = temp_stream
stream.Close()
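An exception from `Speak` would propagate before `AudioOutputStream` is restored, leaving the synthesizer pointed at the memory stream on later calls. A sketch of a slightly more defensive ordering of the same calls (nothing new beyond a try/finally):

def to_bytestream(self, text, byte_stream):
    stream = comtypes.client.CreateObject('SAPI.SpMemoryStream')
    stream.Format.Type = SpeechLib.SAFT16kHz16BitMono
    previous_output = self._tts.AudioOutputStream
    self._tts.AudioOutputStream = stream
    self._current_text = text
    try:
        self._tts.Speak(fromUtf8(toUtf8(text)))
        byte_stream.write(bytes(stream.GetData()))
    finally:
        # Always put the original output stream back, even if Speak fails.
        self._tts.AudioOutputStream = previous_output
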


# noinspection PyPep8Naming,PyProtectedMember,PyUnusedLocal,PyShadowingNames
@@ -185,3 +210,4 @@ def _ISpeechVoiceEvents_Word(self, stream_number, stream_position, char, length)

self._driver._proxy.notify(
'started-word', name=current_word, location=char, length=length)

10 changes: 10 additions & 0 deletions pyttsx3/engine.py
@@ -218,3 +218,13 @@ def iterate(self):
elif self._driverLoop:
raise RuntimeError('iterate not valid in driver run loop')
self.proxy.iterate()

def bytestream(self, text, byte_stream, name=None):
"""
Capture the spoken text as a byte stream instead of playing it aloud.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
:param name: An optional name for the utterance
"""
self.proxy.bytestream(text, byte_stream, name)
19 changes: 19 additions & 0 deletions test.py
@@ -0,0 +1,19 @@
import pyttsx3
from io import BytesIO

engine = pyttsx3.init()

# Create a BytesIO object to store the audio data
byte_stream = BytesIO()

# Capture the text as a byte stream
engine.bytestream("Hello World!", byte_stream)
engine.bytestream("This is another example.", byte_stream)
engine.runAndWait()
print(byte_stream.getvalue())

# Now you have the byte_stream containing the audio data, which you can save or process
with open("output.wav", "wb") as f:
f.write(byte_stream.getvalue())

engine.stop()
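Since both calls above share one BytesIO, the two utterances end up concatenated in a single buffer. If they need to stay separate, each call can be given its own stream; a sketch using only the API added in this PR:

import pyttsx3
from io import BytesIO

engine = pyttsx3.init()

# One buffer per utterance keeps the captured audio separable.
first, second = BytesIO(), BytesIO()
engine.bytestream("Hello World!", first)
engine.bytestream("This is another example.", second)
engine.runAndWait()

print(len(first.getvalue()), len(second.getvalue()))
engine.stop()
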