WIP - To bytestream by willwade · Pull Request #333 · nateshmbhat/pyttsx3 · GitHub

WIP - To bytestream #333

Open · willwade wants to merge 2 commits into master
Changes from all commits
10 changes: 10 additions & 0 deletions pyttsx3/driver.py
@@ -224,3 +224,13 @@ def iterate(self):
next(self._iterator)
except StopIteration:
pass

def bytestream(self, text, byte_stream, name=None):
"""
Capture the spoken text as a byte stream instead of playing it aloud.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
:param name: An optional name for the utterance
"""
self._push(self._driver.to_bytestream, (text, byte_stream), name)
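Because `bytestream` goes through `self._push`, the driver-level call is only executed once the engine loop runs (for example inside `runAndWait()`), just like `say`. The only contract a driver has to meet is a `to_bytestream(text, byte_stream)` method that writes its audio bytes into the supplied buffer. A minimal sketch of that contract (the `DummyDriver` name and the silence payload are illustrative, not part of this PR):

from io import BytesIO

class DummyDriver:
    def to_bytestream(self, text, byte_stream):
        # A real driver synthesizes `text`; this stand-in just writes
        # one second of 16-bit mono silence at 22050 Hz.
        byte_stream.write(b'\x00\x00' * 22050)

buf = BytesIO()
DummyDriver().to_bytestream("Hello World!", buf)
assert len(buf.getvalue()) == 44100  # 22050 samples * 2 bytes each
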
120 changes: 79 additions & 41 deletions pyttsx3/drivers/espeak.py
@@ -36,7 +36,7 @@ def __init__(self, proxy):
        self._looping = False
        self._stopping = False
        self._speaking = False
-        self._text_to_say = None
+        self._current_text = None
        self._data_buffer = b''
        self._numerise_buffer = []
@@ -138,57 +138,62 @@ def _start_synthesis(self, text):
            self._proxy.notify('error', exception=e)
            raise

    def _onSynth(self, wav, numsamples, events):
        i = 0
        while True:
            event = events[i]
            if event.type == _espeak.EVENT_LIST_TERMINATED:
                break
            if event.type == _espeak.EVENT_WORD:

-                if self._text_to_say:
-                    start_index = event.text_position-1
+                if self._current_text:
+                    start_index = event.text_position - 1
                    end_index = start_index + event.length
-                    word = self._text_to_say[start_index:end_index]
+                    word = self._current_text[start_index:end_index]
                else:
                    word = "Unknown"

                self._proxy.notify('started-word', name=word, location=event.text_position, length=event.length)

            elif event.type == _espeak.EVENT_END:
-                stream = NamedTemporaryFile(delete=False, suffix='.wav')
-
-                try:
-                    with wave.open(stream, 'wb') as f:
-                        f.setnchannels(1)
-                        f.setsampwidth(2)
-                        f.setframerate(22050.0)
-                        f.writeframes(self._data_buffer)
-                    self._data_buffer = b''
-
-                    if event.user_data:
-                        os.system(f'ffmpeg -y -i {stream.name} {self.decode_numeric(event.user_data)} -loglevel quiet')
-                    else:
-                        if platform.system() == 'Darwin':  # macOS
-                            try:
-                                result = subprocess.run(['afplay', stream.name], check=True, capture_output=True, text=True)
-                            except subprocess.CalledProcessError as e:
-                                raise RuntimeError(f"[EspeakDriver._onSynth] Mac afplay failed with error: {e}")
-                        elif platform.system() == 'Linux':
-                            os.system(f'aplay {stream.name} -q')
-                        elif platform.system() == 'Windows':
-                            winsound.PlaySound(stream.name, winsound.SND_FILENAME)  # Blocking playback
-
-                except Exception as e:
-                    raise RuntimeError(f"Error during playback: {e}")
-
-                finally:
-                    try:
-                        stream.close()  # Ensure the file is closed
-                        os.remove(stream.name)
-                    except Exception as e:
-                        raise RuntimeError(f"Error deleting temporary WAV file: {e}")
+                if hasattr(self, '_byte_stream'):
+                    # If using the byte stream method, stop speaking
+                    self._speaking = False
+                else:
+                    # File-based playback and processing
+                    stream = NamedTemporaryFile(delete=False, suffix='.wav')
+
+                    try:
+                        with wave.open(stream, 'wb') as f:
+                            f.setnchannels(1)
+                            f.setsampwidth(2)
+                            f.setframerate(22050.0)
+                            f.writeframes(self._data_buffer)
+                        self._data_buffer = b''
+
+                        if event.user_data:
+                            # Use ffmpeg to convert the file if user_data exists
+                            os.system(f'ffmpeg -y -i {stream.name} {self.decode_numeric(event.user_data)} -loglevel quiet')
+                        else:
+                            # Platform-specific playback
+                            if platform.system() == 'Darwin':  # macOS
+                                try:
+                                    subprocess.run(['afplay', stream.name], check=True, capture_output=True, text=True)
+                                except subprocess.CalledProcessError as e:
+                                    raise RuntimeError(f"[EspeakDriver._onSynth] Mac afplay failed with error: {e}")
+                            elif platform.system() == 'Linux':
+                                os.system(f'aplay {stream.name} -q')
+                            elif platform.system() == 'Windows':
+                                winsound.PlaySound(stream.name, winsound.SND_FILENAME)  # Blocking playback
+
+                    except Exception as e:
+                        raise RuntimeError(f"Error during playback: {e}")
+
+                    finally:
+                        try:
+                            stream.close()  # Ensure the file is closed
+                            os.remove(stream.name)
+                        except Exception as e:
+                            raise RuntimeError(f"Error deleting temporary WAV file: {e}")

                self._proxy.notify('finished-utterance', completed=True)
                self._proxy.setBusy(False)
@@ -198,7 +203,9 @@ def _onSynth(self, wav, numsamples, events):
i += 1

if numsamples > 0:
# Append the audio data (PCM samples) to the buffer for both methods
self._data_buffer += ctypes.string_at(wav, numsamples * ctypes.sizeof(ctypes.c_short))

return 0


@@ -214,8 +221,8 @@ def startLoop(self):
            if first:
                self._proxy.setBusy(False)
                first = False
-            if self._text_to_say:
-                self._start_synthesis(self._text_to_say)
+            if self._current_text:
+                self._start_synthesis(self._current_text)
            self.iterate()
            time.sleep(0.01)

@@ -230,4 +237,35 @@ def iterate(self):
            self.endLoop()

    def say(self, text):
-        self._text_to_say = text
+        self._current_text = text

def to_bytestream(self, text, byte_stream):
"""
Capture the spoken text as a byte stream using espeak.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
"""
self._byte_stream = byte_stream
self._data_buffer = b'' # Clear the data buffer before starting
self._proxy.setBusy(True)
self._proxy.notify('started-utterance')
self._speaking = True

# Store the text to be spoken
self._current_text = text

# Set up the synthesis process and capture audio data in real-time
try:
_espeak.Synth(toUtf8(text), flags=_espeak.ENDPAUSE | _espeak.CHARS_UTF8)
self.startLoop()
except Exception as e:
self._proxy.setBusy(False)
self._proxy.notify('error', exception=e)
raise
finally:
# Write the captured data buffer to the provided BytesIO object
byte_stream.write(self._data_buffer)
del self._byte_stream
self._proxy.notify('finished-utterance', completed=True)
self._proxy.setBusy(False)
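Note that on this code path the bytes written to `byte_stream` are the raw PCM samples accumulated in `self._data_buffer`; there is no WAV header. If a playable file is wanted, the caller can wrap them with the `wave` module using the same parameters this driver uses for file playback (16-bit mono at 22050 Hz). A small sketch, assuming the buffer really came from this eSpeak driver:

import wave
from io import BytesIO

def pcm_to_wav(pcm_bytes, path, rate=22050, channels=1, sample_width=2):
    # Wrap header-less PCM (as produced by EspeakDriver.to_bytestream)
    # in a standard WAV container so normal players can open it.
    with wave.open(path, 'wb') as f:
        f.setnchannels(channels)
        f.setsampwidth(sample_width)
        f.setframerate(rate)
        f.writeframes(pcm_bytes)

captured = BytesIO()
# ... engine.bytestream("Hello World!", captured); engine.runAndWait() ...
pcm_to_wav(captured.getvalue(), 'hello.wav')
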
47 changes: 47 additions & 0 deletions pyttsx3/drivers/nsss.py
@@ -1,5 +1,7 @@
# noinspection PyUnresolvedReferences
import objc
import AVFoundation
from io import BytesIO
from AppKit import NSSpeechSynthesizer
from Foundation import *
from PyObjCTools import AppHelper
@@ -38,6 +40,10 @@ def __init__(self):
self._tts = None
self._completed = False
self._current_text = ''
self._audio_engine = AVAudioEngine.alloc().init()
self._audio_format = AVAudioFormat.alloc().initStandardFormatWithSampleRate_channels_(44100, 1)
self._buffer = BytesIO()


@objc.python_method
def initWithProxy(self, proxy):
@@ -163,3 +169,44 @@ def speechSynthesizer_willSpeakWord_ofString_(self, tts, rng, text):

self._proxy.notify('started-word', name=current_word, location=rng.location,
length=rng.length)

def to_bytestream(self, text, byte_stream):
"""
Capture the spoken text as a byte stream using NSSpeechSynthesizer and AVAudioEngine.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
"""
self._tts.setDelegate_(self)

# Set up AVAudioEngine
main_mixer = self._audio_engine.mainMixerNode()
bus = 0

# Define the buffer format
format = self._audio_engine.outputNode().inputFormatForBus_(bus)

def capture_handler(buffer, when):
"""Capture audio data as bytes in real-time."""
# Get the audio buffer and extract its data
audio_data = buffer.audioBufferList().mBuffers[0].mData
audio_data_bytes = objc.objc_object_as_bytes(audio_data, buffer.frameLength)
byte_stream.write(audio_data_bytes)

# Tap the main mixer node to capture audio
main_mixer.installTapOnBus_bufferSize_format_block_(bus, 1024, format, capture_handler)

# Start the audio engine
self._audio_engine.prepare()
self._audio_engine.startAndReturnError_(None)

# Start speaking the text
self._tts.startSpeakingString_(text)

# Wait for speech to finish
while self._tts.isSpeaking():
pass

# Stop the audio engine
self._audio_engine.stop()
main_mixer.removeTapOnBus_(bus)
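One thing to be aware of: an AVAudioEngine tap delivers buffers in the node's own format, and the "standard" format requested in __init__ above is non-interleaved 32-bit float rather than the 16-bit integer PCM the eSpeak and SAPI5 drivers produce. If downstream code expects 16-bit samples, the captured bytes would need converting first. A rough sketch of that conversion with numpy, assuming the captured data really is mono float32:

import numpy as np

def float32_to_int16(raw_bytes):
    # Convert float32 PCM in [-1.0, 1.0] (AVAudioEngine's standard format)
    # to the 16-bit signed PCM used by the other drivers in this PR.
    samples = np.frombuffer(raw_bytes, dtype=np.float32)
    samples = np.clip(samples, -1.0, 1.0)
    return (samples * 32767).astype(np.int16).tobytes()
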
26 changes: 26 additions & 0 deletions pyttsx3/drivers/sapi5.py
@@ -153,6 +153,31 @@ def iterate(self):
while 1:
pythoncom.PumpWaitingMessages()
yield

def to_bytestream(self, text, byte_stream):
"""
Capture the spoken text as a byte stream in SAPI5.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
"""
# Set up the memory stream
stream = comtypes.client.CreateObject('SAPI.SpMemoryStream')
stream.Format.Type = SpeechLib.SAFT16kHz16BitMono # Set appropriate format
temp_stream = self._tts.AudioOutputStream

# Set the TTS output to the memory stream
self._tts.AudioOutputStream = stream
self._current_text = text # Set the current text for word events
self._tts.Speak(fromUtf8(toUtf8(text)))

# Capture the audio data from the memory stream
data = stream.GetData()
byte_stream.write(bytes(data)) # Write the byte data to the provided byte_stream

# Restore the original output stream
self._tts.AudioOutputStream = temp_stream
stream.Close()
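An exception from `Speak` would propagate before `AudioOutputStream` is restored, leaving the synthesizer pointed at the memory stream on later calls. A sketch of a slightly more defensive ordering of the same calls (nothing new beyond a try/finally):

def to_bytestream(self, text, byte_stream):
    stream = comtypes.client.CreateObject('SAPI.SpMemoryStream')
    stream.Format.Type = SpeechLib.SAFT16kHz16BitMono
    previous_output = self._tts.AudioOutputStream
    self._tts.AudioOutputStream = stream
    self._current_text = text
    try:
        self._tts.Speak(fromUtf8(toUtf8(text)))
        byte_stream.write(bytes(stream.GetData()))
    finally:
        # Always put the original output stream back, even if Speak fails.
        self._tts.AudioOutputStream = previous_output
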


# noinspection PyPep8Naming,PyProtectedMember,PyUnusedLocal,PyShadowingNames
@@ -185,3 +210,4 @@ def _ISpeechVoiceEvents_Word(self, stream_number, stream_position, char, length)

self._driver._proxy.notify(
'started-word', name=current_word, location=char, length=length)

10 changes: 10 additions & 0 deletions pyttsx3/engine.py
@@ -218,3 +218,13 @@ def iterate(self):
elif self._driverLoop:
raise RuntimeError('iterate not valid in driver run loop')
self.proxy.iterate()

def bytestream(self, text, byte_stream, name=None):
"""
Capture the spoken text as a byte stream instead of playing it aloud.

:param text: The text to speak
:param byte_stream: The BytesIO object to store the byte stream
:param name: An optional name for the utterance
"""
self.proxy.bytestream(text, byte_stream, name)
19 changes: 19 additions & 0 deletions test.py
@@ -0,0 +1,19 @@
import pyttsx3
from io import BytesIO

engine = pyttsx3.init()

# Create a BytesIO object to store the audio data
byte_stream = BytesIO()

# Capture the text as a byte stream
engine.bytestream("Hello World!", byte_stream)
engine.bytestream("This is another example.", byte_stream)
engine.runAndWait()
print(byte_stream.getvalue())

# Now you have the byte_stream containing the audio data, which you can save or process
with open("output.wav", "wb") as f:
f.write(byte_stream.getvalue())

engine.stop()
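Since both calls above share one BytesIO, the two utterances end up concatenated in a single buffer. If they need to stay separate, each call can be given its own stream; a sketch using only the API added in this PR:

import pyttsx3
from io import BytesIO

engine = pyttsx3.init()

# One buffer per utterance keeps the captured audio separable.
first, second = BytesIO(), BytesIO()
engine.bytestream("Hello World!", first)
engine.bytestream("This is another example.", second)
engine.runAndWait()

print(len(first.getvalue()), len(second.getvalue()))
engine.stop()
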