TTSService: do not push LLMFullResponseEndFrame if not needed by aconchillo · Pull Request #1681 · pipecat-ai/pipecat · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

TTSService: do not push LLMFullResponseEndFrame if not needed #1681

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- Fixed a TTS services issue that could cause assistant output not to be
aggregated to the context when also using `TTSSpeakFrame`s.

- Fixed an issue where the `SmartTurnMetricsData` was reporting 0ms for
inference and processing time when using the `FalSmartTurnAnalyzer`.

Expand Down
4 changes: 1 addition & 3 deletions src/pipecat/services/cartesia/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,7 @@ async def _receive_messages(self):
continue
if msg["type"] == "done":
await self.stop_ttfb_metrics()
await self.add_word_timestamps(
[("TTSStoppedFrame", 0), ("LLMFullResponseEndFrame", 0), ("Reset", 0)]
)
await self.add_word_timestamps([("TTSStoppedFrame", 0), ("Reset", 0)])
await self.remove_audio_context(msg["context_id"])
elif msg["type"] == "timestamps":
await self.add_word_timestamps(
Expand Down
4 changes: 2 additions & 2 deletions src/pipecat/services/elevenlabs/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
self._started = False
if isinstance(frame, TTSStoppedFrame):
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
await self.add_word_timestamps([("Reset", 0)])

async def _connect(self):
await self._connect_websocket()
Expand Down Expand Up @@ -526,7 +526,7 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect
self._reset_state()

if isinstance(frame, TTSStoppedFrame):
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
await self.add_word_timestamps([("Reset", 0)])

elif isinstance(frame, LLMFullResponseEndFrame):
# End of turn - reset previous text
Expand Down
2 changes: 1 addition & 1 deletion src/pipecat/services/rime/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ async def push_frame(self, frame: Frame, direction: FrameDirection = FrameDirect
await super().push_frame(frame, direction)
if isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
if isinstance(frame, TTSStoppedFrame):
await self.add_word_timestamps([("LLMFullResponseEndFrame", 0), ("Reset", 0)])
await self.add_word_timestamps([("Reset", 0)])

async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
"""Generate speech from text.
Expand Down
16 changes: 11 additions & 5 deletions src/pipecat/services/tts_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Frame,
InterimTranscriptionFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
StartFrame,
StartInterruptionFrame,
TextFrame,
Expand Down Expand Up @@ -308,6 +309,7 @@ def __init__(self, **kwargs):
self._initial_word_timestamp = -1
self._words_queue = asyncio.Queue()
self._words_task = None
self._llm_response_started: bool = False

def start_word_timestamps(self):
if self._initial_word_timestamp == -1:
Expand Down Expand Up @@ -335,11 +337,14 @@ async def cancel(self, frame: CancelFrame):
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)

if isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
if isinstance(frame, LLMFullResponseStartFrame):
self._llm_response_started = True
elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
await self.flush_audio()

async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
await super()._handle_interruption(frame, direction)
self._llm_response_started = False
self.reset_word_timestamps()

def _create_words_task(self):
Expand All @@ -354,13 +359,14 @@ async def _stop_words_task(self):
async def _words_task_handler(self):
last_pts = 0
while True:
frame = None
(word, timestamp) = await self._words_queue.get()
if word == "Reset" and timestamp == 0:
self.reset_word_timestamps()
frame = None
elif word == "LLMFullResponseEndFrame" and timestamp == 0:
frame = LLMFullResponseEndFrame()
frame.pts = last_pts
if self._llm_response_started:
self._llm_response_started = False
frame = LLMFullResponseEndFrame()
frame.pts = last_pts
elif word == "TTSStoppedFrame" and timestamp == 0:
frame = TTSStoppedFrame()
frame.pts = last_pts
Expand Down