#!/usr/bin/env python # -*- coding: utf-8 -*- # (c...

Creado el: 12 de julio de 2025

Respondido usando O3 por Chat01

Pregunta

#!/usr/bin/env python

-- coding: utf-8 --

(c) 2025 iwyxdxl – GPL‑3.0+

"""
Pipecat低延迟语音管道 - 使用LocalAudioTransport实现真正的流式低延迟对话

  • 核心Pipeline: STT → LLM → TTS 流式处理 (20ms帧级别)
  • 音频I/O: LocalAudioTransport + Windows音频回环
  • 服务: Fish Audio STT + OpenAI兼容LLM + Fish Audio/MiniMax TTS
  • 延迟: 首包120-180ms,全程流式处理
  • 情感功能: 自动启用,无需配置
    """

import os
import asyncio
import logging
import threading
import time
import tempfile
import win32gui
import win32con
from datetime import datetime
from typing import Optional, Callable, Dict, Any, List
import numpy as np

自动启用情感功能 - 一键式,无需用户配置

try:
from auto_emotion import init_auto_emotion
init_auto_emotion() # 程序启动时自动初始化情感系统
except:
pass # 静默失败,不影响主程序

Local imports

from emotion_control import (
EmotionMapper,
extract_emotion_from_ai_response,
TTSProvider
)

Pipecat core imports (适配新版本 ≥ 0.7)

try:
# 核心Pipeline (必须)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineTask # ✅ 0.7.x 必需
from pipecat.transports.local.audio import LocalAudioTransport, LocalAudioTransportParams
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.frames.frames import (
Frame, AudioRawFrame, TextFrame, LLMMessagesFrame,
TTSAudioRawFrame, UserStartedSpeakingFrame, UserStoppedSpeakingFrame
)
PIPECAT_CORE_AVAILABLE = True
except ImportError:
PIPECAT_CORE_AVAILABLE = False

Pipecat服务 (可选,取决于使用的STT/LLM/TTS服务)

try:
from pipecat.services.openai import OpenAILLMService, OpenAITTSService
PIPECAT_OPENAI_AVAILABLE = True
except ImportError:
PIPECAT_OPENAI_AVAILABLE = False

Pipecat VAD (可选,仅需要自动静音检测/打断时使用)

try:
# >=0.7.x 正确路径 (最新)
from pipecat.audio.vad.silero_vad import SileroVADAnalyzer
PIPECAT_VAD_AVAILABLE = True
except ImportError as e:
# 尝试旧路径作为备选
try:
from pipecat.audio.vad.silero import SileroVADAnalyzer
PIPECAT_VAD_AVAILABLE = True
logging.warning("Using deprecated Silero VAD import path, please upgrade Pipecat")
except ImportError:
PIPECAT_VAD_AVAILABLE = False
logging.warning(f"Silero VAD unavailable: {e}")

总体可用性检查

PIPECAT_AVAILABLE = PIPECAT_CORE_AVAILABLE

if not PIPECAT_AVAILABLE:
logging.warning("Pipecat core not available. Voice calling will be disabled.")

text
# 创建占位符类以防止ImportError class FrameProcessor: pass class Pipeline: pass class PipelineRunner: pass class PipelineTask: # ✅ 新增占位符 pass class LocalAudioTransport: pass class LocalAudioTransportParams: pass class Frame: pass class AudioRawFrame: pass class TextFrame: pass class LLMMessagesFrame: pass class TTSAudioRawFrame: pass class UserStartedSpeakingFrame: pass class UserStoppedSpeakingFrame: pass class OpenAILLMService: pass class OpenAITTSService: pass class WebRTCVADAnalyzer: pass class SileroVADAnalyzer: pass

DashScope Paraformer SDK for STT

try:
import websockets
import json
DASHSCOPE_AVAILABLE = True
except ImportError:
DASHSCOPE_AVAILABLE = False
logging.warning("websockets not available. DashScope STT will be disabled.")

Fish Audio SDK

try:
from fish_audio_sdk import Session, ASRRequest, TTSRequest
FISH_AUDIO_AVAILABLE = True
except ImportError:
FISH_AUDIO_AVAILABLE = False
logging.warning("Fish Audio SDK not available.")

Setup logging

logger = logging.getLogger("VoicePipeline")

Audio constants

RATE = 16000 # Sample rate for WebRTC VAD
CHANNELS = 1 # Mono
VAD_FRAME_MS = 20 # WebRTC VAD frame duration (20ms for low latency)

class WindowsAudioDeviceDetector:
"""Windows音频设备检测器,使用双虚拟设备防止回声"""

text
@staticmethod def detect_wechat_voice_window() -> bool: """检测微信语音通话窗口""" try: hwnd = win32gui.FindWindow("AudioWnd", "微信") if hwnd: win32gui.ShowWindow(hwnd, win32con.SW_RESTORE) logger.info("检测到微信语音通话窗口") return True return False except Exception as e: logger.error(f"检测微信窗口失败: {e}") return False @staticmethod def get_audio_device_indices(): """ 检测双虚拟音频设备配置 (防回声方案) 正确的双向音频流 (无回声): 通道A (用户声音): 用户→微信→VB-Cable A→AI程序 - 微信扬声器: CABLE A Input (微信播放对方声音) - AI程序输入: CABLE A Output (AI接收对方声音) 通道B (AI声音): AI程序→VB-Cable B→微信→用户 - AI程序输出: CABLE B Input (AI播放回复声音) - 微信麦克风: CABLE B Output (微信接收AI声音) 物理设备: - 用户麦克风: 物理麦克风 (用户说话给微信) - 用户扬声器: 物理扬声器 (用户听到AI回复) """ try: import sounddevice as sd import re devices = sd.query_devices() # 查找VB-Cable A设备 (用户声音通道: 微信→AI) cable_a_input_idx = None # 微信播放对方声音的虚拟扬声器 cable_a_output_idx = None # AI接收对方声音的虚拟麦克风 cable_a_input_name = None # 设备名称 cable_a_output_name = None # 查找VB-Cable B设备 (AI声音通道: AI→微信) cable_b_input_idx = None # AI播放回复声音的虚拟扬声器 cable_b_output_idx = None # 微信接收AI声音的虚拟麦克风 cable_b_input_name = None # 设备名称 cable_b_output_name = None # 使用多候选策略+精确正则,优先选择低通道数设备 cable_a_input_candidates = [] # 微信扬声器候选 cable_a_output_candidates = [] # AI录音输入候选 cable_b_input_candidates = [] # AI音频输出候选 cable_b_output_candidates = [] # 微信麦克风候选 for i, device in enumerate(devices): name = device['name'].lower() device_name = device['name'] # 🔧 修复1: 精确匹配Cable A设备,避免字母"a"误匹配Cable B if re.search(r'cable[\s\-]?a\s+(input|输入)', name, re.I) and device['max_output_channels'] > 0: cable_a_input_candidates.append((i, device, device_name)) elif re.search(r'cable[\s\-]?a\s+(output|输出)', name, re.I) and device['max_input_channels'] > 0: cable_a_output_candidates.append((i, device, device_name)) # 🔧 修复1: 精确匹配Cable B设备 elif re.search(r'cable[\s\-]?b\s+(input|输入)', name, re.I) and device['max_output_channels'] > 0: cable_b_input_candidates.append((i, device, device_name)) elif re.search(r'cable[\s\-]?b\s+(output|输出)', name, re.I) and device['max_input_channels'] > 0: cable_b_output_candidates.append((i, device, device_name)) # 备选: 标准VB-Cable检测 (不含A/B标识的设备) elif re.search(r'^cable.*(input|输入)', name, re.I) and not re.search(r'cable[\s\-]?[ab]', name, re.I) and device['max_output_channels'] > 0: if not cable_a_input_candidates: # 仅在没有找到Cable A时作为备选 cable_a_input_candidates.append((i, device, device_name)) elif re.search(r'^cable.*(output|输出)', name, re.I) and not re.search(r'cable[\s\-]?[ab]', name, re.I) and device['max_input_channels'] > 0: if not cable_a_output_candidates: # 仅在没有找到Cable A时作为备选 cable_a_output_candidates.append((i, device, device_name)) # 🔧 修复2: 优先选择低通道数+16kHz设备 def select_best_device(candidates, is_input_device=True): """从候选设备中选择最佳设备:优先级 ≤2ch & 16kHz > ≤2ch > 最早出现""" if not candidates: return None, None, None # 按优先级排序 def device_priority(candidate): idx, device, name = candidate channels = device['max_input_channels'] if is_input_device else device['max_output_channels'] rate = device['default_samplerate'] # 优先级计算:通道数越少越好,16kHz最佳 channel_score = 0 if channels <= 2 else 100 # ≤2ch优先 rate_score = 0 if abs(rate - 16000) < 1000 else 10 # 16kHz优先 order_score = idx # 索引越小越优先(备选) return (channel_score, rate_score, order_score) candidates.sort(key=device_priority) best_idx, best_device, best_name = candidates[0] channels = best_device['max_input_channels'] if is_input_device else best_device['max_output_channels'] rate = best_device['default_samplerate'] logger.info(f"选择最佳设备: 索引 {best_idx}, 名称: {best_name}, " f"通道: {channels}, 采样率: {rate:.0f}Hz") return best_idx, best_device, best_name # 选择最佳设备 cable_a_input_idx, _, cable_a_input_name = select_best_device(cable_a_input_candidates, False) cable_a_output_idx, _, cable_a_output_name = select_best_device(cable_a_output_candidates, True) cable_b_input_idx, _, cable_b_input_name = select_best_device(cable_b_input_candidates, False) cable_b_output_idx, _, cable_b_output_name = select_best_device(cable_b_output_candidates, True) # 检查是否找到完整的双Cable配置 if (cable_a_input_idx is not None and cable_a_output_idx is not None and cable_b_input_idx is not None and cable_b_output_idx is not None): logger.info("✅ 检测到完整的双VB-Cable配置 (防回声)") return { "ai_input_device": cable_a_output_idx, # AI从Cable A Output接收用户声音 "ai_output_device": cable_b_input_idx, # AI向Cable B Input播放回复 "wechat_speaker_device": cable_a_input_idx, # 微信扬声器: Cable A Input "wechat_mic_device": cable_b_output_idx, # 微信麦克风: Cable B Output "ai_input_device_name": cable_a_output_name, # 设备名称 "ai_output_device_name": cable_b_input_name, "wechat_speaker_device_name": cable_a_input_name, "wechat_mic_device_name": cable_b_output_name, "config_complete": True } else: logger.warning("❌ 未检测到完整的双VB-Cable配置") return { "ai_input_device": cable_a_output_idx, "ai_output_device": cable_b_input_idx, "wechat_speaker_device": cable_a_input_idx, "wechat_mic_device": cable_b_output_idx, "ai_input_device_name": cable_a_output_name, "ai_output_device_name": cable_b_input_name, "wechat_speaker_device_name": cable_a_input_name, "wechat_mic_device_name": cable_b_output_name, "config_complete": False } except Exception as e: logger.error(f"音频设备检测失败: {e}") return { "ai_input_device": None, "ai_output_device": None, "wechat_speaker_device": None, "wechat_mic_device": None, "ai_input_device_name": None, "ai_output_device_name": None, "wechat_speaker_device_name": None, "wechat_mic_device_name": None, "config_complete": False } @staticmethod def _probe_channels_for_recording(py_index: int) -> int: """ 探测录音设备的最佳通道数配置 VB-Cable策略:使用2声道避免"Invalid channels"错误,后续downmix到mono """ try: import pyaudio pa = pyaudio.PyAudio() try: info = pa.get_device_info_by_index(py_index) max_channels = int(info['maxInputChannels']) if max_channels <= 0: logger.warning(f"设备不支持录音: {info['name']}") return 1 elif max_channels == 1: logger.info(f"设备支持单声道录音: {info['name']}") return 1 else: # VB-Cable多声道设备使用2声道,避免兼容性问题 logger.info(f"设备支持{max_channels}声道,使用2声道录音: {info['name']}") return 2 finally: pa.terminate() except Exception as e: logger.warning(f"无法探测录音设备通道数: {e}") return 2 # 默认使用2声道,提高兼容性 @staticmethod def _probe_channels_for_playback(py_index: int) -> int: """ 探测播放设备的通道数配置 播放端通常使用单声道即可 """ try: import pyaudio pa = pyaudio.PyAudio() try: info = pa.get_device_info_by_index(py_index) max_channels = int(info['maxOutputChannels']) if max_channels <= 0: logger.warning(f"设备不支持播放: {info['name']}") return 1 else: # 播放端使用单声道,节省带宽 return 1 finally: pa.terminate() except Exception as e: logger.warning(f"无法探测播放设备通道数: {e}") return 1 @staticmethod def _probe_channels(py_index: int, want_input: bool) -> int: """ 探测设备的最大可用声道数(不低于1) 用于动态适配VB-Cable的声道配置 """ try: import pyaudio pa = pyaudio.PyAudio() try: info = pa.get_device_info_by_index(py_index) channels = int(info['maxInputChannels' if want_input else 'maxOutputChannels']) return channels if channels > 0 else 1 finally: pa.terminate() except Exception as e: logger.warning(f"无法探测设备通道数: {e}") return 1 # 回退到单声道 @staticmethod def get_pyaudio_device_index(device_name: str, is_input: bool = True): """ 根据设备名称获取PyAudio设备索引 解决sounddevice与PyAudio设备索引不匹配的问题 """ try: import pyaudio pa = pyaudio.PyAudio() try: logger.info(f"正在查找PyAudio设备: {device_name}, 输入设备: {is_input}") # 首先打印所有PyAudio设备用于调试 logger.info("PyAudio设备列表:") for i in range(pa.get_device_count()): try: info = pa.get_device_info_by_index(i) device_type = "🔊" if info['maxOutputChannels'] > 0 else "🎤" logger.info(f" [{i:02d}] {device_type} {info['name']} " f"(输入:{info['maxInputChannels']}, 输出:{info['maxOutputChannels']}, " f"采样率:{info['defaultSampleRate']:.0f}Hz)") except Exception as e: logger.warning(f" [{i:02d}] 无法获取设备信息: {e}") # 查找匹配的设备 best_match_idx = None best_match_score = 0 for i in range(pa.get_device_count()): try: info = pa.get_device_info_by_index(i) pa_device_name = info['name'] # 检查通道数是否匹配需求 if is_input and info['maxInputChannels'] == 0: continue # 需要输入设备但该设备不支持输入 if not is_input and info['maxOutputChannels'] == 0: continue # 需要输出设备但该设备不支持输出 # 计算名称匹配度 device_name_lower = device_name.lower() pa_device_name_lower = pa_device_name.lower() # 完全匹配 if device_name_lower == pa_device_name_lower: logger.info(f"✅ 找到完全匹配的PyAudio设备: 索引 {i}, 名称: {pa_device_name}") return i # 包含匹配 (计算匹配度) match_score = 0 if device_name_lower in pa_device_name_lower: match_score += 2 elif pa_device_name_lower in device_name_lower: match_score += 1 # 关键词匹配 keywords = ['cable', 'vb-cable', 'input', 'output', 'a', 'b'] for keyword in keywords: if keyword in device_name_lower and keyword in pa_device_name_lower: match_score += 1 if match_score > best_match_score: best_match_score = match_score best_match_idx = i except Exception as e: logger.warning(f"检查PyAudio设备 {i} 时出错: {e}") if best_match_idx is not None: info = pa.get_device_info_by_index(best_match_idx) logger.info(f"✅ 找到最佳匹配的PyAudio设备: 索引 {best_match_idx}, 名称: {info['name']}, 匹配度: {best_match_score}") # 验证设备参数 if is_input: if info['maxInputChannels'] == 0: logger.error(f"❌ 设备 {info['name']} 不支持输入") return None logger.info(f" 支持输入通道: {info['maxInputChannels']}") else: if info['maxOutputChannels'] == 0: logger.error(f"❌ 设备 {info['name']} 不支持输出") return None logger.info(f" 支持输出通道: {info['maxOutputChannels']}") logger.info(f" 默认采样率: {info['defaultSampleRate']:.0f}Hz") return best_match_idx else: logger.error(f"❌ 未找到匹配的PyAudio设备: {device_name}") return None finally: pa.terminate() except Exception as e: logger.error(f"获取PyAudio设备索引失败: {e}") return None @staticmethod def validate_pyaudio_device_with_fallback(device_index: int, is_input: bool = True, rate: int = 16000): """ 🔧 修复3: 验证PyAudio设备并返回最佳可用通道数 先测试1ch,失败则fallback到2ch """ try: import pyaudio pa = pyaudio.PyAudio() try: # 获取设备信息 info = pa.get_device_info_by_index(device_index) # 检查基本支持 max_channels = info['maxInputChannels'] if is_input else info['maxOutputChannels'] if max_channels <= 0: logger.error(f"设备 {info['name']} 不支持{'录音' if is_input else '播放'}") return None # 按优先级测试通道数:1ch → 2ch test_channels = [1, 2] if max_channels >= 2 else [1] for channels in test_channels: if channels > max_channels: continue try: if is_input: supported = pa.is_format_supported( rate=rate, input_device=device_index, input_channels=channels, input_format=pyaudio.paInt16 ) else: supported = pa.is_format_supported( rate=rate, output_device=device_index, output_channels=channels, output_format=pyaudio.paInt16 ) if supported: logger.info(f"✅ 设备 {info['name']} 支持 {channels}ch/{rate}Hz") return channels else: logger.warning(f"❌ 设备 {info['name']} 不支持 {channels}ch/{rate}Hz,尝试下一配置") except Exception as e: logger.warning(f"❌ 测试设备 {info['name']} {channels}ch 失败: {e}") # 所有配置都失败,但设备存在,使用保守配置 conservative_channels = min(2, max_channels) logger.warning(f"⚠️ 设备 {info['name']} 验证失败,使用保守配置: {conservative_channels}ch") return conservative_channels finally: pa.terminate() except Exception as e: logger.error(f"验证PyAudio设备失败: {e}") return None @staticmethod def print_device_list(): """打印详细的设备列表用于调试""" def _print_devices(): try: import sounddevice as sd devices = sd.query_devices() print("\n" + "="*80) print("🔍 系统音频设备详细列表") print("="*80) for i, device in enumerate(devices): input_ch = device['max_input_channels'] output_ch = device['max_output_channels'] name = device['name'] # 设备类型标识 if input_ch > 0 and output_ch > 0: device_type = "🎤🔊 录音+播放" elif input_ch > 0: device_type = "🎤 录音" elif output_ch > 0: device_type = "🔊 播放" else: device_type = "❓ 未知" # VB-Cable设备标识 cable_marker = "" name_lower = name.lower() if 'cable a' in name_lower: cable_marker = " 📻 [Cable A]" elif 'cable b' in name_lower: cable_marker = " 📻 [Cable B]" elif 'cable' in name_lower and 'voicemeeter' not in name_lower: cable_marker = " 📻 [Cable]" print(f" 索引 {i:2d}: {device_type} {name}{cable_marker}") print(f" 输入通道: {input_ch}, 输出通道: {output_ch}") print(f" 采样率: {device['default_samplerate']:.0f} Hz") print() print("="*80) except Exception as e: print(f"无法列出设备: {e}") # ✅ 在后台线程中执行,不阻塞UI import threading thread = threading.Thread(target=_print_devices, name="DeviceList", daemon=True) thread.start() # 返回线程引用,调用者可以选择等待 return thread @staticmethod def get_configuration_status(): """获取当前配置状态和建议""" devices = WindowsAudioDeviceDetector.get_audio_device_indices() status = { "has_cable_a": devices["ai_input_device"] is not None and devices["wechat_speaker_device"] is not None, "has_cable_b": devices["ai_output_device"] is not None and devices["wechat_mic_device"] is not None, "is_complete": devices["config_complete"], "devices": devices } # 生成配置建议 if status["is_complete"]: status["message"] = "✅ 双VB-Cable配置完整,可以启动语音通话" status["next_steps"] = [ "确认微信音频设置正确", "运行语音通话测试" ] elif status["has_cable_a"] and not status["has_cable_b"]: status["message"] = "⚠️ 只检测到Cable A,需要安装Cable B" status["next_steps"] = [ "下载并安装VB-Cable B", "重启系统后重新检测" ] elif status["has_cable_b"] and not status["has_cable_a"]: status["message"] = "⚠️ 只检测到Cable B,需要安装Cable A" status["next_steps"] = [ "下载并安装VB-Cable A", "重启系统后重新检测" ] else: status["message"] = "❌ 未检测到任何VB-Cable设备" status["next_steps"] = [ "下载并安装VB-Cable A和B", "重启系统", "运行设备检测" ] return status

class DownmixToMonoProcessor(FrameProcessor):
"""智能声道混音处理器 - 将多声道音频自动混音到单声道"""

text
def __init__(self, input_channels: int): super().__init__(name="DownmixToMono", needs_input_queue=True) self.input_channels = input_channels self.processed_frames = 0 if input_channels > 1: logger.info(f"DownmixToMono初始化: 将{input_channels}声道混音到单声道") else: logger.info(f"DownmixToMono初始化: 输入已是单声道,直接透传") async def process_frame(self, frame: Frame, direction) -> Optional[Frame]: """🔧 修复4: 处理音频帧,智能混音到单声道,1ch时直接透传""" if isinstance(frame, AudioRawFrame): self.processed_frames += 1 # 🔧 修复4: 如果输入本来就是单声道,直接透传,避免reshape错误 if self.input_channels <= 1: return frame try: audio_data = frame.audio if isinstance(audio_data, np.ndarray): # 检查数据形状 if audio_data.ndim == 1: # 一维数组,需要reshape成多声道格式 total_samples = len(audio_data) if total_samples % self.input_channels == 0: # 重塑为 [samples_per_channel, channels] 格式 samples_per_channel = total_samples // self.input_channels reshaped = audio_data.reshape(samples_per_channel, self.input_channels) # 混音到单声道 (取平均值) mono_audio = np.mean(reshaped, axis=1).astype(audio_data.dtype) # 更新帧数据 frame.audio = mono_audio # 每1000帧输出一次调试信息 if self.processed_frames % 1000 == 0: logger.debug(f"DownmixToMono: 处理了{self.processed_frames}帧, " f"从{self.input_channels}声道({total_samples}采样)混音到单声道({len(mono_audio)}采样)") else: logger.warning(f"音频数据长度({total_samples})不能被声道数({self.input_channels})整除") elif audio_data.ndim == 2: # 二维数组,假设已经是 [samples, channels] 格式 if audio_data.shape[1] == self.input_channels: mono_audio = np.mean(audio_data, axis=1).astype(audio_data.dtype) frame.audio = mono_audio else: logger.warning(f"音频数据形状{audio_data.shape}与期望的声道数{self.input_channels}不匹配") elif isinstance(audio_data, (bytes, bytearray)): # 字节数据,转换为numpy数组处理 np_data = np.frombuffer(audio_data, dtype=np.int16) total_samples = len(np_data) if total_samples % self.input_channels == 0: samples_per_channel = total_samples // self.input_channels reshaped = np_data.reshape(samples_per_channel, self.input_channels) mono_audio = np.mean(reshaped, axis=1).astype(np.int16) frame.audio = mono_audio.tobytes() else: logger.warning(f"字节音频数据长度({total_samples})不能被声道数({self.input_channels})整除") except Exception as e: logger.error(f"DownmixToMono处理出错: {e}") # 出错时直接返回原始帧 return frame return frame

class DashScopeParaformerSTTProcessor(FrameProcessor):
"""阿里云 DashScope Paraformer-Realtime-V2 流式STT处理器"""

text
def __init__(self, api_key: str, model: str = "paraformer-realtime-v2", ws_url: str = None): # ✅ 关键修复:显式声明需要输入队列 (Pipecat 0.7+) super().__init__(needs_input_queue=True) if not DASHSCOPE_AVAILABLE: raise RuntimeError("websockets not available for DashScope") self.api_key = api_key self.model = model self.ws_url = ws_url or "wss://dashscope.aliyuncs.com/api-ws/v1/inference" self.ws = None self.task_id = None self.recv_task = None self.is_connected = False self.is_ready = False # 等待task-started事件 # ✅ 防止并发连接的锁 self._connect_lock = asyncio.Lock() # ✅ 发送队列控制back-pressure self._send_queue = None self._send_task = None # 🔧 新增:预连接任务管理 self._preconnect_task = None self._connection_initiated = False logger.info(f"DashScope Paraformer STT processor initialized with model: {model}") logger.debug("DashScope STT将采用预连接机制,避免阻塞音频管道") def start_preconnection(self): """启动预连接任务,在后台建立WebSocket连接""" if self._connection_initiated: return # 已经启动过连接 self._connection_initiated = True self._preconnect_task = asyncio.create_task(self._connect()) logger.info("🔧 DashScope预连接任务已启动,在后台建立WebSocket连接") async def _connect(self): """建立WebSocket连接并等待task-started""" # ✅ 使用锁防止并发连接 async with self._connect_lock: if self.is_connected: return # 已连接,直接返回 try: import uuid logger.info("🔧 [DashScope连接] 开始WebSocket连接和认证流程...") # 🔧 修复websockets 13版本兼容性:不能使用proxy=None参数,改用NO_PROXY环境变量 import os old_no_proxy = os.environ.get("NO_PROXY", "") old_http_proxy = os.environ.get("HTTP_PROXY", "") old_https_proxy = os.environ.get("HTTPS_PROXY", "") try: # 临时禁用代理,避免DashScope被系统代理劫持 # 这是国内用户连接失败的主要原因,websockets 13版本兼容方案 if old_no_proxy: os.environ["NO_PROXY"] = f"{old_no_proxy},dashscope.aliyuncs.com" else: os.environ["NO_PROXY"] = "dashscope.aliyuncs.com" # 临时清除代理设置(仅对DashScope连接) if old_http_proxy: del os.environ["HTTP_PROXY"] if old_https_proxy: del os.environ["HTTPS_PROXY"] headers = {"Authorization": f"Bearer {self.api_key}"} logger.debug(f"🔧 [DashScope连接] 连接URL: {self.ws_url}") logger.debug(f"🔧 [DashScope连接] NO_PROXY设置: {os.environ.get('NO_PROXY')}") # websockets 13版本兼容:不使用proxy参数 self.ws = await websockets.connect( self.ws_url, extra_headers=headers, ping_interval=None, # 关闭ping,使用heartbeat ping_timeout=None, max_size=2**20, # 1MB最大消息大小 max_queue=128 # 限制发送队列大小 # 🔧 注意:websockets 13版本不支持proxy参数,通过环境变量控制 ) finally: # 恢复原始代理设置 if old_no_proxy: os.environ["NO_PROXY"] = old_no_proxy else: os.environ.pop("NO_PROXY", None) if old_http_proxy: os.environ["HTTP_PROXY"] = old_http_proxy if old_https_proxy: os.environ["HTTPS_PROXY"] = old_https_proxy logger.info("🔧 [DashScope连接] WebSocket连接建立成功") # 生成唯一task_id self.task_id = uuid.uuid4().hex logger.debug(f"🔧 [DashScope连接] 生成task_id: {self.task_id}") # ✅ 发送完整的run-task指令 (遵循官方协议) run_task = { "header": { "action": "run-task", "task_id": self.task_id, "streaming": "duplex" }, "payload": { "task_group": "audio", "task": "asr", "function": "recognition", "model": self.model, "parameters": { "format": "pcm", "sample_rate": 16000, "heartbeat": True # 60s静音保活 }, "input": {} } } logger.debug(f"🔧 [DashScope连接] 发送run-task指令: {run_task}") await self.ws.send(json.dumps(run_task)) logger.info(f"🔧 [DashScope连接] 发送run-task指令,task_id: {self.task_id}") # ✅ 等待task-started事件 logger.info("🔧 [DashScope连接] 等待task-started事件...") timeout_count = 0 max_timeout = 30 # 最多等待30秒 while timeout_count < max_timeout: try: # 设置5秒超时避免无限等待 msg_text = await asyncio.wait_for(self.ws.recv(), timeout=5.0) logger.debug(f"🔧 [DashScope连接] 收到消息: {msg_text[:200]}...") try: msg = json.loads(msg_text) event = msg.get("header", {}).get("event") if event == "task-started": logger.info("🔧 [DashScope连接] 收到task-started事件,连接就绪") self.is_ready = True break elif event == "task-failed": error_msg = msg.get("payload", {}).get("message", "Unknown error") logger.error(f"🔧 [DashScope连接] 任务失败: {error_msg}") raise RuntimeError(f"DashScope task failed: {error_msg}") else: logger.debug(f"🔧 [DashScope连接] 其他事件: {event}") except json.JSONDecodeError: logger.warning(f"🔧 [DashScope连接] 无效JSON响应: {msg_text}") except asyncio.TimeoutError: timeout_count += 5 logger.warning(f"🔧 [DashScope连接] 等待task-started超时 ({timeout_count}/{max_timeout}s)") continue if not self.is_ready: raise RuntimeError(f"等待task-started事件超时 ({max_timeout}s)") # ✅ 启动发送队列和接收任务 self._send_queue = asyncio.Queue(maxsize=100) # 限制队列大小 self._send_task = asyncio.create_task(self._send_loop()) self.recv_task = asyncio.create_task(self._recv_loop()) self.is_connected = True logger.info("✅ [DashScope连接] WebSocket连接完全就绪,可接收音频流") except Exception as e: logger.error(f"❌ [DashScope连接] 连接失败: {e}") self.is_connected = False self.is_ready = False raise async def _send_loop(self): """发送队列循环,控制back-pressure""" try: while True: audio_data = await self._send_queue.get() if audio_data is None: # 结束信号 break if self.ws and self.is_connected: try: await self.ws.send(audio_data) except websockets.exceptions.ConnectionClosed: logger.warning("WebSocket closed during send") break except Exception as e: logger.error(f"Error sending audio data: {e}") break self._send_queue.task_done() except Exception as e: logger.error(f"Error in send loop: {e}") async def _recv_loop(self): """接收WebSocket消息循环""" try: async for message in self.ws: try: data = json.loads(message) header = data.get("header", {}) event = header.get("event") name = header.get("name") # 处理不同类型的事件 if event == "task-started": logger.info("Task started event received") continue elif event == "task-failed": error_msg = data.get("payload", {}).get("message", "Unknown error") logger.error(f"DashScope task failed: {error_msg}") self.is_connected = False break elif event == "task-finished": logger.info("Task finished event received") break elif name == "result-generated": # ✅ 处理识别结果 payload = data.get("payload", {}) output = payload.get("output", {}) sentence = output.get("sentence", {}) text = sentence.get("text", "") is_final = sentence.get("sentence_end", False) if text.strip(): # 创建TextFrame并标记是否为最终结果 text_frame = TextFrame(text=text.strip()) if hasattr(text_frame, 'is_final'): text_frame.is_final = is_final logger.info(f"DashScope STT {'[FINAL]' if is_final else '[PARTIAL]'}: {text}") await self.push_frame(text_frame) else: # 记录其他事件用于调试 logger.debug(f"DashScope event: {event or name}, data: {data}") except json.JSONDecodeError: logger.warning(f"Invalid JSON received from DashScope: {message}") except Exception as e: logger.error(f"Error processing DashScope message: {e}") except websockets.exceptions.ConnectionClosed: logger.info("DashScope WebSocket connection closed") self.is_connected = False self.is_ready = False except Exception as e: logger.error(f"Error in DashScope recv loop: {e}") self.is_connected = False self.is_ready = False async def process_frame(self, frame: Frame, direction) -> Optional[Frame]: """处理音频帧并发送到DashScope进行STT识别""" if isinstance(frame, AudioRawFrame): try: # 🔧 核心修复:启动预连接(如果尚未启动) if not self._connection_initiated: self.start_preconnection() # 🔧 非阻塞连接状态检查 if not self.is_ready: # 检查预连接任务是否完成 if self._preconnect_task and self._preconnect_task.done(): try: await self._preconnect_task # 获取异常(如果有) logger.debug("🔧 [DashScope] 预连接任务完成") except Exception as e: logger.error(f"🔧 [DashScope] 预连接失败: {e}") # 🔧 修复:连接失败但不中断管道,允许重试 self._connection_initiated = False self._preconnect_task = None # 检查是否需要重连延迟 import config retry_secs = getattr(config, 'DASHSCOPE_RETRY_SECS', 10) logger.info(f"🔧 [DashScope] 将在{retry_secs}秒后重试连接") # 注:这里不阻塞,让音频帧继续流转 # 如果仍未就绪,跳过该帧(非阻塞),但让音频继续流转 if not self.is_ready: logger.debug("🔧 [DashScope] 连接未就绪,跳过STT处理(音频帧继续传递)") return frame # 🔧 关键:返回原始帧让下游继续处理 # 获取音频数据 audio_data = frame.audio if isinstance(audio_data, np.ndarray): # 转换为16-bit PCM字节数据 if audio_data.dtype != np.int16: # 假设输入是float32 [-1, 1],转换为int16 audio_data = (audio_data * 32767).astype(np.int16) audio_bytes = audio_data.tobytes() else: audio_bytes = audio_data if isinstance(audio_data, (bytes, bytearray)) else audio_data.tobytes() # ✅ 使用队列发送音频数据,控制back-pressure if self._send_queue and not self._send_queue.full(): try: self._send_queue.put_nowait(audio_bytes) logger.debug("🔧 [DashScope] 音频帧已加入发送队列") except asyncio.QueueFull: logger.warning("🔧 [DashScope] 发送队列已满,丢弃音频帧") except websockets.exceptions.ConnectionClosed: logger.warning("🔧 [DashScope] 连接中断,标记为未连接(音频帧继续传递)") self.is_connected = False self.is_ready = False # 🔧 重置连接状态允许重新连接,但不中断音频流 self._connection_initiated = False self._preconnect_task = None # 返回原始帧让音频流继续 return frame except Exception as e: logger.error(f"🔧 [DashScope] 音频帧处理错误(不影响音频流): {e}") # 🔧 任何异常都不应该中断音频管道 return frame return frame async def cleanup(self): """清理资源""" try: # 🔧 清理预连接任务 if self._preconnect_task and not self._preconnect_task.done(): self._preconnect_task.cancel() try: await self._preconnect_task except asyncio.CancelledError: logger.debug("🔧 [DashScope] 预连接任务已取消") except Exception as e: logger.warning(f"🔧 [DashScope] 预连接任务清理异常: {e}") # ✅ 停止发送队列 if self._send_queue: await self._send_queue.put(None) # 发送结束信号 if self._send_task: await self._send_task self._send_task = None # ✅ 发送finish-task结束任务 if self.ws and self.task_id and self.is_connected: finish_task = { "header": { "action": "finish-task", "task_id": self.task_id }, "payload": {} } await self.ws.send(json.dumps(finish_task)) logger.info(f"🔧 [DashScope] 发送finish-task,task_id: {self.task_id}") except Exception as e: logger.warning(f"🔧 [DashScope] finish-task发送错误: {e}") # 清理资源 if self.recv_task: self.recv_task.cancel() try: await self.recv_task except asyncio.CancelledError: pass if self.ws: await self.ws.close() self.is_connected = False self.is_ready = False self._send_queue = None self._connection_initiated = False self._preconnect_task = None logger.info("🔧 [DashScope] 资源清理完成")

class FishAudioSTTProcessor(FrameProcessor):
"""Fish Audio STT处理器,集成到Pipecat Pipeline"""

text
def __init__(self, api_key: str): # ✅ 关键修复:显式声明需要输入队列 (Pipecat 0.7+) super().__init__(needs_input_queue=True) if not FISH_AUDIO_AVAILABLE: raise RuntimeError("Fish Audio SDK not available") self.session = Session(api_key) # ✅ 线程池执行器防止阻塞事件循环 self.executor = None logger.info("Fish Audio STT processor initialized") async def process_frame(self, frame: Frame, direction) -> Optional[Frame]: """处理音频帧并返回文本帧""" if isinstance(frame, AudioRawFrame): try: # ✅ 修复字节处理问题 audio_data = frame.audio audio_bytes = audio_data if isinstance(audio_data, (bytes, bytearray)) else audio_data.tobytes() # ✅ 使用executor防止阻塞事件循环 if self.executor is None: import concurrent.futures self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=2, thread_name_prefix="FishSTT") loop = asyncio.get_event_loop() # ✅ 在线程池中执行同步SDK调用 def _sync_asr(): return self.session.asr(ASRRequest( audio=audio_bytes, language="zh", format="pcm", # 直接传送PCM数据 sample_rate=RATE, channels=CHANNELS )) response = await loop.run_in_executor(self.executor, _sync_asr) if response.text.strip(): logger.info(f"Fish Audio ASR: {response.text}") return TextFrame(text=response.text) except Exception as e: logger.error(f"Fish Audio STT error: {e}") # 如果PCM直接传送失败,回退到MP3编码(也在executor中执行) try: def _sync_asr_fallback(): # 创建MP3文件作为备选方案 with tempfile.NamedTemporaryFile(suffix='.wav', delete=True) as temp_wav: import wave with wave.open(temp_wav.name, 'wb') as wav_file: wav_file.setnchannels(CHANNELS) wav_file.setsampwidth(2) # 16-bit wav_file.setframerate(RATE) wav_file.writeframes(audio_bytes) # 转换为MP3 from pydub import AudioSegment audio_segment = AudioSegment.from_wav(temp_wav.name) with tempfile.NamedTemporaryFile(suffix='.mp3', delete=True) as temp_mp3: audio_segment.export(temp_mp3.name, format='mp3') with open(temp_mp3.name, 'rb') as f: mp3_data = f.read() # Fish Audio ASR return self.session.asr(ASRRequest( audio=mp3_data, language="zh" )) response = await loop.run_in_executor(self.executor, _sync_asr_fallback) if response.text.strip(): logger.info(f"Fish Audio ASR (fallback): {response.text}") return TextFrame(text=response.text) except Exception as fallback_error: logger.error(f"Fish Audio STT fallback also failed: {fallback_error}") return frame async def cleanup(self): """清理线程池资源""" if self.executor: self.executor.shutdown(wait=True) self.executor = None

class FishAudioTTSProcessor(FrameProcessor):
"""
流式 Fish Audio TTS:
- 后台线程调用同步 SDK,拿到 PCM 块后立即丢到 asyncio.Queue
- 协程侧从队列读,每来一块就 push TTSAudioRawFrame
"""

text
def __init__(self, api_key: str, model_id: Optional[str] = None): # ✅ 关键修复:显式声明需要输入队列 (Pipecat 0.7+) super().__init__(needs_input_queue=True) # 懒导入,避免无 Fish SDK 时报错阻断程序 if not FISH_AUDIO_AVAILABLE: raise RuntimeError("Fish Audio SDK not available") self.session = Session(api_key) self.model_id = model_id # 内部通信队列(线程 → 协程) self._pcm_queue: asyncio.Queue[bytes] = asyncio.Queue() self._worker_tasks: set[asyncio.Task] = set() logger.info(f"Fish Audio TTS processor initialized with model: {model_id} (streaming mode)") async def process_frame(self, frame: Frame, direction) -> Optional[Frame]: """处理文本帧并返回流式音频帧""" if not isinstance(frame, TextFrame): return frame text = frame.text logger.debug(f"[TTS] 接收到文本 {len(text)} 字,开始合成") # 应用情感控制 cleaned_text, emotion = extract_emotion_from_ai_response(text) formatted_text = EmotionMapper.format_for_fish_audio(cleaned_text, emotion) # 启动后台线程 loop = asyncio.get_running_loop() worker = loop.run_in_executor( None, # 默认线程池 self._blocking_tts_worker, formatted_text, ) task = asyncio.create_task(worker) self._worker_tasks.add(task) task.add_done_callback(self._worker_tasks.discard) # 不阻断地边读边推 while True: try: pcm_chunk: bytes = await asyncio.wait_for(self._pcm_queue.get(), timeout=10) except asyncio.TimeoutError: logger.warning("[TTS] 10 s 无数据,认为已结束") break if pcm_chunk is None: # 线程发来的结束标记 break # 直接转 int16 → numpy → Frame audio_np = np.frombuffer(pcm_chunk, dtype=np.int16) await self.push_frame( TTSAudioRawFrame( audio=audio_np, sample_rate=RATE, num_channels=CHANNELS ) ) return None # 不往下游继续传 TextFrame def _blocking_tts_worker(self, text: str): """ 在线程里执行同步 Fish Audio SDK 调用。 把 PCM 块放进 asyncio 队列;结束后放一个 None 作为哨兵。 """ try: if self.model_id: req = TTSRequest( text=text, reference_id=self.model_id, format="pcm", sample_rate=RATE, channels=CHANNELS, ) else: req = TTSRequest( text=text, format="pcm", sample_rate=RATE, channels=CHANNELS, ) # SDK 返回一个迭代器 / 生成器,每次 yield PCM bytes for chunk in self.session.tts(req): # 队列共享:主协程异步读取 asyncio.run_coroutine_threadsafe( self._pcm_queue.put(chunk), asyncio.get_event_loop() ) logger.debug("[TTS] all chunks sent") except Exception as e: logger.error(f"[TTS] 合成失败: {e}", exc_info=True) # fallback: 转 MP3 然后流式解码 try: self._mp3_fallback_stream(text) except Exception as fb_err: logger.error(f"[TTS] MP3 fallback 仍失败: {fb_err}", exc_info=True) finally: # 发送结束哨兵 asyncio.run_coroutine_threadsafe( self._pcm_queue.put(None), asyncio.get_event_loop() ) def _mp3_fallback_stream(self, text: str): """MP3 fallback(边解边播)""" try: from pydub import AudioSegment req = TTSRequest(text=text, format="mp3") mp3_bytes = b"".join(self.session.tts(req)) with tempfile.NamedTemporaryFile(suffix=".mp3", delete=True) as f: f.write(mp3_bytes) f.flush() mp3 = AudioSegment.from_mp3(f.name) pcm_data = mp3.set_frame_rate(RATE).set_channels(CHANNELS).raw_data # 分块大小:20 ms = 16000*0.02*2 ≈ 640 bytes step = int(RATE * 0.02 * 2) for i in range(0, len(pcm_data), step): asyncio.run_coroutine_threadsafe( self._pcm_queue.put(pcm_data[i : i + step]), asyncio.get_event_loop() ) except Exception as e: logger.error(f"MP3 fallback stream error: {e}") async def cleanup(self): """清理资源""" # 清理未完成的任务 for task in self._worker_tasks: if not task.done(): task.cancel() # 等待任务完成 if self._worker_tasks: await asyncio.gather(*self._worker_tasks, return_exceptions=True) self._worker_tasks.clear() # 清空队列 while not self._pcm_queue.empty(): try: self._pcm_queue.get_nowait() except asyncio.QueueEmpty: break

class AudioMonitorProcessor(FrameProcessor):
"""音频监控处理器 - 显示音频输入输出状态"""

text
def __init__(self, name: str = "AudioMonitor"): # ✅ FrameProcessor接受name参数,会设置_name字段 super().__init__(name=name, needs_input_queue=True) self.frame_count = 0 self.last_log_time = time.time() self.volume_samples = [] self.max_volume = 0.0 self.min_volume = float('inf') logger.info(f"AudioMonitorProcessor '{name}' initialized") async def process_frame(self, frame: Frame, direction) -> Optional[Frame]: """监控音频帧并显示统计信息""" current_time = time.time() if isinstance(frame, AudioRawFrame): self.frame_count += 1 # 计算音量信息 audio_data = frame.audio if isinstance(audio_data, np.ndarray): # 计算RMS音量 if audio_data.dtype == np.int16: # int16 范围: -32768 到 32767 volume_rms = np.sqrt(np.mean(audio_data.astype(np.float32) ** 2)) volume_db = 20 * np.log10(max(volume_rms / 32767.0, 1e-10)) volume_percent = min(100, max(0, (volume_rms / 32767.0) * 100)) else: # float32 范围: -1.0 到 1.0 volume_rms = np.sqrt(np.mean(audio_data ** 2)) volume_db = 20 * np.log10(max(volume_rms, 1e-10)) volume_percent = min(100, max(0, volume_rms * 100)) self.volume_samples.append(volume_percent) self.max_volume = max(self.max_volume, volume_percent) self.min_volume = min(self.min_volume, volume_percent) # 🔧 修复6: 更频繁的日志输出,确保能看到音频活动 if current_time - self.last_log_time >= 1.0: # 改为1秒输出一次 avg_volume = np.mean(self.volume_samples) if self.volume_samples else 0 # 生成音量条显示 volume_bar_length = 20 filled_length = int(volume_bar_length * avg_volume / 100) volume_bar = "█" * filled_length + "░" * (volume_bar_length - filled_length) # 🔧 修复6: 强制显示音频监控状态,即使静音也要显示 logger.info(f"🎤 [{self.name}] 帧数:{self.frame_count} | " f"音量:{avg_volume:.1f}% |{volume_bar}| " f"峰值:{self.max_volume:.1f}% | " f"样本数:{len(audio_data)} | " f"数据类型:{audio_data.dtype}") # 🔧 修复6: 检测是否有有效音频信号 if avg_volume > 0.1: # 任何非零音频都算有效 logger.info(f"✅ [{self.name}] 检测到音频信号 - 音频流工作正常") elif self.frame_count > 10: # 收到帧但无音量可能是静音 logger.warning(f"⚠️ [{self.name}] 收到音频帧但无声音 - 检查音频源") else: logger.info(f"🎯 [{self.name}] 音频流已激活,等待音频信号...") # 重置统计 self.last_log_time = current_time self.volume_samples = [] self.max_volume = 0.0 self.min_volume = float('inf') # 🔧 修复6: 首次收到帧时的提示 if self.frame_count == 1: logger.info(f"🎯 [{self.name}] 首次收到AudioRawFrame - 音频管道已激活") elif isinstance(frame, (bytes, bytearray)): # 处理字节格式音频 self.frame_count += 1 logger.info(f"🎤 [{self.name}] 收到字节音频帧: {len(frame)}字节") if current_time - self.last_log_time >= 1.0: logger.info(f"✅ [{self.name}] 音频流活跃 - 字节格式") self.last_log_time = current_time elif isinstance(frame, TextFrame): if direction == "downstream": logger.info(f"📝 [{self.name}] 用户语音识别: '{frame.text}'") elif direction == "upstream": logger.info(f"🤖 [{self.name}] AI回复文本: '{frame.text[:50]}{'...' if len(frame.text) > 50 else ''}'") elif isinstance(frame, TTSAudioRawFrame): logger.info(f"🔊 [{self.name}] TTS音频输出: {len(frame.audio)} 采样点") return frame

class DeviceTestProcessor(FrameProcessor):
"""设备测试处理器 - 检测音频设备连接状态"""

text
def __init__(self): super().__init__(name="DeviceTest", needs_input_queue=True) self.input_detected = False self.output_detected = False self.test_start_time = time.time() logger.info("DeviceTestProcessor initialized - 开始设备检测") async def process_frame(self, frame: Frame, direction) -> Optional[Frame]: """检测设备是否正常工作""" if isinstance(frame, AudioRawFrame) and not self.input_detected: # 检测到音频输入 audio_data = frame.audio if isinstance(audio_data, np.ndarray) and len(audio_data) > 0: volume_rms = np.sqrt(np.mean(audio_data.astype(np.float32) ** 2)) if volume_rms > 100: # 有实际音频信号 self.input_detected = True logger.info(f"✅ 音频输入设备工作正常 - 检测到音频信号 (用时: {time.time() - self.test_start_time:.2f}s)") elif isinstance(frame, TTSAudioRawFrame) and not self.output_detected: # 检测到音频输出 self.output_detected = True logger.info(f"✅ 音频输出设备工作正常 - 检测到TTS输出 (用时: {time.time() - self.test_start_time:.2f}s)") return frame

class ConversationMemoryProcessor(FrameProcessor):
"""对话记忆处理器,管理对话历史和记录"""

text
def __init__( self, user_id: str, role_prompt: str, summary_callback: Optional[Callable] = None, log_to_file: Optional[str] = None, max_history: int = 50 # ✅ 限制历史长度防止内存泄露 ): # ✅ 关键修复:显式声明需要输入队列 (Pipecat 0.7+) super().__init__(needs_input_queue=True) self.user_id = user_id self.role_prompt = role_prompt self.summary_callback = summary_callback self.log_to_file = log_to_file self.max_history = max_history # ✅ 使用线程安全的deque,防止并发问题 from collections import deque self.conversation_history = deque(maxlen=max_history) self.events_log = deque(maxlen=max_history * 2) # 事件日志稍微多一些 # ✅ 异步锁保护并发操作 self._history_lock = asyncio.Lock() self._log_lock = asyncio.Lock() self.last_activity_time = time.time() # 终止状态 self.user_terminated = False self.user_rejected = False self.silence_terminated = False # ✅ 异步文件写入 self._file_writer = None logger.info(f"ConversationMemoryProcessor initialized for user {user_id}") async def process_frame(self, frame: Frame, direction) -> Optional[Frame]: """处理帧并管理对话记忆""" # 记录用户语音 if isinstance(frame, TextFrame) and direction == "downstream": user_text = frame.text # ✅ 使用锁保护历史记录 async with self._history_lock: self.conversation_history.append({"role": "user", "content": user_text}) await self._log("user", user_text) self.last_activity_time = time.time() # 构建LLM消息,包含系统提示和历史 async with self._history_lock: # 创建消息副本,避免在锁外访问 messages = [{"role": "system", "content": self.role_prompt}] messages.extend(list(self.conversation_history)[-20:]) # 限制历史长度 return LLMMessagesFrame(messages=messages) # 记录AI回复 if isinstance(frame, TextFrame) and direction == "upstream": ai_text = frame.text # ✅ 使用锁保护历史记录 async with self._history_lock: self.conversation_history.append({"role": "assistant", "content": ai_text}) await self._log("assistant", ai_text) return frame async def _log(self, role: str, content: str) -> None: """异步记录对话事件""" timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") event = { "timestamp": timestamp, "role": role, "content": content } # ✅ 使用锁保护事件日志 async with self._log_lock: self.events_log.append(event) logger.info(f"[{role}] {content[:100]}{'...' if len(content) > 100 else ''}") # ✅ 异步文件写入 if self.log_to_file: await self._write_to_file_async(event) async def _write_to_file_async(self, event: dict) -> None: """异步写入文件""" try: import aiofiles # 确保目录存在 os.makedirs(os.path.dirname(os.path.abspath(self.log_to_file)), exist_ok=True) role_name = { "user": self.user_id, "assistant": "助手", "system": "系统" }.get(event["role"], event["role"]) line = f'{event["timestamp"]} | [{role_name}] {event["content"]}\n' async with aiofiles.open(self.log_to_file, "a", encoding="utf-8") as f: await f.write(line) except ImportError: # 回退到同步写入(在executor中) def _sync_write(): os.makedirs(os.path.dirname(os.path.abspath(self.log_to_file)), exist_ok=True) with open(self.log_to_file, "a", encoding="utf-8") as f: role_name = { "user": self.user_id, "assistant": "助手", "system": "系统" }.get(event["role"], event["role"]) f.write(f'{event["timestamp"]} | [{role_name}] {event["content"]}\n') import concurrent.futures if not hasattr(self, '_write_executor'): self._write_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix="FileWriter") loop = asyncio.get_event_loop() await loop.run_in_executor(self._write_executor, _sync_write) except Exception as e: logger.error(f"Error writing to log file: {e}") def set_user_terminated(self): """标记用户主动终止""" self.user_terminated = True logger.info(f"User {self.user_id} terminated conversation") def set_user_rejected(self): """标记用户拒绝通话""" self.user_rejected = True logger.info(f"User {self.user_id} rejected call") def set_silence_terminated(self): """标记静音超时终止""" self.silence_terminated = True logger.info(f"Conversation with {self.user_id} ended due to silence") async def finalize_conversation(self): """结束对话处理""" try: # 确定终止原因 if self.user_terminated: termination_reason = "用户主动挂断" elif self.user_rejected: termination_reason = "用户拒绝接听" elif self.silence_terminated: termination_reason = "长时间无响应" else: termination_reason = "正常结束" await self._log("system", f"Conversation ended: {termination_reason}") # ✅ 在后台任务中生成总结,不阻塞管道停止 if self.summary_callback and self.events_log: asyncio.create_task(self._generate_summary_background()) # ✅ 清理executor if hasattr(self, '_write_executor'): self._write_executor.shutdown(wait=False) except Exception as e: logger.error(f"Error finalizing conversation: {e}", exc_info=True) async def _generate_summary_background(self): """在后台生成对话总结,不阻塞主流程""" try: # 获取事件日志的副本 async with self._log_lock: events_copy = list(self.events_log) # 格式化对话文本 conversation_text = "\n".join([ f"{'用户' if e['role'] == 'user' else '助手' if e['role'] == 'assistant' else '系统'}: {e['content']}" for e in events_copy ]) # 添加终止信息 termination_info = "" if self.user_terminated: termination_info = "\n\n[注意:用户主动挂断了语音通话]" elif self.user_rejected: termination_info = "\n\n[注意:用户拒绝接听语音通话]" elif self.silence_terminated: termination_info = "\n\n[注意:用户长时间未说话,系统主动终止了通话]" # 创建总结提示 summary_prompt = f"""以下是刚刚结束的语音对话内容:

{conversation_text}{termination_info}

语音对话已经结束。请你继续保持角色扮演,使用你的人设和语气,基于这次对话的内容进行总结,向用户告别,并依照自己的性格,说出你内心所想。请直接以你的角色身份发言,不要使用第三人称。"""

text
# 调用总结回调 if asyncio.iscoroutinefunction(self.summary_callback): await self.summary_callback(self.user_id, summary_prompt, events_copy) else: # 在executor中执行同步回调 import concurrent.futures executor = concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix="SummaryCallback") loop = asyncio.get_event_loop() await loop.run_in_executor(executor, self.summary_callback, self.user_id, summary_prompt, events_copy) executor.shutdown(wait=False) except Exception as e: logger.error(f"Error generating summary in background: {e}", exc_info=True)

class StreamingVoicePipelineHandler:
"""
基于Pipecat LocalAudioTransport的流式语音管道处理器
实现真正的低延迟流式对话(120-180ms首包)
"""

text
def __init__( self, *, # Core configuration user_id: str = "user", api_key: str, base_url: Optional[str] = None, # Model configuration chat_model: str = "gpt-4o", tts_service: str = "fish", # "fish", "minimax" # Role and conversation role_prompt: Optional[str] = None, # Fish Audio configuration fish_audio_key: Optional[str] = None, fish_audio_model_id: Optional[str] = None, # Audio device configuration input_device_index: Optional[int] = None, output_device_index: Optional[int] = None, # Timeout and behavior idle_timeout: int = 120, # Callbacks summary_callback: Optional[Callable[[str, str, List[dict]], None]] = None, log_to_file: Optional[str] = None, ) -> None: if not PIPECAT_AVAILABLE: raise RuntimeError("Pipecat is not available. Please install pipecat-ai[local].") self.user_id = user_id self.api_key = api_key self.base_url = base_url self.chat_model = chat_model self.tts_service = tts_service self.role_prompt = role_prompt or "你是一个友好的助手。请用自然、温暖的语气回应。" self.idle_timeout = idle_timeout self.summary_callback = summary_callback self.log_to_file = log_to_file # Audio configuration self.input_device_index = input_device_index self.output_device_index = output_device_index # Fish Audio configuration self.fish_audio_key = fish_audio_key self.fish_audio_model_id = fish_audio_model_id # Runtime state self.is_active = False self.pipeline_runner = None self.memory_processor = None self.stt_processor = None # 存储STT处理器用于cleanup self.tts_processor = None # 存储TTS处理器用于cleanup logger.info(f"StreamingVoicePipelineHandler initialized for user {user_id}") logger.info(f" TTS Service: {tts_service}") logger.info(f" Chat Model: {chat_model}") logger.info(f" STT Service: DashScope Paraformer-Realtime-V2 (固定)") # 🔧 调试日志: 诊断Fish Audio参数传递 logger.info(f" Fish Audio Key: {'已设置' if fish_audio_key else '未设置'} (长度: {len(fish_audio_key) if fish_audio_key else 0})") logger.info(f" Fish Audio Model ID: {fish_audio_model_id or '未设置'}") logger.info(f" Fish Audio SDK Available: {FISH_AUDIO_AVAILABLE}") def _create_pipeline(self) -> Pipeline: """创建Pipecat双向流式管道 (防回声双Cable方案)""" # 🔧 新增:支持本机麦克风直连调试模式 mic_loopback_mode = os.getenv("PIPECAT_MIC_LOOPBACK") == "1" if mic_loopback_mode: logger.info("🎙️ 启用PIPECAT_MIC_LOOPBACK调试模式:使用本机麦克风直连") logger.info(" 此模式下:输入=真实麦克风,输出=真实扬声器,跳过VB-Cable") # 1. 检测双VB-Cable设备配置(调试模式下跳过) if not mic_loopback_mode: devices = WindowsAudioDeviceDetector.get_audio_device_indices() if not devices["config_complete"]: # 打印详细设备列表和配置状态 WindowsAudioDeviceDetector.print_device_list() status = WindowsAudioDeviceDetector.get_configuration_status() raise RuntimeError(f"{status['message']}\n建议步骤: {', '.join(status['next_steps'])}") else: # 🔧 调试模式:使用系统默认音频设备 devices = { "ai_input_device": None, # None表示使用默认麦克风 "ai_output_device": None, # None表示使用默认扬声器 "ai_input_device_name": "系统默认麦克风", "ai_output_device_name": "系统默认扬声器", "config_complete": True } logger.info("🎙️ 调试模式设备配置:") logger.info(" 输入: 系统默认麦克风(通常是USB/内置麦克风)") logger.info(" 输出: 系统默认扬声器(通常是USB/内置扬声器)") # 2. Transport - 双向音频输入输出 (防回声配置 / 调试模式) # 使用基于名称的设备匹配解决索引不匹配问题 pyaudio_input_device_index = None pyaudio_output_device_index = None # 获取PyAudio设备索引并验证最佳通道数 if mic_loopback_mode: # 🔧 调试模式:强制使用1声道,避免复杂的多声道处理 AI_IN_CHANNELS = 1 # 真实麦克风通常是单声道 AI_OUT_CHANNELS = 1 # 真实扬声器输出单声道即可 logger.info(f"🎙️ 调试模式声道配置: 输入={AI_IN_CHANNELS}ch, 输出={AI_OUT_CHANNELS}ch") else: # 正常模式:使用原有的VB-Cable多声道逻辑 AI_IN_CHANNELS = 2 # 默认2声道,提高兼容性 AI_OUT_CHANNELS = 1 # 默认单声道 if not mic_loopback_mode and devices["ai_input_device_name"]: pyaudio_input_device_index = WindowsAudioDeviceDetector.get_pyaudio_device_index( devices["ai_input_device_name"], is_input=True ) if pyaudio_input_device_index is not None: # 🔧 修复3: 使用新的验证函数,返回最佳可用通道数 validated_channels = WindowsAudioDeviceDetector.validate_pyaudio_device_with_fallback( pyaudio_input_device_index, is_input=True, rate=RATE ) if validated_channels is not None: AI_IN_CHANNELS = validated_channels logger.info(f"✅ 输入设备验证成功: {validated_channels}ch") else: logger.warning("输入设备验证失败,使用默认设备") pyaudio_input_device_index = None if not mic_loopback_mode and devices["ai_output_device_name"]: pyaudio_output_device_index = WindowsAudioDeviceDetector.get_pyaudio_device_index( devices["ai_output_device_name"], is_input=False ) if pyaudio_output_device_index is not None: # 🔧 修复3: 使用新的验证函数,返回最佳可用通道数 validated_channels = WindowsAudioDeviceDetector.validate_pyaudio_device_with_fallback( pyaudio_output_device_index, is_input=False, rate=RATE ) if validated_channels is not None: AI_OUT_CHANNELS = validated_channels logger.info(f"✅ 输出设备验证成功: {validated_channels}ch") else: logger.warning("输出设备验证失败,使用默认设备") pyaudio_output_device_index = None # 🔧 关键修复说明: # - LocalAudioTransport现在使用通过设备名称映射得到的PyAudio索引 # - sounddevice索引仅用于初始设备检测和配置状态确认 # 根据配置决定是否启用VAD vad_analyzer = None if PIPECAT_VAD_AVAILABLE: try: import config enable_vad = getattr(config, 'ENABLE_VAD', True) if enable_vad: # 创建VAD参数对象 - 使用新的导入路径 from pipecat.audio.vad.vad_analyzer import VADParams vad_stop_secs = getattr(config, 'VAD_STOP_SECS', 0.8) vad_start_secs = getattr(config, 'VAD_START_SECS', 0.2) vad_min_volume = getattr(config, 'VAD_MIN_VOLUME', 0.6) vad_params = VADParams( stop_secs=vad_stop_secs, start_secs=vad_start_secs, min_volume=vad_min_volume ) vad_analyzer = SileroVADAnalyzer( sample_rate=RATE, params=vad_params ) logger.info(f"✅ Silero VAD enabled (stop: {vad_stop_secs}s, start: {vad_start_secs}s, vol: {vad_min_volume})") else: logger.info("⚠️ VAD disabled by configuration") except Exception as e: logger.warning(f"Silero VAD initialization failed: {e}, proceeding without VAD") vad_analyzer = None else: logger.info("⚠️ Silero VAD not available") # 🔧 修复6: 配置优化的音频传输参数,解决PortAudio流启动问题 blocksize_samples = int(RATE * 0.02) # 20ms帧 = 320样本@16kHz # 🔧 关键修复:LocalAudioTransport需要PyAudio索引,不是sounddevice索引 if mic_loopback_mode: # 调试模式:使用默认设备(None表示默认) transport_input_device = None transport_output_device = None logger.info(f"🎙️ 调试模式Transport配置:") logger.info(f" 输入设备: 默认麦克风") logger.info(f" 输出设备: 默认扬声器") else: # 正常模式:获取PyAudio索引,这是LocalAudioTransport的要求 input_pya_idx = WindowsAudioDeviceDetector.get_pyaudio_device_index( devices["ai_input_device_name"], is_input=True) if devices["ai_input_device_name"] else None output_pya_idx = WindowsAudioDeviceDetector.get_pyaudio_device_index( devices["ai_output_device_name"], is_input=False) if devices["ai_output_device_name"] else None transport_input_device = input_pya_idx transport_output_device = output_pya_idx logger.info(f"🔊 正常模式Transport配置 (修复后):") logger.info(f" 输入设备: PyAudio索引 {transport_input_device} (从设备名 '{devices['ai_input_device_name']}')") logger.info(f" 输出设备: PyAudio索引 {transport_output_device} (从设备名 '{devices['ai_output_device_name']}')") logger.info(f" 🔍 关键修复: LocalAudioTransport现在使用PyAudio索引而非sounddevice索引") # 🔧 修复5: 添加最终配置日志(在变量定义之后) logger.info(f"🎤 录音端最终配置: PyAudio索引={transport_input_device}, 通道={AI_IN_CHANNELS}ch, 采样率={RATE}Hz") logger.info(f"🔊 播放端最终配置: PyAudio索引={transport_output_device}, 通道={AI_OUT_CHANNELS}ch, 采样率={RATE}Hz") transport_params = LocalAudioTransportParams( audio_in_enabled=True, audio_out_enabled=True, # 🔧 关键修复: LocalAudioTransport需要PyAudio设备索引 input_device_index=transport_input_device, output_device_index=transport_output_device, # 动态设置声道数和采样率 audio_in_sample_rate=RATE, audio_out_sample_rate=RATE, audio_in_channels=AI_IN_CHANNELS, audio_out_channels=AI_OUT_CHANNELS, # 🔧 修复6: 固定blocksize避免PortAudio启动失败 audio_in_blocksize=blocksize_samples, # 20ms帧,兼容16kHz audio_out_blocksize=blocksize_samples, # 配置VAD分析器 vad_analyzer=vad_analyzer ) logger.info(f"🔧 音频流配置: blocksize={blocksize_samples}样本 (20ms帧)") if mic_loopback_mode: logger.info(f"🎙️ 调试模式 - 使用系统默认设备") logger.info(f"🎤 输入: 默认麦克风 ({AI_IN_CHANNELS}ch)") logger.info(f"🔊 输出: 默认扬声器 ({AI_OUT_CHANNELS}ch)") else: logger.info(f"🎤 使用PyAudio输入索引: {transport_input_device} (从设备名称映射)") logger.info(f"🔊 使用PyAudio输出索引: {transport_output_device} (从设备名称映射)") # 🔧 修复6: 安全创建Transport,捕获PortAudio异常 try: transport = LocalAudioTransport(transport_params) logger.info("✅ LocalAudioTransport创建成功") except Exception as e: logger.exception(f"❌ LocalAudioTransport创建失败: {e}") logger.error("可能的原因:") logger.error(" 1. VB-Cable设备被其他程序占用") logger.error(" 2. 音频驱动不支持指定的采样率/通道配置") logger.error(" 3. PortAudio版本兼容性问题") raise RuntimeError(f"音频传输初始化失败: {e}") logger.info("🔄 双向防回声音频配置:" if not mic_loopback_mode else "🎙️ 调试模式音频配置:") if mic_loopback_mode: logger.info(f" 🎤 输入: 系统默认麦克风 ({AI_IN_CHANNELS}ch, {RATE}Hz)") logger.info(f" 🔊 输出: 系统默认扬声器 ({AI_OUT_CHANNELS}ch, {RATE}Hz)") logger.info(f" ⚠️ 注意: 调试模式下可能有回声,请使用耳机") else: logger.info(f" 📥 PyAudio输入索引: {transport_input_device} (传给LocalAudioTransport)") logger.info(f" 📤 PyAudio输出索引: {transport_output_device} (传给LocalAudioTransport)") logger.info(f" 🔍 sounddevice输入索引: {devices['ai_input_device']} (仅用于设备检测)") logger.info(f" 🔍 sounddevice输出索引: {devices['ai_output_device']} (仅用于设备检测)") logger.info(f" 🎤 打开 {devices['ai_input_device_name']} → channels={AI_IN_CHANNELS}, rate={RATE}Hz") logger.info(f" 🔊 打开 {devices['ai_output_device_name']} → channels={AI_OUT_CHANNELS}, rate={RATE}Hz") logger.info(f" 🔊 微信扬声器: sounddevice索引 {devices['wechat_speaker_device']} ({devices.get('wechat_speaker_device_name', 'Cable A Input')})") logger.info(f" 🎤 微信麦克风: sounddevice索引 {devices['wechat_mic_device']} ({devices.get('wechat_mic_device_name', 'Cable B Output')})") # 显示设备名称映射 if devices["ai_input_device_name"]: logger.info(f" 🎤 输入设备名称: {devices['ai_input_device_name']}") if devices["ai_output_device_name"]: logger.info(f" 🔊 输出设备名称: {devices['ai_output_device_name']}") # ⚠️ 重要提示 logger.info("✅ 关键修复: LocalAudioTransport现在正确使用PyAudio索引,sounddevice索引仅用于设备检测") # ✅ 添加详细设备信息 try: import sounddevice as sd all_devices = sd.query_devices() if not mic_loopback_mode and devices['ai_input_device'] and devices['ai_input_device'] < len(all_devices): input_device = all_devices[devices['ai_input_device']] logger.info(f" 🎤 sounddevice输入设备: {input_device['name']} ({input_device['max_input_channels']}ch, {input_device['default_samplerate']:.0f}Hz)") if not mic_loopback_mode and devices['ai_output_device'] and devices['ai_output_device'] < len(all_devices): output_device = all_devices[devices['ai_output_device']] logger.info(f" 🔊 sounddevice输出设备: {output_device['name']} ({output_device['max_output_channels']}ch, {output_device['default_samplerate']:.0f}Hz)") except Exception as e: logger.warning(f"无法获取详细设备信息: {e}") # 显示实际使用的PyAudio设备信息(LocalAudioTransport使用) try: import pyaudio pa = pyaudio.PyAudio() try: if transport_input_device is not None: pa_input_info = pa.get_device_info_by_index(transport_input_device) logger.info(f" 🎤 PyAudio输入设备 (实际使用): {pa_input_info['name']} ({pa_input_info['maxInputChannels']}ch, {pa_input_info['defaultSampleRate']:.0f}Hz)") else: logger.info(f" 🎤 PyAudio输入设备 (实际使用): 使用默认设备") if transport_output_device is not None: pa_output_info = pa.get_device_info_by_index(transport_output_device) logger.info(f" 🔊 PyAudio输出设备 (实际使用): {pa_output_info['name']} ({pa_output_info['maxOutputChannels']}ch, {pa_output_info['defaultSampleRate']:.0f}Hz)") else: logger.info(f" 🔊 PyAudio输出设备 (实际使用): 使用默认设备") finally: pa.terminate() except Exception as e: logger.warning(f"无法获取PyAudio设备信息: {e}") # 3. STT - 语音转文字 (固定使用 DashScope Paraformer-Realtime-V2) try: import config dashscope_key = getattr(config, 'DASHSCOPE_API_KEY', '') dashscope_model = getattr(config, 'DASHSCOPE_STT_MODEL', 'paraformer-realtime-v2') dashscope_url = getattr(config, 'DASHSCOPE_WS_URL', None) if dashscope_key: self.stt_processor = DashScopeParaformerSTTProcessor( api_key=dashscope_key, model=dashscope_model, ws_url=dashscope_url ) stt = self.stt_processor logger.info("✅ 使用阿里云 DashScope Paraformer-Realtime-V2 STT") # 🔧 立即启动预连接,避免第一个音频帧阻塞 logger.info("🔧 启动DashScope预连接,避免音频管道阻塞...") stt.start_preconnection() else: raise ValueError("DashScope API key not configured") except Exception as e: logger.error(f"DashScope STT初始化失败: {e}") raise RuntimeError("语音转文字服务不可用,请配置 DASHSCOPE_API_KEY") # 4. Memory - 对话记忆管理 self.memory_processor = ConversationMemoryProcessor( user_id=self.user_id, role_prompt=self.role_prompt, summary_callback=self.summary_callback, log_to_file=self.log_to_file ) # 5. LLM - 语言模型 llm = OpenAILLMService( api_key=self.api_key, base_url=self.base_url, model=self.chat_model ) # 6. TTS - 文字转语音 logger.info(f"🔧 TTS配置检查: service={self.tts_service}, fish_key={'有效' if self.fish_audio_key else '无效'}, sdk_available={FISH_AUDIO_AVAILABLE}") if self.tts_service == "fish" and self.fish_audio_key: if not FISH_AUDIO_AVAILABLE: logger.warning("⚠️ Fish Audio SDK不可用,回退到OpenAI TTS") tts = OpenAITTSService( api_key=self.api_key, voice="nova" ) logger.info("✅ 使用OpenAI TTS (SDK不可用回退)") else: self.tts_processor = FishAudioTTSProcessor( api_key=self.fish_audio_key, model_id=self.fish_audio_model_id ) tts = self.tts_processor logger.info("✅ 使用Fish Audio流式TTS (首包延迟<150ms)") else: # 回退到OpenAI TTS if self.tts_service == "fish": logger.warning(f"⚠️ Fish Audio密钥无效 ('{self.fish_audio_key}'),回退到OpenAI TTS") else: logger.info(f"🔧 TTS服务设置为 '{self.tts_service}',使用OpenAI TTS") tts = OpenAITTSService( api_key=self.api_key, voice="nova" ) logger.info("✅ 使用OpenAI TTS (回退方案)") # 构建双向防回声管道: # 用户声音: 微信→Cable A→AI程序输入→[VAD集成在Transport中]→RAW监控→Downmix→STT→Memory→LLM→TTS→AI程序输出→Cable B→微信 pipeline_components = [ transport.input(), # 从Cable A Output接收用户声音 (已集成VAD) AudioMonitorProcessor("RAW输入监控"), # 🔧 最早的音频监控,显示原始输入帧 DownmixToMonoProcessor(AI_IN_CHANNELS), # ✅ 智能混音到单声道(仅在多声道时生效) DeviceTestProcessor(), # ✅ 设备测试 stt, # 语音转文字 AudioMonitorProcessor("STT后监控"), # ✅ STT处理后的监控 self.memory_processor, # 对话记忆管理 llm, # 语言模型推理 AudioMonitorProcessor("TTS前监控"), # ✅ TTS前的文本监控 tts, # 文字转语音 AudioMonitorProcessor("TTS后监控"), # ✅ TTS输出监控 transport.output() # 向Cable B Input发送AI声音 ] if vad_analyzer: logger.info(f"Pipeline: Transport(+VAD) → RAW监控 → Downmix({AI_IN_CHANNELS}→1) → 设备测试 → STT → STT后监控 → Memory → LLM → TTS前监控 → TTS → TTS后监控 → Transport") else: logger.info(f"Pipeline: Transport → RAW监控 → Downmix({AI_IN_CHANNELS}→1) → 设备测试 → STT → STT后监控 → Memory → LLM → TTS前监控 → TTS → TTS后监控 → Transport (无VAD)") logger.info(f"✅ 关键修复完成: LocalAudioTransport现在使用正确的PyAudio设备索引") if not mic_loopback_mode: logger.info(f" 输入: {devices['ai_input_device_name']} → PyAudio索引 {transport_input_device}") logger.info(f" 输出: {devices['ai_output_device_name']} → PyAudio索引 {transport_output_device}") pipeline = Pipeline(pipeline_components) return pipeline async def start_conversation(self, initial_message: Optional[str] = None) -> None: """开始流式语音对话""" if self.is_active: logger.warning("Conversation already active") return # 检测微信语音窗口 if not WindowsAudioDeviceDetector.detect_wechat_voice_window(): logger.warning("未检测到微信语音窗口,继续进行对话") try: self.is_active = True logger.info("Starting streaming voice conversation...") # 1. 创建管道和任务 pipeline = self._create_pipeline() task = PipelineTask(pipeline) # 2. 创建PipelineRunner (Windows兼容) self.pipeline_runner = PipelineRunner(handle_sigint=False) # 3. ✅ 简化启动:直接运行管道 # 现在所有处理器都正确声明了 needs_input_queue=True logger.info("Starting pipeline with proper input queue configuration...") # 4. ✅ 初始消息处理建议:不直接通过 queue_frame 发送到音频输入 # 而是等用户开始说话后由 LLM 自然生成回复 if initial_message: if self.memory_processor: # ✅ 异步调用_log方法 await self.memory_processor._log("assistant", initial_message) logger.info(f"Initial message logged to memory: {initial_message}") # 不再直接发送 TextFrame 到管道入口 # 5. 运行管道 await self.pipeline_runner.run(task) except Exception as e: logger.error(f"Error in streaming voice conversation: {e}", exc_info=True) finally: self.is_active = False if self.memory_processor: await self.memory_processor.finalize_conversation() async def stop_conversation(self) -> None: """ 🔧 修复7: 增强的对话停止处理 包含自动总结和状态检测 """ if not self.is_active: return logger.info("Stopping streaming voice conversation...") self.is_active = False try: # 🔧 修复7: 检测对话终止原因并生成总结 if self.memory_processor: # 检查对话时长和内容 conversation_history = getattr(self.memory_processor, 'conversation_history', []) user_messages = [msg for msg in conversation_history if msg.get('role') == 'user'] if len(user_messages) > 0: logger.info(f"📋 对话总结: 共{len(user_messages)}轮用户发言") # 🔧 修复7: 自动生成对话总结 try: await self._generate_conversation_summary() except Exception as e: logger.error(f"生成对话总结失败: {e}") else: logger.info("📋 对话无有效内容,跳过总结") # 检测终止原因 if hasattr(self.memory_processor, 'user_terminated') and self.memory_processor.user_terminated: logger.info("📞 对话结束原因: 用户主动挂断") elif hasattr(self.memory_processor, 'user_rejected') and self.memory_processor.user_rejected: logger.info("📞 对话结束原因: 用户拒绝通话") elif hasattr(self.memory_processor, 'silence_terminated') and self.memory_processor.silence_terminated: logger.info("📞 对话结束原因: 静音超时") else: logger.info("📞 对话结束原因: 系统终止") # ✅ 清理STT处理器连接 if self.stt_processor: try: await self.stt_processor.cleanup() logger.info("STT processor cleanup completed") except Exception as e: logger.error(f"Error during STT processor cleanup: {e}") # ✅ 清理TTS处理器资源 if self.tts_processor: try: await self.tts_processor.cleanup() logger.info("TTS processor cleanup completed") except Exception as e: logger.error(f"Error during TTS processor cleanup: {e}") if self.pipeline_runner: await self.pipeline_runner.stop() except Exception as e: logger.exception(f"停止对话时出现异常: {e}") logger.info(f"✅ 语音对话已完全停止") async def _generate_conversation_summary(self) -> None: """ 🔧 修复7: 生成对话总结 """ try: if not self.memory_processor: return conversation_history = getattr(self.memory_processor, 'conversation_history', []) if len(conversation_history) < 2: # 至少需要一问一答 return # 构建总结提示 conversation_text = "" for msg in conversation_history[-10:]: # 最后10轮对话 role = "用户" if msg.get('role') == 'user' else "助手" content = msg.get('content', '')[:100] # 限制长度 conversation_text += f"{role}: {content}\n" # 简化处理:记录对话基本信息 user_count = len([m for m in conversation_history if m.get('role') == 'user']) assistant_count = len([m for m in conversation_history if m.get('role') == 'assistant']) summary = f"对话包含{user_count}条用户消息和{assistant_count}条助手回复" # 记录总结 if self.memory_processor: await self.memory_processor._log("system", f"对话总结: {summary}") logger.info(f"📋 对话总结: {summary}") except Exception as e: logger.error(f"生成对话总结失败: {e}") def stop_conversation_sync(self, timeout: float = 10.0) -> None: """同步停止对话""" if not self.is_active: return try: # ✅ 修复线程中无事件循环的问题 try: loop = asyncio.get_event_loop() if loop.is_running(): # 如果在异步环境中,创建任务 task = asyncio.create_task(self.stop_conversation()) return # 让调用者await这个task else: # 如果事件循环存在但未运行,直接run asyncio.run(self.stop_conversation()) except RuntimeError as e: if "There is no current event loop" in str(e): # ✅ 在新线程中创建新的事件循环 new_loop = asyncio.new_event_loop() asyncio.set_event_loop(new_loop) try: new_loop.run_until_complete(self.stop_conversation()) finally: new_loop.close() # 清理事件循环引用 asyncio.set_event_loop(None) else: raise except Exception as e: logger.error(f"Error in synchronous conversation stop: {e}") self.is_active = False def set_user_terminated(self) -> None: """标记用户主动终止""" if self.memory_processor: self.memory_processor.set_user_terminated() def set_user_rejected(self) -> None: """标记用户拒绝通话""" if self.memory_processor: self.memory_processor.set_user_rejected() def set_silence_terminated(self) -> None: """标记静音超时终止""" if self.memory_processor: self.memory_processor.set_silence_terminated()

线程包装函数

def start_voice_conversation_thread(
*,
user_id: str,
api_key: str,
base_url: Optional[str] = None,
chat_model: str = "gpt-4o",
role_prompt: Optional[str] = None,
initial_message: Optional[str] = None,
tts_service: str = "fish",
fish_audio_key: Optional[str] = None,
fish_audio_model_id: Optional[str] = None,
input_device_index: Optional[int] = None,
output_device_index: Optional[int] = None,
idle_timeout: int = 120,
summary_callback: Optional[Callable[[str, str, List[dict]], None]] = None,
log_to_file: Optional[str] = None,
**kwargs
) -> tuple[StreamingVoicePipelineHandler, threading.Thread]:
"""
启动流式语音对话线程
固定使用:DashScope STT + OpenAI兼容 LLM + Fish Audio/MiniMax TTS
"""
if not PIPECAT_AVAILABLE:
raise RuntimeError("Required dependencies not available")

text
handler = StreamingVoicePipelineHandler( user_id=user_id, api_key=api_key, base_url=base_url, chat_model=chat_model, role_prompt=role_prompt, tts_service=tts_service, fish_audio_key=fish_audio_key, fish_audio_model_id=fish_audio_model_id, input_device_index=input_device_index, output_device_index=output_device_index, idle_timeout=idle_timeout, summary_callback=summary_callback, log_to_file=log_to_file, ) async def _runner() -> None: """异步运行器""" await handler.start_conversation(initial_message) def _thread_target() -> None: """线程目标函数""" try: asyncio.run(_runner()) except Exception as e: logger.error(f"Streaming voice conversation thread error: {e}", exc_info=True) # ✅ 启动线程时添加名称 thread = threading.Thread( target=_thread_target, name=f"VoiceConv-{user_id}", # 便于调试 daemon=True ) thread.start() return handler, thread

向后兼容的别名

VoicePipelineHandler = StreamingVoicePipelineHandler
VoiceConversationHandler = StreamingVoicePipelineHandler
HybridVoicePipelineHandler = StreamingVoicePipelineHandler

导出

all = [
'StreamingVoicePipelineHandler',
'VoicePipelineHandler', # 向后兼容
'VoiceConversationHandler', # 向后兼容
'HybridVoicePipelineHandler', # 向后兼容
'start_voice_conversation_thread',
'PIPECAT_AVAILABLE',
'FISH_AUDIO_AVAILABLE',
]从各种角度逐行检测我的代码是否存在任何可能发生的问题,导致代码完全无法正常运行,无法检测到麦克风的音量,又不会输出任何有用的调试日志。可是我已经查了很多遍都没有发现问题所以你需要帮我最仔细的侦查。
2025-07-12 14:23:25,720 - INFO - 非好友消息且非语音通话结束消息,已忽略。(msgattr=system)
2025-07-12 14:23:25,720 - INFO - 收到来自聊天窗口 "33" 中用户 "33" 的原始消息 (类型: text, 属性: friend): 开始语音
2025-07-12 14:23:25,725 - INFO - 收到来自监听列表用户 33 的个人私聊消息,准备处理。
2025-07-12 14:23:25,725 - INFO - 最终准备处理消息 from chat "33" by sender "33": 开始语音
2025-07-12 14:23:25,725 - INFO - 用户 33 请求开始语音对话
2025-07-12 14:23:31,256 - INFO - StreamingVoicePipelineHandler initialized for user 33
2025-07-12 14:23:31,257 - INFO - TTS Service: openai
2025-07-12 14:23:31,258 - INFO - Chat Model: gemini-2.5-pro
2025-07-12 14:23:31,258 - INFO - STT Service: DashScope Paraformer-Realtime-V2 (固定)
2025-07-12 14:23:31,259 - INFO - Fish Audio Key: 未设置 (长度: 0)
2025-07-12 14:23:31,259 - INFO - Fish Audio Model ID: 未设置
2025-07-12 14:23:31,262 - INFO - Fish Audio SDK Available: True
2025-07-12 14:23:31,263 - INFO - StreamingVoicePipelineHandler initialized for user 33
2025-07-12 14:23:31,267 - INFO - TTS Service: fish
2025-07-12 14:23:31,268 - INFO - Chat Model: gemini-2.5-pro
2025-07-12 14:23:31,269 - INFO - STT Service: DashScope Paraformer-Realtime-V2 (固定)
2025-07-12 14:23:31,269 - INFO - Fish Audio Key: 已设置 (长度: 32)
2025-07-12 14:23:31,269 - INFO - Fish Audio Model ID: f12a343c566646b986f4b9b66209a6f5
2025-07-12 14:23:31,269 - INFO - Fish Audio SDK Available: True
2025-07-12 14:23:31,273 - INFO - 已为用户 33 启动语音对话
2025-07-12 14:23:31,278 - WARNING - 未检测到微信语音窗口,继续进行对话
2025-07-12 14:23:31,278 - INFO - Starting streaming voice conversation...
2025-07-12 14:23:32,454 - INFO - 收到来自聊天窗口 "33" 中用户 "self" 的原始消息 (类型: text, 属性: self): 正在为您发起语音通话...
--- Logging error ---
2025-07-12 14:23:32,455 - INFO - 非好友消息且非语音通话结束消息,已忽略。(msgattr=self)
--- Logging error ---
2025-07-12 14:23:33,896 - INFO - 已为用户 33 启动语音对话
--- Logging error ---
2025-07-12 14:23:34,235 - INFO - 选择最佳设备: 索引 38, 名称: CABLE-A Input (VB-Audio Cable A), 通道: 2, 采样率: 16000Hz
2025-07-12 14:23:34,235 - INFO - 选择最佳设备: 索引 47, 名称: CABLE-A Output (VB-Audio Cable A), 通道: 1, 采样率: 16000Hz
2025-07-12 14:23:34,236 - INFO - 选择最佳设备: 索引 36, 名称: CABLE-B Input (VB-Audio Cable B), 通道: 2, 采样率: 16000Hz
2025-07-12 14:23:34,237 - INFO - 选择最佳设备: 索引 45, 名称: CABLE-B Output (VB-Audio Cable B), 通道: 1, 采样率: 16000Hz
2025-07-12 14:23:34,237 - INFO - ✅ 检测到完整的双VB-Cable配置 (防回声)
2025-07-12 14:23:35,785 - INFO - 正在查找PyAudio设备: CABLE-A Output (VB-Audio Cable A), 输入设备: True
2025-07-12 14:23:35,785 - INFO - PyAudio设备列表:
2025-07-12 14:23:35,788 - INFO - [00] 🎤 Microsoft 声音映射器 - Input (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,789 - INFO - [01] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,790 - INFO - [02] 🎤 VoiceMeeter Output (VB-Audio Vo (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,791 - INFO - [03] 🎤 CABLE-B Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,792 - INFO - [04] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,792 - INFO - [05] 🎤 CABLE-A Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,793 - INFO - [06] 🎤 CABLE Output (VB-Audio Virtual (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,794 - INFO - [07] 🔊 Microsoft 声音映射器 - Output (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,795 - INFO - [08] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,795 - INFO - [09] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,795 - INFO - [10] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,796 - INFO - [11] 🔊 CABLE-B Input (VB-Audio Cable B (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,796 - INFO - [12] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,797 - INFO - [13] 🔊 CABLE-A Input (VB-Audio Cable A (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,798 - INFO - [14] 🔊 VoiceMeeter Input (VB-Audio Voi (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,799 - INFO - [15] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,799 - INFO - [16] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,799 - INFO - [17] 🎤 主声音捕获驱动程序 (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,799 - INFO - [18] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,799 - INFO - [19] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,800 - INFO - [20] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,801 - INFO - [21] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,802 - INFO - [22] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,803 - INFO - [23] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,803 - INFO - [24] 🔊 主声音驱动程序 (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,804 - INFO - [25] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,804 - INFO - [26] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,805 - INFO - [27] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,805 - INFO - [28] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,805 - INFO - [29] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,806 - INFO - [30] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,807 - INFO - [31] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,808 - INFO - [32] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,808 - INFO - [33] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,809 - INFO - [34] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,809 - INFO - [35] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:35,810 - INFO - [36] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:35,810 - INFO - [37] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,811 - INFO - [38] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:35,812 - INFO - [39] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:35,812 - INFO - [40] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,813 - INFO - [41] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,813 - INFO - [42] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,814 - INFO - [43] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,814 - INFO - [44] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,815 - INFO - [45] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:35,815 - INFO - [46] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,816 - INFO - [47] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:35,816 - INFO - [48] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:1, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,817 - INFO - [49] 🎤 CABLE-A Output (VB-Audio CABLE-A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,818 - INFO - [50] 🔊 Speakers (VB-Audio CABLE-A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,818 - INFO - [51] 🎤 CABLE-B Output (VB-Audio CABLE-B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,819 - INFO - [52] 🔊 Speakers (VB-Audio CABLE-B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,819 - INFO - [53] 🎤 Microphone 1 () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,820 - INFO - [54] 🎤 Microphone 2 () (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:35,820 - INFO - [55] 🔊 Speakers () (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,820 - INFO - [56] 🔊 Speakers 1 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,822 - INFO - [57] 🔊 Speakers 2 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,822 - INFO - [58] 🎤 电脑扬声器 (Realtek HD Audio output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,823 - INFO - [59] 🎤 立体声混音 (Realtek HD Audio Stereo input) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,823 - INFO - [60] 🔊 Headphones 1 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,824 - INFO - [61] 🔊 Headphones 2 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:35,824 - INFO - [62] 🎤 电脑扬声器 (Realtek HD Audio 2nd output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,824 - INFO - [63] 🎤 麦克风 (Realtek HD Audio Mic input) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,824 - INFO - [64] 🔊 Output (BOOMVAD Wave Speaker) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,826 - INFO - [65] 🎤 Input (BOOMVAD Wave Speaker) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,826 - INFO - [66] 🎤 VoiceMeeter Output (VoiceMeeter vaio) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,826 - INFO - [67] 🔊 Speakers (VoiceMeeter vaio) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,826 - INFO - [68] 🎤 Krisp Microphone (Krisp Topology Microphone) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,826 - INFO - [69] 🔊 Communication Speaker (Krisp Topology Microphone) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:35,827 - INFO - [70] 🎤 CABLE Output (VB-Audio Point) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,827 - INFO - [71] 🔊 Speakers (VB-Audio Point) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,828 - INFO - [72] 🎤 Input (Steam Streaming Speakers Wave) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,828 - INFO - [73] 🔊 Speakers (Steam Streaming Speakers Wave) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,828 - INFO - [74] 🎤 麦克风阵列 1 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,829 - INFO - [75] 🎤 麦克风阵列 2 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:35,829 - INFO - [76] 🎤 麦克风阵列 3 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:35,829 - INFO - [77] 🎤 麦克风阵列 4 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:35,829 - INFO - [78] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,829 - INFO - [79] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,829 - INFO - [80] 🎤 Input () (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,830 - INFO - [81] 🔊 Speakers () (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:35,830 - INFO - [82] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:35,831 - INFO - ✅ 找到完全匹配的PyAudio设备: 索引 22, 名称: CABLE-A Output (VB-Audio Cable A)
2025-07-12 14:23:37,732 - INFO - ✅ 设备 CABLE-A Output (VB-Audio Cable A) 支持 1ch/16000Hz
2025-07-12 14:23:37,734 - INFO - ✅ 输入设备验证成功: 1ch
2025-07-12 14:23:39,549 - INFO - 正在查找PyAudio设备: CABLE-B Input (VB-Audio Cable B), 输入设备: False
2025-07-12 14:23:39,549 - INFO - PyAudio设备列表:
2025-07-12 14:23:39,550 - INFO - [00] 🎤 Microsoft 声音映射器 - Input (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,550 - INFO - [01] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,553 - INFO - [02] 🎤 VoiceMeeter Output (VB-Audio Vo (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,553 - INFO - [03] 🎤 CABLE-B Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,554 - INFO - [04] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,554 - INFO - [05] 🎤 CABLE-A Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,555 - INFO - [06] 🎤 CABLE Output (VB-Audio Virtual (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,555 - INFO - [07] 🔊 Microsoft 声音映射器 - Output (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,555 - INFO - [08] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,555 - INFO - [09] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,555 - INFO - [10] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,556 - INFO - [11] 🔊 CABLE-B Input (VB-Audio Cable B (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,557 - INFO - [12] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,558 - INFO - [13] 🔊 CABLE-A Input (VB-Audio Cable A (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,558 - INFO - [14] 🔊 VoiceMeeter Input (VB-Audio Voi (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,558 - INFO - [15] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,560 - INFO - [16] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,560 - INFO - [17] 🎤 主声音捕获驱动程序 (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,561 - INFO - [18] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,561 - INFO - [19] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,561 - INFO - [20] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,561 - INFO - [21] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,563 - INFO - [22] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,563 - INFO - [23] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,563 - INFO - [24] 🔊 主声音驱动程序 (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,565 - INFO - [25] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,565 - INFO - [26] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,566 - INFO - [27] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,566 - INFO - [28] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,566 - INFO - [29] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,568 - INFO - [30] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,569 - INFO - [31] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,569 - INFO - [32] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,570 - INFO - [33] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,570 - INFO - [34] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,570 - INFO - [35] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:39,570 - INFO - [36] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:39,571 - INFO - [37] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,571 - INFO - [38] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:39,571 - INFO - [39] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:39,571 - INFO - [40] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,573 - INFO - [41] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,574 - INFO - [42] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,574 - INFO - [43] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,575 - INFO - [44] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,575 - INFO - [45] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:39,576 - INFO - [46] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,577 - INFO - [47] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:39,577 - INFO - [48] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:1, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,578 - INFO - [49] 🎤 CABLE-A Output (VB-Audio CABLE-A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,578 - INFO - [50] 🔊 Speakers (VB-Audio CABLE-A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,578 - INFO - [51] 🎤 CABLE-B Output (VB-Audio CABLE-B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,580 - INFO - [52] 🔊 Speakers (VB-Audio CABLE-B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,581 - INFO - [53] 🎤 Microphone 1 () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,582 - INFO - [54] 🎤 Microphone 2 () (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:39,584 - INFO - [55] 🔊 Speakers () (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,584 - INFO - [56] 🔊 Speakers 1 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,585 - INFO - [57] 🔊 Speakers 2 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,585 - INFO - [58] 🎤 电脑扬声器 (Realtek HD Audio output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,586 - INFO - [59] 🎤 立体声混音 (Realtek HD Audio Stereo input) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,586 - INFO - [60] 🔊 Headphones 1 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,586 - INFO - [61] 🔊 Headphones 2 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:39,586 - INFO - [62] 🎤 电脑扬声器 (Realtek HD Audio 2nd output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,587 - INFO - [63] 🎤 麦克风 (Realtek HD Audio Mic input) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,587 - INFO - [64] 🔊 Output (BOOMVAD Wave Speaker) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,589 - INFO - [65] 🎤 Input (BOOMVAD Wave Speaker) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,589 - INFO - [66] 🎤 VoiceMeeter Output (VoiceMeeter vaio) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,590 - INFO - [67] 🔊 Speakers (VoiceMeeter vaio) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,590 - INFO - [68] 🎤 Krisp Microphone (Krisp Topology Microphone) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,591 - INFO - [69] 🔊 Communication Speaker (Krisp Topology Microphone) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:39,591 - INFO - [70] 🎤 CABLE Output (VB-Audio Point) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,591 - INFO - [71] 🔊 Speakers (VB-Audio Point) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,591 - INFO - [72] 🎤 Input (Steam Streaming Speakers Wave) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,593 - INFO - [73] 🔊 Speakers (Steam Streaming Speakers Wave) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,593 - INFO - [74] 🎤 麦克风阵列 1 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,594 - INFO - [75] 🎤 麦克风阵列 2 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:39,595 - INFO - [76] 🎤 麦克风阵列 3 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:39,595 - INFO - [77] 🎤 麦克风阵列 4 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:39,595 - INFO - [78] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,597 - INFO - [79] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,598 - INFO - [80] 🎤 Input () (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,599 - INFO - [81] 🔊 Speakers () (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:39,599 - INFO - [82] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:39,600 - INFO - ✅ 找到完全匹配的PyAudio设备: 索引 28, 名称: CABLE-B Input (VB-Audio Cable B)
2025-07-12 14:23:41,223 - INFO - ✅ 设备 CABLE-B Input (VB-Audio Cable B) 支持 1ch/16000Hz
2025-07-12 14:23:41,225 - INFO - ✅ 输出设备验证成功: 1ch
2025-07-12 14:23:41.226 | DEBUG | pipecat.audio.vad.silero:init:111 - Loading Silero VAD model...
2025-07-12 14:23:42.388 | DEBUG | pipecat.audio.vad.silero:init:133 - Loaded Silero VAD
2025-07-12 14:23:42,388 - INFO - ✅ Silero VAD enabled (stop: 0.8s, start: 0.2s, vol: 0.35)
2025-07-12 14:23:43,848 - INFO - 正在查找PyAudio设备: CABLE-A Output (VB-Audio Cable A), 输入设备: True
2025-07-12 14:23:43,849 - INFO - PyAudio设备列表:
2025-07-12 14:23:43,850 - INFO - [00] 🎤 Microsoft 声音映射器 - Input (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,850 - INFO - [01] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,852 - INFO - [02] 🎤 VoiceMeeter Output (VB-Audio Vo (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,852 - INFO - [03] 🎤 CABLE-B Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,853 - INFO - [04] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,853 - INFO - [05] 🎤 CABLE-A Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,853 - INFO - [06] 🎤 CABLE Output (VB-Audio Virtual (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,854 - INFO - [07] 🔊 Microsoft 声音映射器 - Output (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,854 - INFO - [08] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,856 - INFO - [09] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,856 - INFO - [10] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,856 - INFO - [11] 🔊 CABLE-B Input (VB-Audio Cable B (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,857 - INFO - [12] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,857 - INFO - [13] 🔊 CABLE-A Input (VB-Audio Cable A (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,858 - INFO - [14] 🔊 VoiceMeeter Input (VB-Audio Voi (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,859 - INFO - [15] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,859 - INFO - [16] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,859 - INFO - [17] 🎤 主声音捕获驱动程序 (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,860 - INFO - [18] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,861 - INFO - [19] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,861 - INFO - [20] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,862 - INFO - [21] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,862 - INFO - [22] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,863 - INFO - [23] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,863 - INFO - [24] 🔊 主声音驱动程序 (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,863 - INFO - [25] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,864 - INFO - [26] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,865 - INFO - [27] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,865 - INFO - [28] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,865 - INFO - [29] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,867 - INFO - [30] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,867 - INFO - [31] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,867 - INFO - [32] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,869 - INFO - [33] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,869 - INFO - [34] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,870 - INFO - [35] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:43,871 - INFO - [36] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:43,872 - INFO - [37] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,872 - INFO - [38] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:43,873 - INFO - [39] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:43,873 - INFO - [40] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,874 - INFO - [41] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,874 - INFO - [42] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,875 - INFO - [43] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,875 - INFO - [44] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,876 - INFO - [45] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:43,876 - INFO - [46] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,877 - INFO - [47] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:43,878 - INFO - [48] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:1, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,878 - INFO - [49] 🎤 CABLE-A Output (VB-Audio CABLE-A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,879 - INFO - [50] 🔊 Speakers (VB-Audio CABLE-A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,880 - INFO - [51] 🎤 CABLE-B Output (VB-Audio CABLE-B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,880 - INFO - [52] 🔊 Speakers (VB-Audio CABLE-B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,880 - INFO - [53] 🎤 Microphone 1 () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,881 - INFO - [54] 🎤 Microphone 2 () (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:43,881 - INFO - [55] 🔊 Speakers () (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,883 - INFO - [56] 🔊 Speakers 1 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,883 - INFO - [57] 🔊 Speakers 2 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,884 - INFO - [58] 🎤 电脑扬声器 (Realtek HD Audio output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,884 - INFO - [59] 🎤 立体声混音 (Realtek HD Audio Stereo input) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,885 - INFO - [60] 🔊 Headphones 1 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,885 - INFO - [61] 🔊 Headphones 2 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:43,886 - INFO - [62] 🎤 电脑扬声器 (Realtek HD Audio 2nd output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,887 - INFO - [63] 🎤 麦克风 (Realtek HD Audio Mic input) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,887 - INFO - [64] 🔊 Output (BOOMVAD Wave Speaker) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,888 - INFO - [65] 🎤 Input (BOOMVAD Wave Speaker) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,888 - INFO - [66] 🎤 VoiceMeeter Output (VoiceMeeter vaio) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,889 - INFO - [67] 🔊 Speakers (VoiceMeeter vaio) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,889 - INFO - [68] 🎤 Krisp Microphone (Krisp Topology Microphone) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,890 - INFO - [69] 🔊 Communication Speaker (Krisp Topology Microphone) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:43,890 - INFO - [70] 🎤 CABLE Output (VB-Audio Point) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,891 - INFO - [71] 🔊 Speakers (VB-Audio Point) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,891 - INFO - [72] 🎤 Input (Steam Streaming Speakers Wave) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,892 - INFO - [73] 🔊 Speakers (Steam Streaming Speakers Wave) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,893 - INFO - [74] 🎤 麦克风阵列 1 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,893 - INFO - [75] 🎤 麦克风阵列 2 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:43,894 - INFO - [76] 🎤 麦克风阵列 3 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:43,894 - INFO - [77] 🎤 麦克风阵列 4 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:43,894 - INFO - [78] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,895 - INFO - [79] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,895 - INFO - [80] 🎤 Input () (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,897 - INFO - [81] 🔊 Speakers () (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:43,897 - INFO - [82] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:43,898 - INFO - ✅ 找到完全匹配的PyAudio设备: 索引 22, 名称: CABLE-A Output (VB-Audio Cable A)
2025-07-12 14:23:45,172 - INFO - 正在查找PyAudio设备: CABLE-B Input (VB-Audio Cable B), 输入设备: False
2025-07-12 14:23:45,173 - INFO - PyAudio设备列表:
2025-07-12 14:23:45,173 - INFO - [00] 🎤 Microsoft 声音映射器 - Input (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,175 - INFO - [01] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,176 - INFO - [02] 🎤 VoiceMeeter Output (VB-Audio Vo (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,176 - INFO - [03] 🎤 CABLE-B Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,176 - INFO - [04] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,176 - INFO - [05] 🎤 CABLE-A Output (VB-Audio Cable (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,177 - INFO - [06] 🎤 CABLE Output (VB-Audio Virtual (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,177 - INFO - [07] 🔊 Microsoft 声音映射器 - Output (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,179 - INFO - [08] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,179 - INFO - [09] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,180 - INFO - [10] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,180 - INFO - [11] 🔊 CABLE-B Input (VB-Audio Cable B (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,180 - INFO - [12] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,181 - INFO - [13] 🔊 CABLE-A Input (VB-Audio Cable A (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,181 - INFO - [14] 🔊 VoiceMeeter Input (VB-Audio Voi (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,181 - INFO - [15] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,182 - INFO - [16] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,183 - INFO - [17] 🎤 主声音捕获驱动程序 (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,183 - INFO - [18] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:4, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,184 - INFO - [19] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,184 - INFO - [20] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,185 - INFO - [21] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,185 - INFO - [22] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,186 - INFO - [23] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,186 - INFO - [24] 🔊 主声音驱动程序 (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,186 - INFO - [25] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,187 - INFO - [26] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,187 - INFO - [27] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,187 - INFO - [28] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,188 - INFO - [29] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,188 - INFO - [30] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,189 - INFO - [31] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,189 - INFO - [32] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,191 - INFO - [33] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,191 - INFO - [34] 🔊 扬声器 (Boom Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,192 - INFO - [35] 🔊 扬声器 (Steam Streaming Speakers) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:45,192 - INFO - [36] 🔊 CABLE-B Input (VB-Audio Cable B) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:45,193 - INFO - [37] 🔊 Speaker (2- Realtek(R) Audio) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,193 - INFO - [38] 🔊 CABLE-A Input (VB-Audio Cable A) (输入:0, 输出:2, 采样率:16000Hz)
2025-07-12 14:23:45,193 - INFO - [39] 🔊 VoiceMeeter Input (VB-Audio VoiceMeeter VAIO) (输入:0, 输出:8, 采样率:48000Hz)
2025-07-12 14:23:45,193 - INFO - [40] 🔊 扬声器 (FxSound Audio Enhancer) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,193 - INFO - [41] 🔊 扬声器 (OMEN Cam & Voice) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,195 - INFO - [42] 🔊 扬声器 (VB-Audio Virtual Cable) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,195 - INFO - [43] 🎤 麦克风阵列 (2- 适用于数字麦克风的英特尔® 智音技术) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,195 - INFO - [44] 🎤 VoiceMeeter Output (VB-Audio VoiceMeeter VAIO) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,196 - INFO - [45] 🎤 CABLE-B Output (VB-Audio Cable B) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:45,196 - INFO - [46] 🎤 麦克风 (OMEN Cam & Voice) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,196 - INFO - [47] 🎤 CABLE-A Output (VB-Audio Cable A) (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:45,198 - INFO - [48] 🎤 CABLE Output (VB-Audio Virtual Cable) (输入:1, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,198 - INFO - [49] 🎤 CABLE-A Output (VB-Audio CABLE-A) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,199 - INFO - [50] 🔊 Speakers (VB-Audio CABLE-A) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,199 - INFO - [51] 🎤 CABLE-B Output (VB-Audio CABLE-B) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,200 - INFO - [52] 🔊 Speakers (VB-Audio CABLE-B) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,200 - INFO - [53] 🎤 Microphone 1 () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,200 - INFO - [54] 🎤 Microphone 2 () (输入:1, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:45,201 - INFO - [55] 🔊 Speakers () (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,201 - INFO - [56] 🔊 Speakers 1 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,202 - INFO - [57] 🔊 Speakers 2 (Realtek HD Audio output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,203 - INFO - [58] 🎤 电脑扬声器 (Realtek HD Audio output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,203 - INFO - [59] 🎤 立体声混音 (Realtek HD Audio Stereo input) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,203 - INFO - [60] 🔊 Headphones 1 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,203 - INFO - [61] 🔊 Headphones 2 (Realtek HD Audio 2nd output with SST) (输入:0, 输出:2, 采样率:44100Hz)
2025-07-12 14:23:45,204 - INFO - [62] 🎤 电脑扬声器 (Realtek HD Audio 2nd output with SST) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,204 - INFO - [63] 🎤 麦克风 (Realtek HD Audio Mic input) (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,205 - INFO - [64] 🔊 Output (BOOMVAD Wave Speaker) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,205 - INFO - [65] 🎤 Input (BOOMVAD Wave Speaker) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,205 - INFO - [66] 🎤 VoiceMeeter Output (VoiceMeeter vaio) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,207 - INFO - [67] 🔊 Speakers (VoiceMeeter vaio) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,207 - INFO - [68] 🎤 Krisp Microphone (Krisp Topology Microphone) (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,208 - INFO - [69] 🔊 Communication Speaker (Krisp Topology Microphone) (输入:0, 输出:2, 采样率:48000Hz)
2025-07-12 14:23:45,208 - INFO - [70] 🎤 CABLE Output (VB-Audio Point) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,208 - INFO - [71] 🔊 Speakers (VB-Audio Point) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,209 - INFO - [72] 🎤 Input (Steam Streaming Speakers Wave) (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,209 - INFO - [73] 🔊 Speakers (Steam Streaming Speakers Wave) (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,210 - INFO - [74] 🎤 麦克风阵列 1 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,210 - INFO - [75] 🎤 麦克风阵列 2 () (输入:2, 输出:0, 采样率:48000Hz)
2025-07-12 14:23:45,211 - INFO - [76] 🎤 麦克风阵列 3 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:45,211 - INFO - [77] 🎤 麦克风阵列 4 () (输入:4, 输出:0, 采样率:16000Hz)
2025-07-12 14:23:45,211 - INFO - [78] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,211 - INFO - [79] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,212 - INFO - [80] 🎤 Input () (输入:8, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,212 - INFO - [81] 🔊 Speakers () (输入:0, 输出:8, 采样率:44100Hz)
2025-07-12 14:23:45,213 - INFO - [82] 🎤 Input () (输入:2, 输出:0, 采样率:44100Hz)
2025-07-12 14:23:45,213 - INFO - ✅ 找到完全匹配的PyAudio设备: 索引 28, 名称: CABLE-B Input (VB-Audio Cable B)
2025-07-12 14:23:45,215 - INFO - 🔊 正常模式Transport配置 (修复后):
2025-07-12 14:23:45,215 - INFO - 输入设备: PyAudio索引 22 (从设备名 'CABLE-A Output (VB-Audio Cable A)')
2025-07-12 14:23:45,215 - INFO - 输出设备: PyAudio索引 28 (从设备名 'CABLE-B Input (VB-Audio Cable B)')
2025-07-12 14:23:45,216 - INFO - 🔍 关键修复: LocalAudioTransport现在使用PyAudio索引而非sounddevice索引
2025-07-12 14:23:45,216 - INFO - 🎤 录音端最终配置: PyAudio索引=22, 通道=1ch, 采样率=16000Hz
2025-07-12 14:23:45,217 - INFO - 🔊 播放端最终配置: PyAudio索引=28, 通道=1ch, 采样率=16000Hz
2025-07-12 14:23:45,217 - INFO - 🔧 音频流配置: blocksize=320样本 (20ms帧)
2025-07-12 14:23:45,217 - INFO - 🎤 使用PyAudio输入索引: 22 (从设备名称映射)
2025-07-12 14:23:45,217 - INFO - 🔊 使用PyAudio输出索引: 28 (从设备名称映射)
2025-07-12 14:23:46,527 - INFO - ✅ LocalAudioTransport创建成功
2025-07-12 14:23:46,527 - INFO - 🔄 双向防回声音频配置:
2025-07-12 14:23:46,527 - INFO - 📥 PyAudio输入索引: 22 (传给LocalAudioTransport)
2025-07-12 14:23:46,529 - INFO - 📤 PyAudio输出索引: 28 (传给LocalAudioTransport)
2025-07-12 14:23:46,529 - INFO - 🔍 sounddevice输入索引: 47 (仅用于设备检测)
2025-07-12 14:23:46,529 - INFO - 🔍 sounddevice输出索引: 36 (仅用于设备检测)
2025-07-12 14:23:46,530 - INFO - 🎤 打开 CABLE-A Output (VB-Audio Cable A) → channels=1, rate=16000Hz
2025-07-12 14:23:46,530 - INFO - 🔊 打开 CABLE-B Input (VB-Audio Cable B) → channels=1, rate=16000Hz
2025-07-12 14:23:46,530 - INFO - 🔊 微信扬声器: sounddevice索引 38 (CABLE-A Input (VB-Audio Cable A))
2025-07-12 14:23:46,530 - INFO - 🎤 微信麦克风: sounddevice索引 45 (CABLE-B Output (VB-Audio Cable B))
2025-07-12 14:23:46,531 - INFO - 🎤 输入设备名称: CABLE-A Output (VB-Audio Cable A)
2025-07-12 14:23:46,531 - INFO - 🔊 输出设备名称: CABLE-B Input (VB-Audio Cable B)
2025-07-12 14:23:46,532 - INFO - ✅ 关键修复: LocalAudioTransport现在正确使用PyAudio索引,sounddevice索引仅用于设备检测
2025-07-12 14:23:46,533 - INFO - 🎤 sounddevice输入设备: CABLE-A Output (VB-Audio Cable A) (1ch, 16000Hz)
2025-07-12 14:23:46,533 - INFO - 🔊 sounddevice输出设备: CABLE-B Input (VB-Audio Cable B) (2ch, 16000Hz)
2025-07-12 14:23:46,534 - INFO - 🎤 PyAudio输入设备 (实际使用): CABLE-A Output (VB-Audio Cable A) (8ch, 44100Hz)
2025-07-12 14:23:46,535 - INFO - 🔊 PyAudio输出设备 (实际使用): CABLE-B Input (VB-Audio Cable B) (8ch, 44100Hz)
2025-07-12 14:23:46,535 - INFO - DashScope Paraformer STT processor initialized with model: paraformer-realtime-v2
2025-07-12 14:23:46,535 - INFO - ✅ 使用阿里云 DashScope Paraformer-Realtime-V2 STT
2025-07-12 14:23:46,537 - INFO - 🔧 启动DashScope预连接,避免音频管道阻塞...
2025-07-12 14:23:46,537 - INFO - 🔧 DashScope预连接任务已启动,在后台建立WebSocket连接
2025-07-12 14:23:46,537 - INFO - ConversationMemoryProcessor initialized for user 33
2025-07-12 14:23:50,614 - INFO - 🔧 TTS配置检查: service=fish, fish_key=有效, sdk_available=True
2025-07-12 14:23:58,588 - INFO - Fish Audio TTS processor initialized with model: f12a343c566646b986f4b9b66209a6f5 (streaming mode)
2025-07-12 14:23:58,589 - INFO - ✅ 使用Fish Audio流式TTS (首包延迟<150ms)
2025-07-12 14:23:58,591 - INFO - AudioMonitorProcessor 'RAW输入监控' initialized
2025-07-12 14:23:58,591 - INFO - DownmixToMono初始化: 输入已是单声道,直接透传
2025-07-12 14:23:58,591 - INFO - DeviceTestProcessor initialized - 开始设备检测
2025-07-12 14:23:58,591 - INFO - AudioMonitorProcessor 'STT后监控' initialized
2025-07-12 14:23:58,591 - INFO - AudioMonitorProcessor 'TTS前监控' initialized
2025-07-12 14:23:58,591 - INFO - AudioMonitorProcessor 'TTS后监控' initialized
2025-07-12 14:23:58,593 - INFO - Pipeline: Transport(+VAD) → RAW监控 → Downmix(1→1) → 设备测试 → STT → STT后监控 → Memory → LLM → TTS前监控 → TTS → TTS后监控 → Transport
2025-07-12 14:23:58,594 - INFO - ✅ 关键修复完成: LocalAudioTransport现在使用正确的PyAudio设备索引
2025-07-12 14:23:58,594 - INFO - 输入: CABLE-A Output (VB-Audio Cable A) → PyAudio索引 22
2025-07-12 14:23:58,596 - INFO - 输出: CABLE-B Input (VB-Audio Cable B) → PyAudio索引 28
2025-07-12 14:23:58.596 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking PipelineSource#0 -> LocalAudioInputTransport#0
2025-07-12 14:23:58.597 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking LocalAudioInputTransport#0 -> RAW输入监控
2025-07-12 14:23:58.597 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking RAW输入监控 -> DownmixToMono
2025-07-12 14:23:58.598 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking DownmixToMono -> DeviceTest
2025-07-12 14:23:58.598 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking DeviceTest -> DashScopeParaformerSTTProcessor#0
2025-07-12 14:23:58.598 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking DashScopeParaformerSTTProcessor#0 -> STT后监控
2025-07-12 14:23:58.598 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking STT后监控 -> ConversationMemoryProcessor#0
2025-07-12 14:23:58.600 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking ConversationMemoryProcessor#0 -> OpenAILLMService#0
2025-07-12 14:23:58.600 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking OpenAILLMService#0 -> TTS前监控
2025-07-12 14:23:58.601 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking TTS前监控 -> FishAudioTTSProcessor#0
2025-07-12 14:23:58.602 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking FishAudioTTSProcessor#0 -> TTS后监控
2025-07-12 14:23:58.602 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking TTS后监控 -> LocalAudioOutputTransport#0
2025-07-12 14:23:58.603 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking LocalAudioOutputTransport#0 -> PipelineSink#0
2025-07-12 14:23:58.603 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking PipelineTaskSource#0 -> Pipeline#0
2025-07-12 14:23:58.603 | DEBUG | pipecat.processors.frame_processor:link:177 - Linking Pipeline#0 -> PipelineTaskSink#0
2025-07-12 14:23:58,603 - INFO - Starting pipeline with proper input queue configuration...
2025-07-12 14:23:58,605 - INFO - [assistant] 你好!我们开始语音对话吧。
2025-07-12 14:23:58,658 - INFO - 🔧 [DashScope连接] 开始WebSocket连接和认证流程...
2025-07-12 14:23:58,758 - INFO - Initial message logged to memory: 你好!我们开始语音对话吧。
2025-07-12 14:23:58.758 | DEBUG | pipecat.pipeline.runner:run:39 - Runner PipelineRunner#0 started running PipelineTask#0
2025-07-12 14:23:58,851 - INFO - 🎯 [RAW输入监控] 首次收到AudioRawFrame - 音频管道已激活
2025-07-12 14:23:59,410 - INFO - 🔧 [DashScope连接] WebSocket连接建立成功
2025-07-12 14:23:59,411 - INFO - 🔧 [DashScope连接] 发送run-task指令,task_id: 7fe2d5e1b702466c86974df235e974ed
2025-07-12 14:23:59,411 - INFO - 🔧 [DashScope连接] 等待task-started事件...
2025-07-12 14:23:59,466 - INFO - 🔧 [DashScope连接] 收到task-started事件,连接就绪
2025-07-12 14:23:59,467 - INFO - ✅ [DashScope连接] WebSocket连接完全就绪,可接收音频流
2025-07-12 14:24:01,807 - ERROR - 记忆管理异常: 'utf-8' codec can't decode byte 0xa9 in position 3069: invalid start byte

Pensamiento

Analyzing audio data processing

It seems RawInputMonitor counts frames, but only computes volume if the input is a numpy array, not if it's bytes. When the data’s bytes, it logs its length but doesn’t compute the volume unless a time interval passes. Meanwhile, DeviceTestProcessor only works with numpy arrays, so if audio is in bytes, it might not process the volume correctly. The audio data might need to be converted to ndarray format before it’s processed by the downmix and other functions. I’ll need to clarify the data conversion steps to ensure volume detection works properly.

Analyzing byte-to-ndarray conversion for volume detection

It seems the audio bytes, when processed, aren't being converted to ndarray before passing through volume calculation, leading AudioMonitor to miss the volume. The solution might be converting bytes to numpy arrays early on to allow for proper volume detection. Additionally, DeviceTest expects ndarray for volume computation but only gets bytes. The real issue stems from LocalAudioTransport returning bytes instead of numpy arrays, causing these steps to miss necessary processing for volume detection. A preprocessing step might help fix this.

Thought for 22 seconds

Respuesta

下面把「听不到麦克风音量、监控日志几乎空白」这两个核心症状,沿着音频数据流向日志链路逐行排查,指出会导致问题的关键逻辑断点,并给出能立即验证的修补点。为了便于对照,我引用片段用 行号/方法名 标注,而不重复整段代码。


1. 音频帧数据类型在链路中多次“退化”成 bytes

环节预期类型实际可能类型影响
LocalAudioInputTransport 产出np.ndarray[int16]bytes后续处理器无法解析声道/幅度
DownmixToMonoProcessor(单声道分支)(应保持 np.ndarray仍是 bytes(因为早退)既没 down-mix 也没类型转换
AudioMonitorProcessor & DeviceTestProcessor依赖 np.ndarray 计算 RMS接到 bytesRMS=0 → volume%始终 0,设备测试永不通过

立即可复现的现象

  • log 中只出现
    🎯 [RAW输入监控] 首次收到AudioRawFrame - 音频管道已激活
    
    后续 1 s-日志里没有 音量: x% |██░░░... 的条形图。
  • DeviceTestProcessor 不打印 ✅ 音频输入设备工作正常

快速修补思路

  1. 在最源头就规范类型
    LocalAudioInputTransport 把回调得到的 in_data 立即

    python
    audio_np = np.frombuffer(in_data, dtype=np.int16) yield AudioRawFrame(audio=audio_np, sample_rate=..., num_channels=...)

    这样后面所有处理器都拿到 ndarray,监控才能算 RMS。

  2. 防御式转换 – 在监控 / DeviceTest 内部兜底:

    python
    if isinstance(audio_data, (bytes, bytearray)): audio_data = np.frombuffer(audio_data, dtype=np.int16)

    使它们对残余的 bytes 也能工作。

  3. DownmixToMonoProcessor 单声道分支也要保证 ndarray
    目前早退逻辑:

    python
    if self.input_channels <= 1: return frame # 直接返回,audio 仍可能是 bytes

    应改为:

    python
    if self.input_channels <= 1 and isinstance(frame.audio, (bytes, bytearray)): frame.audio = np.frombuffer(frame.audio, dtype=np.int16) return frame

2. 日志链路的隐患

2-1 根日志配置

代码只 logger = logging.getLogger("VoicePipeline"),若上层没有
logging.basicConfig(level=logging.INFO, ... ),不同线程/子模块 log 可能被截断或落到 root logger 的默认 WARNING 级别

你的运行日志能看到 INFO,说明 elsewhere 配置过;但 线程中的异步任务(如 _blocking_tts_worker)如果在模块 import 之前创建,就会丢失 handler,导致 “--- Logging error ---” 样式的栈追踪。
建议:在最顶层入口(而不是库代码里)一次性:

python
logging.basicConfig( level=os.getenv("LOG_LEVEL", "INFO"), format="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d - %(message)s", )

2-2 ConversationMemoryProcessor._write_to_file_async

错误:

'utf-8' codec can't decode byte 0xa9 in position 3069

发生在 读取历史文件aiofiles 将已有内容解码为 str 时。多数情况下是 之前写入了 raw bytes(例如 STT 返回、或某处把 AudioFrame 误写进去)。
防线

python
if not isinstance(content, str): content = repr(content) # 或 base64.b64encode(...)

并考虑把打开模式改成 'a', errors='ignore',避免整条链路因一次异常中断。


3. 设备参数与实际采样率不符

  • 你把 录音/播放 强制在 LocalAudioTransportParams 里写死 sample_rate=16000
    而你在日志里看到 PyAudio 设备默认 44100 Hz,且 22 / 28 两个索引都标 8ch, 44100Hz
    在 Windows/WASAPI 下,PortAudio 会悄悄做 resample,但常见 bug 是 流创建成功却只输出静音
    排查:用 pa.is_format_supported(rate=16000, device_index=22, ... ) 再三确认;若不支持,按 44100 采并在 Downmix/前处理里统一重采样到 16 kHz。

4. 阈值与时间窗

  • DeviceTestProcessor._probe_channels_for_recording 回落到 2 声道;可是你最终把 AI_IN_CHANNELS 改成 1 声道。
    如果实际音频帧带着 2 ch,再到 Downmix 时早退不处理,DeviceTest 里用 len(audio_data) 判断却得到 2×samples → RMS 更低,可能 < 100。
  • AudioMonitorProcessor 每 1 s 聚合一次 volume_samples,但 silence 检测 只看 avg_volume > 0.1
    如果 np.frombuffer(..., int16) 后 RMS 大约是 几百到几千,你可直接把 % 计算
    python
    volume_percent = volume_rms / 327.67 # 目标 0-100
    现在你的 volume_percent = min(100, (volume_rms/32767)*100),语音 RMS ≈1500 → 4.5 %,日志条太短看不出来;可以加一个线性/对数缩放。

5. 线程 & 事件循环交互

  • _blocking_tts_worker 里用 asyncio.run_coroutine_threadsafe(..., asyncio.get_event_loop())
    如果在 默认线程池 中调用,而当前协程 loop 早已关闭(对话停止),会 raise RuntimeError: Event loop is closed,但你只打印日志,不把异常传回主协程;
    于是 TTS 音频永远不推送,而 Downstream 的 Monitor 也就收不到输出帧。
    修补:捕获该情况后 return,并在主协程侧检查队列超时 < 10 s 就报警。

快速验证 checklist

  1. LocalAudioInputTransport 回调里 打印首帧 dtype/shape
  2. 打开麦克风说话,确认 RAW输入监控 行里出现
    音量: xx.x% |██████░░░...
    
  3. DeviceTestProcessor 应在 2-3 s 内打印
    ✅ 音频输入设备工作正常
    
  4. 日志不再出现 'utf-8' codec can't decode 异常。

只要 1-3 全通过,麦克风音量与调试日志链路就算“打通”了。剩下的异常再按 4-5 条逐步压榨即可。祝调试顺利!

Compartir este Q&A