Stabilize speech audio capture

- Keep Web Audio source and processor nodes in refs so microphone processing is not lost during speech recognition.
- Explicitly resume the AudioContext before sending PCM frames to the speech WebSocket proxy.
- Disconnect audio nodes and clear speech state when listening stops or the WebSocket closes.
- Show a clear prompt when audio frames were captured but no recognizable text was returned.
- Update progress documentation for the speech capture hardening.
2026-05-02 06:20:48 +08:00
parent 5d936832da
commit 87ab7d4b9c
2 changed files with 39 additions and 1 deletions
--- a/docs/progress.md
+++ b/docs/progress.md
@@ -93,3 +93,4 @@
 | 2026-05-02 | 模板 HTML 导出包补充模板字段和字段管理设置，导入时恢复字段库元数据。 |
 | 2026-05-02 | 修复报告编辑器加载已有 AI 区域后下拉栏初始显示“无可用 AI 区域”的问题。 |
 | 2026-05-02 | 调整抽帧百分比为两位小数保序保存；自动截图按时间顺序执行，自动插入按配置顺序执行。 |
+| 2026-05-02 | 加固报告编辑器语音采集，保留 Web Audio 节点引用、显式恢复 AudioContext，并在无识别文本时给出提示。 |
--- a/src/pages/ReportEditor.tsx
+++ b/src/pages/ReportEditor.tsx
@@ -105,6 +105,11 @@ export default function ReportEditor() {
  const xfWsRef = useRef<WebSocket | null>(null);
  const xfAudioContextRef = useRef<AudioContext | null>(null);
  const xfMediaStreamRef = useRef<MediaStream | null>(null);
+  const xfAudioSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
+  const xfAudioProcessorRef = useRef<ScriptProcessorNode | null>(null);
+  const xfSpeechFrameCountRef = useRef(0);
+  const xfSpeechTextReceivedRef = useRef(false);
+  const xfSpeechUserStoppedRef = useRef(false);
  const [quickPrompts, setQuickPrompts] = useState<string[]>([
    '请完善报告内容', '请对内容做如下修改：'
  ]);
@@ -1143,6 +1148,17 @@ export default function ReportEditor() {
  const toggleListening = async () => {
    // 专门提取一个彻底关闭物理麦克风的函数
    const stopMicrophone = () => {
+      if (xfAudioProcessorRef.current) {
+        try {
+          xfAudioProcessorRef.current.onaudioprocess = null;
+          xfAudioProcessorRef.current.disconnect();
+        } catch {}
+        xfAudioProcessorRef.current = null;
+      }
+      if (xfAudioSourceRef.current) {
+        try { xfAudioSourceRef.current.disconnect(); } catch {}
+        xfAudioSourceRef.current = null;
+      }
      if (xfAudioContextRef.current) {
        try { xfAudioContextRef.current.close(); } catch {}
        xfAudioContextRef.current = null;
@@ -1155,6 +1171,7 @@ export default function ReportEditor() {

    if (isListening) {
      setIsListening(false);
+      xfSpeechUserStoppedRef.current = true;
      stopMicrophone();

      if (xfWsRef.current && xfWsRef.current.readyState === WebSocket.OPEN) {
@@ -1182,6 +1199,9 @@ export default function ReportEditor() {

      const ws = new WebSocket(getSpeechIatWebSocketUrl());
      xfWsRef.current = ws;
+      xfSpeechFrameCountRef.current = 0;
+      xfSpeechTextReceivedRef.current = false;
+      xfSpeechUserStoppedRef.current = false;
      let frameStatus = 0;

      ws.onopen = async () => {
@@ -1190,8 +1210,13 @@ export default function ReportEditor() {
          xfMediaStreamRef.current = stream;
          const audioContext = new AudioContextClass({ sampleRate: 16000 });
          xfAudioContextRef.current = audioContext;
+          if (audioContext.state === 'suspended') {
+            await audioContext.resume();
+          }
          const source = audioContext.createMediaStreamSource(stream);
          const processor = audioContext.createScriptProcessor(4096, 1, 1);
+          xfAudioSourceRef.current = source;
+          xfAudioProcessorRef.current = processor;

          processor.onaudioprocess = (e) => {
            if (ws.readyState !== WebSocket.OPEN || !xfAudioContextRef.current) return;
@@ -1200,6 +1225,7 @@ export default function ReportEditor() {
            const base64Audio = arrayBufferToBase64(pcmBuffer);
            const frame: any = { data: { status: frameStatus, format: 'audio/L16;rate=16000', encoding: 'raw', audio: base64Audio } };
            ws.send(JSON.stringify(frame));
+            xfSpeechFrameCountRef.current += 1;
            frameStatus = 1;
          };

@@ -1227,6 +1253,7 @@ export default function ReportEditor() {
            let seg = '';
            for (const w of jsonData.data.result.ws) { if (w.cw?.[0]?.w) seg += w.cw[0].w; }
            if (seg) {
+              xfSpeechTextReceivedRef.current = true;
              setChatInput(prev => prev + seg);
            }
          }
@@ -1240,7 +1267,17 @@ export default function ReportEditor() {
      };

      ws.onerror = () => { alert('讯飞语音连接失败，请确认已登录且超级管理员已配置语音参数'); setIsListening(false); stopMicrophone(); };
-      ws.onclose = () => { setIsListening(false); stopMicrophone(); };
+      ws.onclose = () => {
+        const shouldExplainNoText = xfSpeechUserStoppedRef.current
+          && xfSpeechFrameCountRef.current > 0
+          && !xfSpeechTextReceivedRef.current;
+        setIsListening(false);
+        stopMicrophone();
+        xfWsRef.current = null;
+        if (shouldExplainNoText) {
+          alert('语音听写已结束，但讯飞没有返回可用文字。请确认麦克风输入音量正常，并尽量使用普通话靠近麦克风重试。');
+        }
+      };
    } catch (e: any) {
      alert('讯飞语音初始化失败: ' + e.message);
    }