Send speech audio as normalized PCM frames

- Resample microphone input from the actual browser AudioContext sample rate to 16 kHz before sending it to the speech proxy.
- Encode speech input as 16-bit PCM and send it in small 1280-byte frames instead of relying on the browser to create a 16 kHz audio context.
- Flush pending audio before sending the standard Xunfei IAT end frame.
- Extract PCM helpers and cover downsampling, PCM encoding, base64 conversion, and byte concatenation with unit tests.
- Update report editor, testing, and progress documentation for the corrected speech audio pipeline.
2026-05-02 06:30:56 +08:00
parent 87ab7d4b9c
commit 13d8853532
6 changed files with 128 additions and 26 deletions
--- a/docs/modules/report-editor.md
+++ b/docs/modules/report-editor.md
@@ -86,7 +86,7 @@ AI 面板支持两种模式：
 讯飞听写通过后端 WebSocket 代理：

 - 前端连接 `/api/speech/iat`，不再生成讯飞鉴权 URL，也不读取 APPID/APIKey/APISecret。
- 浏览器采集麦克风音频，转换为 16k PCM 后发送音频帧。
+- 浏览器采集麦克风音频后按实际 `AudioContext.sampleRate` 重采样为 16k、16bit、单声道 PCM，并按小帧发送音频帧。
 - 启动前会检查浏览器是否支持 `navigator.mediaDevices.getUserMedia` 和 `AudioContext`；如果不是 `localhost` 或 HTTPS 等安全上下文，浏览器会禁止麦克风能力。Docker 演示环境可使用 `https://localhost:4443`，局域网普通 HTTP 只能通过 Chrome/Edge 演示启动参数临时标记为可信来源。
 - 后端读取 Settings API 中的 `xfSpeechConfig`，连接讯飞 IAT，上游首帧由后端补齐 `common.app_id` 和默认 `business` 参数。
 - 识别结果由后端转发回前端，并追加到 AI 输入框。
--- a/docs/progress.md
+++ b/docs/progress.md
@@ -94,3 +94,4 @@
 | 2026-05-02 | 修复报告编辑器加载已有 AI 区域后下拉栏初始显示“无可用 AI 区域”的问题。 |
 | 2026-05-02 | 调整抽帧百分比为两位小数保序保存；自动截图按时间顺序执行，自动插入按配置顺序执行。 |
 | 2026-05-02 | 加固报告编辑器语音采集，保留 Web Audio 节点引用、显式恢复 AudioContext，并在无识别文本时给出提示。 |
+| 2026-05-02 | 对齐讯飞 IAT 音频帧协议，前端按实际采样率重采样到 16k PCM、按小帧发送，并使用标准结束帧。 |
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -21,6 +21,7 @@ npm run build
 | 前端 Dashboard API | 工作台统计封装会请求 `/api/dashboard/stats` 并校验响应结构。 |
 | 前端审计 API | 审计日志列表封装会请求 `/api/audit-logs` 并校验响应结构。 |
 | 前端语音代理地址 | 根据当前页面来源或 `VITE_API_BASE_URL` 生成 `/api/speech/iat` WebSocket 地址。 |
+| 前端语音 PCM 处理 | 麦克风浮点音频会按实际采样率重采样为 16k、编码为 16bit PCM，并按小帧拼接发送给讯飞代理。 |
 | 前端字段库和文件 API | 字段库读取/更新、通用文件列表/上传封装。 |
 | Auth 兼容映射 | 后端 `doctor` 角色会映射为当前前端使用的 `user`，并保留本地签名和模板授权。 |
 | 权限展示 | 侧边栏和路由守卫会按角色显示或阻止模板管理、用户管理、审计日志等入口。 |
@@ -71,6 +72,7 @@ AI 第三方接口、讯飞语音上游 WebSocket、麦克风权限和真实视
 | Dashboard API 封装 | 已覆盖 | `api/dashboard.test.ts` |
 | 审计日志 API 封装 | 已覆盖 | `api/audit.test.ts` |
 | 语音 WebSocket 代理地址 | 已覆盖 | `api/speech.test.ts` |
+| 语音 PCM 重采样和编码 | 已覆盖 | `utils/audioPcm.test.ts` |
 | 字段库 API 封装 | 已覆盖 | `api/library.test.ts` |
 | 通用文件 API 封装 | 已覆盖 | `api/files.test.ts` |
 | 后端用户到前端用户映射 | 已覆盖 | `auth/backendUser.test.ts` |
@@ -112,7 +114,7 @@ AI 第三方接口、讯飞语音上游 WebSocket、麦克风权限和真实视
 | 报告编辑器完整流程 | 部分覆盖 | 已覆盖保存修订版本、个人模板和后端草稿/完成报告 schema；模板切换、字段同步仍待补。 |
 | 视频抽帧 | 待 E2E/人工 | 依赖真实视频解码和 canvas。 |
 | AI 撰写 | 待集成测试 | 需要隔离外部模型服务。 |
-| 讯飞语音听写 | 部分覆盖/待集成测试 | 已覆盖后端首帧处理；完整链路仍需要 WebSocket 集成测试、麦克风权限和测试凭证。 |
+| 讯飞语音听写 | 部分覆盖/待集成测试 | 已覆盖前端 16k PCM 处理、后端首帧处理和 WebSocket 地址生成；完整链路仍需要 WebSocket 集成测试、麦克风权限和测试凭证。 |

 ## Playwright 说明

--- a/src/pages/ReportEditor.tsx
+++ b/src/pages/ReportEditor.tsx
@@ -24,6 +24,7 @@ import { listFiles, uploadFileResource } from '../api/files';
 import { isLocalFallbackEnabled } from '../config/runtime';
 import { diffChars } from 'diff';
 import { areAiRegionOptionsEqual, getAiRegionOptions, type AiRegionOption } from '../utils/aiRegions';
+import { arrayBufferToBase64, concatBytes, downsampleTo16K, floatTo16BitPCM } from '../utils/audioPcm';
 import { buildFrameCaptureJobs, DEFAULT_FRAME_POSITIONS, normalizeFramePositions } from '../utils/framePositions';

 type AudioWindow = Window & typeof globalThis & {
@@ -107,6 +108,8 @@ export default function ReportEditor() {
  const xfMediaStreamRef = useRef<MediaStream | null>(null);
  const xfAudioSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
  const xfAudioProcessorRef = useRef<ScriptProcessorNode | null>(null);
+  const xfPendingPcmBytesRef = useRef<Uint8Array>(new Uint8Array(0));
+  const xfSpeechFrameStatusRef = useRef<0 | 1>(0);
  const xfSpeechFrameCountRef = useRef(0);
  const xfSpeechTextReceivedRef = useRef(false);
  const xfSpeechUserStoppedRef = useRef(false);
@@ -1127,23 +1130,40 @@ export default function ReportEditor() {
    return html;
  };

-  function floatTo16BitPCM(input: Float32Array): ArrayBuffer {
-    const output = new DataView(new ArrayBuffer(input.length * 2));
-    for (let i = 0; i < input.length; i++) {
-      const s = Math.max(-1, Math.min(1, input[i]));
-      output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
-    }
-    return output.buffer;
-  }
+  const sendXfAudioBytes = (bytes: Uint8Array) => {
+    const ws = xfWsRef.current;
+    if (!ws || ws.readyState !== WebSocket.OPEN || bytes.length === 0) return;

-  function arrayBufferToBase64(buffer: ArrayBuffer): string {
-    const bytes = new Uint8Array(buffer);
-    let binary = '';
-    for (let i = 0; i < bytes.byteLength; i++) {
-      binary += String.fromCharCode(bytes[i]);
+    ws.send(JSON.stringify({
+      data: {
+        status: xfSpeechFrameStatusRef.current,
+        format: 'audio/L16;rate=16000',
+        encoding: 'raw',
+        audio: arrayBufferToBase64(bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength)),
+      },
+    }));
+    xfSpeechFrameStatusRef.current = 1;
+    xfSpeechFrameCountRef.current += 1;
+  };
+
+  const queueXfPcmBytes = (bytes: Uint8Array) => {
+    const chunkSize = 1280;
+    let pending = concatBytes(xfPendingPcmBytesRef.current, bytes);
+
+    while (pending.length >= chunkSize) {
+      sendXfAudioBytes(pending.slice(0, chunkSize));
+      pending = pending.slice(chunkSize);
    }
-    return btoa(binary);
-  }
+
+    xfPendingPcmBytesRef.current = pending;
+  };
+
+  // Sends any PCM bytes still buffered as one final (possibly short) frame, then empties the buffer.
+  const flushXfPendingAudio = () => {
+    if (xfPendingPcmBytesRef.current.length === 0) return;
+    sendXfAudioBytes(xfPendingPcmBytesRef.current);
+    xfPendingPcmBytesRef.current = new Uint8Array(0);
+  };

  const toggleListening = async () => {
    // 专门提取一个彻底关闭物理麦克风的函数
@@ -1176,7 +1196,8 @@ export default function ReportEditor() {

      if (xfWsRef.current && xfWsRef.current.readyState === WebSocket.OPEN) {
        try {
-          const endFrame = { data: { status: 2, format: 'audio/L16;rate=16000', encoding: 'raw', audio: '' } };
+          flushXfPendingAudio();
+          const endFrame = { data: { status: 2 } };
          xfWsRef.current.send(JSON.stringify(endFrame));
        } catch {}
      }
@@ -1199,16 +1220,17 @@ export default function ReportEditor() {

      const ws = new WebSocket(getSpeechIatWebSocketUrl());
      xfWsRef.current = ws;
+      xfPendingPcmBytesRef.current = new Uint8Array(0);
+      xfSpeechFrameStatusRef.current = 0;
      xfSpeechFrameCountRef.current = 0;
      xfSpeechTextReceivedRef.current = false;
      xfSpeechUserStoppedRef.current = false;
-      let frameStatus = 0;

      ws.onopen = async () => {
        try {
          const stream = await mediaDevices.getUserMedia({ audio: true });
          xfMediaStreamRef.current = stream;
-          const audioContext = new AudioContextClass({ sampleRate: 16000 });
+          const audioContext = new AudioContextClass();
          xfAudioContextRef.current = audioContext;
          if (audioContext.state === 'suspended') {
            await audioContext.resume();
@@ -1221,12 +1243,9 @@ export default function ReportEditor() {
          processor.onaudioprocess = (e) => {
            if (ws.readyState !== WebSocket.OPEN || !xfAudioContextRef.current) return;
            const inputData = e.inputBuffer.getChannelData(0);
-            const pcmBuffer = floatTo16BitPCM(inputData);
-            const base64Audio = arrayBufferToBase64(pcmBuffer);
-            const frame: any = { data: { status: frameStatus, format: 'audio/L16;rate=16000', encoding: 'raw', audio: base64Audio } };
-            ws.send(JSON.stringify(frame));
-            xfSpeechFrameCountRef.current += 1;
-            frameStatus = 1;
+            const downsampled = downsampleTo16K(inputData, xfAudioContextRef.current.sampleRate);
+            const pcmBuffer = floatTo16BitPCM(downsampled);
+            queueXfPcmBytes(new Uint8Array(pcmBuffer));
          };

          source.connect(processor);
@@ -1273,6 +1292,7 @@ export default function ReportEditor() {
          && !xfSpeechTextReceivedRef.current;
        setIsListening(false);
        stopMicrophone();
+        xfPendingPcmBytesRef.current = new Uint8Array(0);
        xfWsRef.current = null;
        if (shouldExplainNoText) {
          alert('语音听写已结束，但讯飞没有返回可用文字。请确认麦克风输入音量正常，并尽量使用普通话靠近麦克风重试。');
--- a/src/utils/audioPcm.test.ts
+++ b/src/utils/audioPcm.test.ts
@@ -0,0 +1,29 @@
+import { describe, expect, it } from 'vitest';
+import { arrayBufferToBase64, concatBytes, downsampleTo16K, floatTo16BitPCM } from './audioPcm';
+
+describe('audioPcm', () => { // Unit tests for the PCM helpers exported by ./audioPcm.
+  it('downsamples browser audio to 16k with averaged samples', () => {
+    const input = new Float32Array([1, 0.5, 0, -0.5, -1, -0.5]);
+    const output = downsampleTo16K(input, 48000); // 48000 / 16000 = 3 input samples per output sample
+
+    expect(output[0]).toBeCloseTo(0.5); // (1 + 0.5 + 0) / 3
+    expect(output[1]).toBeCloseTo(-2 / 3); // (-0.5 - 1 - 0.5) / 3
+  });
+
+  it('keeps existing 16k audio unchanged', () => {
+    const input = new Float32Array([0.1, 0.2]);
+
+    expect(downsampleTo16K(input, 16000)).toBe(input); // the very same array is expected back, not a copy
+  });
+
+  it('encodes float samples as little-endian signed 16-bit PCM', () => {
+    const pcm = new Uint8Array(floatTo16BitPCM(new Float32Array([-1, 0, 1])));
+
+    expect(Array.from(pcm)).toEqual([0x00, 0x80, 0x00, 0x00, 0xff, 0x7f]); // -32768, 0, 32767 with the low byte first
+  });
+
+  it('converts bytes to base64 and concatenates pending chunks', () => {
+    expect(arrayBufferToBase64(new Uint8Array([1, 2, 3]).buffer)).toBe('AQID'); // base64 of bytes 01 02 03
+    expect(Array.from(concatBytes(new Uint8Array([1, 2]), new Uint8Array([3])))).toEqual([1, 2, 3]); // left bytes first, then right
+  });
+});
--- a/src/utils/audioPcm.ts
+++ b/src/utils/audioPcm.ts
@@ -0,0 +1,86 @@
+/**
+ * Downsamples mono float audio to 16 kHz by averaging the input samples that
+ * fall inside each output sample's window.
+ *
+ * The calculation is stateless: every call rounds its own window boundaries and
+ * nothing is carried over between consecutive buffers.
+ *
+ * @param input - Samples captured at `sampleRate`.
+ * @param sampleRate - Rate of `input` in Hz.
+ * @returns `input` itself when it is empty or when `sampleRate` is falsy or not
+ *   above 16000 (this helper never upsamples); otherwise a new array of roughly
+ *   `input.length * 16000 / sampleRate` samples.
+ */
+export const downsampleTo16K = (input: Float32Array, sampleRate: number): Float32Array => {
+  const targetRate = 16000;
+  // An empty buffer passes through as well, so it is not turned into one fabricated silent sample.
+  if (!sampleRate || sampleRate <= targetRate || input.length === 0) return input;
+
+  const ratio = sampleRate / targetRate;
+  const outputLength = Math.max(1, Math.round(input.length / ratio));
+  const output = new Float32Array(outputLength);
+  let inputOffset = 0;
+
+  for (let outputOffset = 0; outputOffset < outputLength; outputOffset++) {
+    // End of this output sample's window, clamped so the last window stops at the end of the input.
+    const nextInputOffset = Math.min(input.length, Math.round((outputOffset + 1) * ratio));
+    let sum = 0;
+    let count = 0;
+    for (let i = inputOffset; i < nextInputOffset; i++) {
+      sum += input[i];
+      count += 1;
+    }
+    output[outputOffset] = count > 0 ? sum / count : 0;
+    inputOffset = nextInputOffset;
+  }
+
+  return output;
+};
+
+/**
+ * Encodes float samples as little-endian signed 16-bit PCM.
+ *
+ * @param input - Samples; each one is clamped to [-1, 1] before scaling.
+ * @returns A new buffer holding two bytes per input sample.
+ */
+export const floatTo16BitPCM = (input: Float32Array): ArrayBuffer => {
+  const output = new DataView(new ArrayBuffer(input.length * 2));
+  for (let i = 0; i < input.length; i++) {
+    const s = Math.max(-1, Math.min(1, input[i]));
+    // Negative values scale by 0x8000 and the rest by 0x7fff, so -1 maps to -32768 and 1 to 32767.
+    output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true);
+  }
+  return output.buffer;
+};
+
+/**
+ * Base64-encodes the raw bytes of a buffer.
+ *
+ * @param buffer - Bytes to encode.
+ * @returns The base64 text produced by `btoa`.
+ */
+export const arrayBufferToBase64 = (buffer: ArrayBuffer): string => {
+  const bytes = new Uint8Array(buffer);
+  // `btoa` takes a binary string, so each byte becomes one character first.
+  let binary = '';
+  for (let i = 0; i < bytes.byteLength; i++) {
+    binary += String.fromCharCode(bytes[i]);
+  }
+  return btoa(binary);
+};
+
+/**
+ * Concatenates two byte arrays.
+ *
+ * @param left - Bytes that come first.
+ * @param right - Bytes appended after `left`.
+ * @returns `right` itself (not a copy) when `left` is empty; otherwise a new
+ *   array holding `left` followed by `right`.
+ */
+export const concatBytes = (left: Uint8Array, right: Uint8Array): Uint8Array => {
+  if (left.length === 0) return right;
+  const merged = new Uint8Array(left.length + right.length);
+  merged.set(left, 0);
+  merged.set(right, left.length);
+  return merged;
+};