diff --git a/docs/modules/report-editor.md b/docs/modules/report-editor.md index 49a5f53..4f0956f 100644 --- a/docs/modules/report-editor.md +++ b/docs/modules/report-editor.md @@ -86,7 +86,7 @@ AI 面板支持两种模式: 讯飞听写通过后端 WebSocket 代理: - 前端连接 `/api/speech/iat`,不再生成讯飞鉴权 URL,也不读取 APPID/APIKey/APISecret。 -- 浏览器采集麦克风音频,转换为 16k PCM 后发送音频帧。 +- 浏览器采集麦克风音频后按实际 `AudioContext.sampleRate` 重采样为 16k、16bit、单声道 PCM,并按小帧发送音频帧。 - 启动前会检查浏览器是否支持 `navigator.mediaDevices.getUserMedia` 和 `AudioContext`;如果不是 `localhost` 或 HTTPS 等安全上下文,浏览器会禁止麦克风能力。Docker 演示环境可使用 `https://localhost:4443`,局域网普通 HTTP 只能通过 Chrome/Edge 演示启动参数临时标记为可信来源。 - 后端读取 Settings API 中的 `xfSpeechConfig`,连接讯飞 IAT,上游首帧由后端补齐 `common.app_id` 和默认 `business` 参数。 - 识别结果由后端转发回前端,并追加到 AI 输入框。 diff --git a/docs/progress.md b/docs/progress.md index ee567ac..dd67bc6 100644 --- a/docs/progress.md +++ b/docs/progress.md @@ -94,3 +94,4 @@ | 2026-05-02 | 修复报告编辑器加载已有 AI 区域后下拉栏初始显示“无可用 AI 区域”的问题。 | | 2026-05-02 | 调整抽帧百分比为两位小数保序保存;自动截图按时间顺序执行,自动插入按配置顺序执行。 | | 2026-05-02 | 加固报告编辑器语音采集,保留 Web Audio 节点引用、显式恢复 AudioContext,并在无识别文本时给出提示。 | +| 2026-05-02 | 对齐讯飞 IAT 音频帧协议,前端按实际采样率重采样到 16k PCM、按小帧发送,并使用标准结束帧。 | diff --git a/docs/testing.md b/docs/testing.md index e06d7b4..938d1ac 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -21,6 +21,7 @@ npm run build | 前端 Dashboard API | 工作台统计封装会请求 `/api/dashboard/stats` 并校验响应结构。 | | 前端审计 API | 审计日志列表封装会请求 `/api/audit-logs` 并校验响应结构。 | | 前端语音代理地址 | 根据当前页面来源或 `VITE_API_BASE_URL` 生成 `/api/speech/iat` WebSocket 地址。 | +| 前端语音 PCM 处理 | 麦克风浮点音频会按实际采样率重采样为 16k、编码为 16bit PCM,并按小帧拼接发送给讯飞代理。 | | 前端字段库和文件 API | 字段库读取/更新、通用文件列表/上传封装。 | | Auth 兼容映射 | 后端 `doctor` 角色会映射为当前前端使用的 `user`,并保留本地签名和模板授权。 | | 权限展示 | 侧边栏和路由守卫会按角色显示或阻止模板管理、用户管理、审计日志等入口。 | @@ -71,6 +72,7 @@ AI 第三方接口、讯飞语音上游 WebSocket、麦克风权限和真实视 | Dashboard API 封装 | 已覆盖 | `api/dashboard.test.ts` | | 审计日志 API 封装 | 已覆盖 | `api/audit.test.ts` | | 语音 WebSocket 代理地址 | 已覆盖 | `api/speech.test.ts` | +| 语音 PCM 重采样和编码 | 已覆盖 | `utils/audioPcm.test.ts` | | 字段库 API 封装 | 已覆盖 | `api/library.test.ts` | | 通用文件 API 封装 | 已覆盖 | `api/files.test.ts` | | 后端用户到前端用户映射 | 已覆盖 | `auth/backendUser.test.ts` | @@ -112,7 +114,7 @@ AI 第三方接口、讯飞语音上游 WebSocket、麦克风权限和真实视 | 报告编辑器完整流程 | 部分覆盖 | 已覆盖保存修订版本、个人模板和后端草稿/完成报告 schema;模板切换、字段同步仍待补。 | | 视频抽帧 | 待 E2E/人工 | 依赖真实视频解码和 canvas。 | | AI 撰写 | 待集成测试 | 需要隔离外部模型服务。 | -| 讯飞语音听写 | 部分覆盖/待集成测试 | 已覆盖后端首帧处理;完整链路仍需要 WebSocket 集成测试、麦克风权限和测试凭证。 | +| 讯飞语音听写 | 部分覆盖/待集成测试 | 已覆盖前端 16k PCM 处理、后端首帧处理和 WebSocket 地址生成;完整链路仍需要 WebSocket 集成测试、麦克风权限和测试凭证。 | ## Playwright 说明 diff --git a/src/pages/ReportEditor.tsx b/src/pages/ReportEditor.tsx index 97e752b..7395cff 100644 --- a/src/pages/ReportEditor.tsx +++ b/src/pages/ReportEditor.tsx @@ -24,6 +24,7 @@ import { listFiles, uploadFileResource } from '../api/files'; import { isLocalFallbackEnabled } from '../config/runtime'; import { diffChars } from 'diff'; import { areAiRegionOptionsEqual, getAiRegionOptions, type AiRegionOption } from '../utils/aiRegions'; +import { arrayBufferToBase64, concatBytes, downsampleTo16K, floatTo16BitPCM } from '../utils/audioPcm'; import { buildFrameCaptureJobs, DEFAULT_FRAME_POSITIONS, normalizeFramePositions } from '../utils/framePositions'; type AudioWindow = Window & typeof globalThis & { @@ -107,6 +108,8 @@ export default function ReportEditor() { const xfMediaStreamRef = useRef(null); const xfAudioSourceRef = useRef(null); const xfAudioProcessorRef = useRef(null); + const xfPendingPcmBytesRef = useRef(new Uint8Array(0)); + const xfSpeechFrameStatusRef = useRef<0 | 1>(0); const xfSpeechFrameCountRef = useRef(0); const xfSpeechTextReceivedRef = useRef(false); const xfSpeechUserStoppedRef = useRef(false); @@ -1127,23 +1130,40 @@ export default function ReportEditor() { return html; }; - function floatTo16BitPCM(input: Float32Array): ArrayBuffer { - const output = new DataView(new ArrayBuffer(input.length * 2)); - for (let i = 0; i < input.length; i++) { - const s = Math.max(-1, Math.min(1, input[i])); - output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true); - } - return output.buffer; - } + const sendXfAudioBytes = (bytes: Uint8Array) => { + const ws = xfWsRef.current; + if (!ws || ws.readyState !== WebSocket.OPEN || bytes.length === 0) return; - function arrayBufferToBase64(buffer: ArrayBuffer): string { - const bytes = new Uint8Array(buffer); - let binary = ''; - for (let i = 0; i < bytes.byteLength; i++) { - binary += String.fromCharCode(bytes[i]); + ws.send(JSON.stringify({ + data: { + status: xfSpeechFrameStatusRef.current, + format: 'audio/L16;rate=16000', + encoding: 'raw', + audio: arrayBufferToBase64(bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength)), + }, + })); + xfSpeechFrameStatusRef.current = 1; + xfSpeechFrameCountRef.current += 1; + }; + + const queueXfPcmBytes = (bytes: Uint8Array) => { + const chunkSize = 1280; + let pending = concatBytes(xfPendingPcmBytesRef.current, bytes); + + while (pending.length >= chunkSize) { + sendXfAudioBytes(pending.slice(0, chunkSize)); + pending = pending.slice(chunkSize); } - return btoa(binary); - } + + xfPendingPcmBytesRef.current = pending; + }; + + const flushXfPendingAudio = () => { + if (xfPendingPcmBytesRef.current.length > 0) { + sendXfAudioBytes(xfPendingPcmBytesRef.current); + xfPendingPcmBytesRef.current = new Uint8Array(0); + } + }; const toggleListening = async () => { // 专门提取一个彻底关闭物理麦克风的函数 @@ -1176,7 +1196,8 @@ export default function ReportEditor() { if (xfWsRef.current && xfWsRef.current.readyState === WebSocket.OPEN) { try { - const endFrame = { data: { status: 2, format: 'audio/L16;rate=16000', encoding: 'raw', audio: '' } }; + flushXfPendingAudio(); + const endFrame = { data: { status: 2 } }; xfWsRef.current.send(JSON.stringify(endFrame)); } catch {} } @@ -1199,16 +1220,17 @@ export default function ReportEditor() { const ws = new WebSocket(getSpeechIatWebSocketUrl()); xfWsRef.current = ws; + xfPendingPcmBytesRef.current = new Uint8Array(0); + xfSpeechFrameStatusRef.current = 0; xfSpeechFrameCountRef.current = 0; xfSpeechTextReceivedRef.current = false; xfSpeechUserStoppedRef.current = false; - let frameStatus = 0; ws.onopen = async () => { try { const stream = await mediaDevices.getUserMedia({ audio: true }); xfMediaStreamRef.current = stream; - const audioContext = new AudioContextClass({ sampleRate: 16000 }); + const audioContext = new AudioContextClass(); xfAudioContextRef.current = audioContext; if (audioContext.state === 'suspended') { await audioContext.resume(); @@ -1221,12 +1243,9 @@ export default function ReportEditor() { processor.onaudioprocess = (e) => { if (ws.readyState !== WebSocket.OPEN || !xfAudioContextRef.current) return; const inputData = e.inputBuffer.getChannelData(0); - const pcmBuffer = floatTo16BitPCM(inputData); - const base64Audio = arrayBufferToBase64(pcmBuffer); - const frame: any = { data: { status: frameStatus, format: 'audio/L16;rate=16000', encoding: 'raw', audio: base64Audio } }; - ws.send(JSON.stringify(frame)); - xfSpeechFrameCountRef.current += 1; - frameStatus = 1; + const downsampled = downsampleTo16K(inputData, xfAudioContextRef.current.sampleRate); + const pcmBuffer = floatTo16BitPCM(downsampled); + queueXfPcmBytes(new Uint8Array(pcmBuffer)); }; source.connect(processor); @@ -1273,6 +1292,7 @@ export default function ReportEditor() { && !xfSpeechTextReceivedRef.current; setIsListening(false); stopMicrophone(); + xfPendingPcmBytesRef.current = new Uint8Array(0); xfWsRef.current = null; if (shouldExplainNoText) { alert('语音听写已结束,但讯飞没有返回可用文字。请确认麦克风输入音量正常,并尽量使用普通话靠近麦克风重试。'); diff --git a/src/utils/audioPcm.test.ts b/src/utils/audioPcm.test.ts new file mode 100644 index 0000000..958c487 --- /dev/null +++ b/src/utils/audioPcm.test.ts @@ -0,0 +1,29 @@ +import { describe, expect, it } from 'vitest'; +import { arrayBufferToBase64, concatBytes, downsampleTo16K, floatTo16BitPCM } from './audioPcm'; + +describe('audioPcm', () => { + it('downsamples browser audio to 16k with averaged samples', () => { + const input = new Float32Array([1, 0.5, 0, -0.5, -1, -0.5]); + const output = downsampleTo16K(input, 48000); + + expect(output[0]).toBeCloseTo(0.5); + expect(output[1]).toBeCloseTo(-2 / 3); + }); + + it('keeps existing 16k audio unchanged', () => { + const input = new Float32Array([0.1, 0.2]); + + expect(downsampleTo16K(input, 16000)).toBe(input); + }); + + it('encodes float samples as little-endian signed 16-bit PCM', () => { + const pcm = new Uint8Array(floatTo16BitPCM(new Float32Array([-1, 0, 1]))); + + expect(Array.from(pcm)).toEqual([0x00, 0x80, 0x00, 0x00, 0xff, 0x7f]); + }); + + it('converts bytes to base64 and concatenates pending chunks', () => { + expect(arrayBufferToBase64(new Uint8Array([1, 2, 3]).buffer)).toBe('AQID'); + expect(Array.from(concatBytes(new Uint8Array([1, 2]), new Uint8Array([3])))).toEqual([1, 2, 3]); + }); +}); diff --git a/src/utils/audioPcm.ts b/src/utils/audioPcm.ts new file mode 100644 index 0000000..6525aab --- /dev/null +++ b/src/utils/audioPcm.ts @@ -0,0 +1,50 @@ +export const downsampleTo16K = (input: Float32Array, sampleRate: number): Float32Array => { + const targetRate = 16000; + if (!sampleRate || sampleRate === targetRate) return input; + if (sampleRate < targetRate) return input; + + const ratio = sampleRate / targetRate; + const outputLength = Math.max(1, Math.round(input.length / ratio)); + const output = new Float32Array(outputLength); + let inputOffset = 0; + + for (let outputOffset = 0; outputOffset < outputLength; outputOffset++) { + const nextInputOffset = Math.min(input.length, Math.round((outputOffset + 1) * ratio)); + let sum = 0; + let count = 0; + for (let i = inputOffset; i < nextInputOffset; i++) { + sum += input[i]; + count += 1; + } + output[outputOffset] = count > 0 ? sum / count : 0; + inputOffset = nextInputOffset; + } + + return output; +}; + +export const floatTo16BitPCM = (input: Float32Array): ArrayBuffer => { + const output = new DataView(new ArrayBuffer(input.length * 2)); + for (let i = 0; i < input.length; i++) { + const s = Math.max(-1, Math.min(1, input[i])); + output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true); + } + return output.buffer; +}; + +export const arrayBufferToBase64 = (buffer: ArrayBuffer): string => { + const bytes = new Uint8Array(buffer); + let binary = ''; + for (let i = 0; i < bytes.byteLength; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); +}; + +export const concatBytes = (left: Uint8Array, right: Uint8Array): Uint8Array => { + if (left.length === 0) return right; + const merged = new Uint8Array(left.length + right.length); + merged.set(left, 0); + merged.set(right, left.length); + return merged; +};