Send speech audio as normalized PCM frames
- Resample microphone input from the actual browser AudioContext sample rate to 16k before sending it to the speech proxy.
- Encode speech input as 16-bit PCM and send it in small 1280-byte frames instead of relying on the browser to create a 16k audio context.
- Flush pending audio before sending the standard Xunfei IAT end frame.
- Extract PCM helpers and cover downsampling, PCM encoding, base64 conversion, and byte concatenation with unit tests.
- Update report editor, testing, and progress documentation for the corrected speech audio pipeline.
This commit is contained in:
@@ -86,7 +86,7 @@ AI 面板支持两种模式:
|
||||
讯飞听写通过后端 WebSocket 代理:
|
||||
|
||||
- 前端连接 `/api/speech/iat`,不再生成讯飞鉴权 URL,也不读取 APPID/APIKey/APISecret。
|
||||
- 浏览器采集麦克风音频,转换为 16k PCM 后发送音频帧。
|
||||
- 浏览器采集麦克风音频后按实际 `AudioContext.sampleRate` 重采样为 16k、16bit、单声道 PCM,并按小帧发送音频帧。
|
||||
- 启动前会检查浏览器是否支持 `navigator.mediaDevices.getUserMedia` 和 `AudioContext`;如果不是 `localhost` 或 HTTPS 等安全上下文,浏览器会禁止麦克风能力。Docker 演示环境可使用 `https://localhost:4443`,局域网普通 HTTP 只能通过 Chrome/Edge 演示启动参数临时标记为可信来源。
|
||||
- 后端读取 Settings API 中的 `xfSpeechConfig`,连接讯飞 IAT,上游首帧由后端补齐 `common.app_id` 和默认 `business` 参数。
|
||||
- 识别结果由后端转发回前端,并追加到 AI 输入框。
|
||||
|
||||
@@ -94,3 +94,4 @@
|
||||
| 2026-05-02 | 修复报告编辑器加载已有 AI 区域后下拉栏初始显示“无可用 AI 区域”的问题。 |
|
||||
| 2026-05-02 | 调整抽帧百分比为两位小数保序保存;自动截图按时间顺序执行,自动插入按配置顺序执行。 |
|
||||
| 2026-05-02 | 加固报告编辑器语音采集,保留 Web Audio 节点引用、显式恢复 AudioContext,并在无识别文本时给出提示。 |
|
||||
| 2026-05-02 | 对齐讯飞 IAT 音频帧协议,前端按实际采样率重采样到 16k PCM、按小帧发送,并使用标准结束帧。 |
|
||||
|
||||
@@ -21,6 +21,7 @@ npm run build
|
||||
| 前端 Dashboard API | 工作台统计封装会请求 `/api/dashboard/stats` 并校验响应结构。 |
|
||||
| 前端审计 API | 审计日志列表封装会请求 `/api/audit-logs` 并校验响应结构。 |
|
||||
| 前端语音代理地址 | 根据当前页面来源或 `VITE_API_BASE_URL` 生成 `/api/speech/iat` WebSocket 地址。 |
|
||||
| 前端语音 PCM 处理 | 麦克风浮点音频会按实际采样率重采样为 16k、编码为 16bit PCM,并按小帧拼接发送给讯飞代理。 |
|
||||
| 前端字段库和文件 API | 字段库读取/更新、通用文件列表/上传封装。 |
|
||||
| Auth 兼容映射 | 后端 `doctor` 角色会映射为当前前端使用的 `user`,并保留本地签名和模板授权。 |
|
||||
| 权限展示 | 侧边栏和路由守卫会按角色显示或阻止模板管理、用户管理、审计日志等入口。 |
|
||||
@@ -71,6 +72,7 @@ AI 第三方接口、讯飞语音上游 WebSocket、麦克风权限和真实视
|
||||
| Dashboard API 封装 | 已覆盖 | `api/dashboard.test.ts` |
|
||||
| 审计日志 API 封装 | 已覆盖 | `api/audit.test.ts` |
|
||||
| 语音 WebSocket 代理地址 | 已覆盖 | `api/speech.test.ts` |
|
||||
| 语音 PCM 重采样和编码 | 已覆盖 | `utils/audioPcm.test.ts` |
|
||||
| 字段库 API 封装 | 已覆盖 | `api/library.test.ts` |
|
||||
| 通用文件 API 封装 | 已覆盖 | `api/files.test.ts` |
|
||||
| 后端用户到前端用户映射 | 已覆盖 | `auth/backendUser.test.ts` |
|
||||
@@ -112,7 +114,7 @@ AI 第三方接口、讯飞语音上游 WebSocket、麦克风权限和真实视
|
||||
| 报告编辑器完整流程 | 部分覆盖 | 已覆盖保存修订版本、个人模板和后端草稿/完成报告 schema;模板切换、字段同步仍待补。 |
|
||||
| 视频抽帧 | 待 E2E/人工 | 依赖真实视频解码和 canvas。 |
|
||||
| AI 撰写 | 待集成测试 | 需要隔离外部模型服务。 |
|
||||
| 讯飞语音听写 | 部分覆盖/待集成测试 | 已覆盖后端首帧处理;完整链路仍需要 WebSocket 集成测试、麦克风权限和测试凭证。 |
|
||||
| 讯飞语音听写 | 部分覆盖/待集成测试 | 已覆盖前端 16k PCM 处理、后端首帧处理和 WebSocket 地址生成;完整链路仍需要 WebSocket 集成测试、麦克风权限和测试凭证。 |
|
||||
|
||||
## Playwright 说明
|
||||
|
||||
|
||||
@@ -24,6 +24,7 @@ import { listFiles, uploadFileResource } from '../api/files';
|
||||
import { isLocalFallbackEnabled } from '../config/runtime';
|
||||
import { diffChars } from 'diff';
|
||||
import { areAiRegionOptionsEqual, getAiRegionOptions, type AiRegionOption } from '../utils/aiRegions';
|
||||
import { arrayBufferToBase64, concatBytes, downsampleTo16K, floatTo16BitPCM } from '../utils/audioPcm';
|
||||
import { buildFrameCaptureJobs, DEFAULT_FRAME_POSITIONS, normalizeFramePositions } from '../utils/framePositions';
|
||||
|
||||
type AudioWindow = Window & typeof globalThis & {
|
||||
@@ -107,6 +108,8 @@ export default function ReportEditor() {
|
||||
const xfMediaStreamRef = useRef<MediaStream | null>(null);
|
||||
const xfAudioSourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
|
||||
const xfAudioProcessorRef = useRef<ScriptProcessorNode | null>(null);
|
||||
const xfPendingPcmBytesRef = useRef<Uint8Array>(new Uint8Array(0));
|
||||
const xfSpeechFrameStatusRef = useRef<0 | 1>(0);
|
||||
const xfSpeechFrameCountRef = useRef(0);
|
||||
const xfSpeechTextReceivedRef = useRef(false);
|
||||
const xfSpeechUserStoppedRef = useRef(false);
|
||||
@@ -1127,23 +1130,40 @@ export default function ReportEditor() {
|
||||
return html;
|
||||
};
|
||||
|
||||
function floatTo16BitPCM(input: Float32Array): ArrayBuffer {
|
||||
const output = new DataView(new ArrayBuffer(input.length * 2));
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
const s = Math.max(-1, Math.min(1, input[i]));
|
||||
output.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
|
||||
}
|
||||
return output.buffer;
|
||||
}
|
||||
const sendXfAudioBytes = (bytes: Uint8Array) => {
|
||||
const ws = xfWsRef.current;
|
||||
if (!ws || ws.readyState !== WebSocket.OPEN || bytes.length === 0) return;
|
||||
|
||||
function arrayBufferToBase64(buffer: ArrayBuffer): string {
|
||||
const bytes = new Uint8Array(buffer);
|
||||
let binary = '';
|
||||
for (let i = 0; i < bytes.byteLength; i++) {
|
||||
binary += String.fromCharCode(bytes[i]);
|
||||
ws.send(JSON.stringify({
|
||||
data: {
|
||||
status: xfSpeechFrameStatusRef.current,
|
||||
format: 'audio/L16;rate=16000',
|
||||
encoding: 'raw',
|
||||
audio: arrayBufferToBase64(bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength)),
|
||||
},
|
||||
}));
|
||||
xfSpeechFrameStatusRef.current = 1;
|
||||
xfSpeechFrameCountRef.current += 1;
|
||||
};
|
||||
|
||||
const queueXfPcmBytes = (bytes: Uint8Array) => {
|
||||
const chunkSize = 1280;
|
||||
let pending = concatBytes(xfPendingPcmBytesRef.current, bytes);
|
||||
|
||||
while (pending.length >= chunkSize) {
|
||||
sendXfAudioBytes(pending.slice(0, chunkSize));
|
||||
pending = pending.slice(chunkSize);
|
||||
}
|
||||
return btoa(binary);
|
||||
}
|
||||
|
||||
xfPendingPcmBytesRef.current = pending;
|
||||
};
|
||||
|
||||
const flushXfPendingAudio = () => {
|
||||
if (xfPendingPcmBytesRef.current.length > 0) {
|
||||
sendXfAudioBytes(xfPendingPcmBytesRef.current);
|
||||
xfPendingPcmBytesRef.current = new Uint8Array(0);
|
||||
}
|
||||
};
|
||||
|
||||
const toggleListening = async () => {
|
||||
// 专门提取一个彻底关闭物理麦克风的函数
|
||||
@@ -1176,7 +1196,8 @@ export default function ReportEditor() {
|
||||
|
||||
if (xfWsRef.current && xfWsRef.current.readyState === WebSocket.OPEN) {
|
||||
try {
|
||||
const endFrame = { data: { status: 2, format: 'audio/L16;rate=16000', encoding: 'raw', audio: '' } };
|
||||
flushXfPendingAudio();
|
||||
const endFrame = { data: { status: 2 } };
|
||||
xfWsRef.current.send(JSON.stringify(endFrame));
|
||||
} catch {}
|
||||
}
|
||||
@@ -1199,16 +1220,17 @@ export default function ReportEditor() {
|
||||
|
||||
const ws = new WebSocket(getSpeechIatWebSocketUrl());
|
||||
xfWsRef.current = ws;
|
||||
xfPendingPcmBytesRef.current = new Uint8Array(0);
|
||||
xfSpeechFrameStatusRef.current = 0;
|
||||
xfSpeechFrameCountRef.current = 0;
|
||||
xfSpeechTextReceivedRef.current = false;
|
||||
xfSpeechUserStoppedRef.current = false;
|
||||
let frameStatus = 0;
|
||||
|
||||
ws.onopen = async () => {
|
||||
try {
|
||||
const stream = await mediaDevices.getUserMedia({ audio: true });
|
||||
xfMediaStreamRef.current = stream;
|
||||
const audioContext = new AudioContextClass({ sampleRate: 16000 });
|
||||
const audioContext = new AudioContextClass();
|
||||
xfAudioContextRef.current = audioContext;
|
||||
if (audioContext.state === 'suspended') {
|
||||
await audioContext.resume();
|
||||
@@ -1221,12 +1243,9 @@ export default function ReportEditor() {
|
||||
processor.onaudioprocess = (e) => {
|
||||
if (ws.readyState !== WebSocket.OPEN || !xfAudioContextRef.current) return;
|
||||
const inputData = e.inputBuffer.getChannelData(0);
|
||||
const pcmBuffer = floatTo16BitPCM(inputData);
|
||||
const base64Audio = arrayBufferToBase64(pcmBuffer);
|
||||
const frame: any = { data: { status: frameStatus, format: 'audio/L16;rate=16000', encoding: 'raw', audio: base64Audio } };
|
||||
ws.send(JSON.stringify(frame));
|
||||
xfSpeechFrameCountRef.current += 1;
|
||||
frameStatus = 1;
|
||||
const downsampled = downsampleTo16K(inputData, xfAudioContextRef.current.sampleRate);
|
||||
const pcmBuffer = floatTo16BitPCM(downsampled);
|
||||
queueXfPcmBytes(new Uint8Array(pcmBuffer));
|
||||
};
|
||||
|
||||
source.connect(processor);
|
||||
@@ -1273,6 +1292,7 @@ export default function ReportEditor() {
|
||||
&& !xfSpeechTextReceivedRef.current;
|
||||
setIsListening(false);
|
||||
stopMicrophone();
|
||||
xfPendingPcmBytesRef.current = new Uint8Array(0);
|
||||
xfWsRef.current = null;
|
||||
if (shouldExplainNoText) {
|
||||
alert('语音听写已结束,但讯飞没有返回可用文字。请确认麦克风输入音量正常,并尽量使用普通话靠近麦克风重试。');
|
||||
|
||||
29
src/utils/audioPcm.test.ts
Normal file
29
src/utils/audioPcm.test.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { arrayBufferToBase64, concatBytes, downsampleTo16K, floatTo16BitPCM } from './audioPcm';
|
||||
|
||||
// Unit tests for the speech PCM helpers: resampling to 16 kHz, 16-bit PCM
// encoding, base64 conversion, and byte-chunk concatenation.
describe('audioPcm', () => {
  it('downsamples browser audio to 16k with averaged samples', () => {
    // 48 kHz -> 16 kHz is a 3:1 ratio, so each output sample is the mean of
    // three consecutive input samples.
    const samples = Float32Array.of(1, 0.5, 0, -0.5, -1, -0.5);
    const resampled = downsampleTo16K(samples, 48000);

    expect(resampled[0]).toBeCloseTo(0.5);
    expect(resampled[1]).toBeCloseTo(-2 / 3);
  });

  it('keeps existing 16k audio unchanged', () => {
    // Audio already at the target rate must come back as the same instance.
    const samples = Float32Array.of(0.1, 0.2);
    const result = downsampleTo16K(samples, 16000);

    expect(result).toBe(samples);
  });

  it('encodes float samples as little-endian signed 16-bit PCM', () => {
    // -1 -> 0x8000, 0 -> 0x0000, 1 -> 0x7fff, low byte first.
    const encoded = floatTo16BitPCM(Float32Array.of(-1, 0, 1));
    const pcm = new Uint8Array(encoded);

    expect(Array.from(pcm)).toEqual([0x00, 0x80, 0x00, 0x00, 0xff, 0x7f]);
  });

  it('converts bytes to base64 and concatenates pending chunks', () => {
    const rawBytes = Uint8Array.of(1, 2, 3);
    expect(arrayBufferToBase64(rawBytes.buffer)).toBe('AQID');

    const merged = concatBytes(Uint8Array.of(1, 2), Uint8Array.of(3));
    expect(Array.from(merged)).toEqual([1, 2, 3]);
  });
});
|
||||
50
src/utils/audioPcm.ts
Normal file
50
src/utils/audioPcm.ts
Normal file
@@ -0,0 +1,50 @@
|
||||
/**
 * Downsamples a chunk of mono float audio to 16 kHz by averaging each group of
 * input samples that maps onto one output sample.
 *
 * The input is returned unchanged (same instance, no copy) when:
 * - `sampleRate` is unusable (0, NaN, ±Infinity or negative),
 * - `sampleRate` is already at or below 16 kHz (this helper never upsamples,
 *   so callers with a slower source still receive audio at the source rate),
 * - `input` is empty (so no artificial silent sample is emitted).
 *
 * Each chunk is resampled on its own: when the chunk length is not a multiple
 * of the rate ratio, a few trailing input samples may be left out of the last
 * output sample.
 *
 * @param input - Float samples captured at `sampleRate`.
 * @param sampleRate - Sample rate of `input` in Hz.
 * @returns Samples at 16 kHz, or `input` itself in the passthrough cases above.
 */
export const downsampleTo16K = (input: Float32Array, sampleRate: number): Float32Array => {
  const targetRate = 16000;

  // Unusable rates and rates at/below the target are passed through untouched.
  if (!Number.isFinite(sampleRate) || sampleRate <= targetRate) return input;
  // An empty chunk must stay empty instead of turning into one zero sample.
  if (input.length === 0) return input;

  const ratio = sampleRate / targetRate;
  // A non-empty chunk always yields at least one output sample.
  const outputLength = Math.max(1, Math.round(input.length / ratio));
  const output = new Float32Array(outputLength);
  let inputOffset = 0;

  for (let outputOffset = 0; outputOffset < outputLength; outputOffset++) {
    // End (exclusive) of the input window feeding this output sample.
    const nextInputOffset = Math.min(input.length, Math.round((outputOffset + 1) * ratio));
    let sum = 0;
    let count = 0;
    for (let i = inputOffset; i < nextInputOffset; i++) {
      sum += input[i];
      count += 1;
    }
    // Averaging the window acts as a cheap low-pass filter before decimation.
    output[outputOffset] = count > 0 ? sum / count : 0;
    inputOffset = nextInputOffset;
  }

  return output;
};
|
||||
|
||||
/**
 * Encodes float audio samples as signed 16-bit little-endian PCM.
 *
 * Samples are clamped to [-1, 1] first; negative values are scaled by 0x8000
 * and non-negative values by 0x7fff so both ends of the int16 range are used.
 *
 * @param input - Float samples, nominally within [-1, 1].
 * @returns A new buffer holding two bytes per input sample.
 */
export const floatTo16BitPCM = (input: Float32Array): ArrayBuffer => {
  const pcmBuffer = new ArrayBuffer(input.length * 2);
  const view = new DataView(pcmBuffer);
  let byteOffset = 0;

  for (const sample of input) {
    const clamped = Math.min(1, Math.max(-1, sample));
    let scaled: number;
    if (clamped < 0) {
      scaled = clamped * 0x8000;
    } else {
      scaled = clamped * 0x7fff;
    }
    // `true` selects little-endian byte order.
    view.setInt16(byteOffset, scaled, true);
    byteOffset += 2;
  }

  return pcmBuffer;
};
|
||||
|
||||
/**
 * Converts the raw bytes of a buffer into a base64 string.
 *
 * Each byte is mapped to the character with the same code so `btoa` can encode
 * the resulting binary string.
 *
 * @param buffer - Bytes to encode.
 * @returns Base64 text; an empty string for an empty buffer.
 */
export const arrayBufferToBase64 = (buffer: ArrayBuffer): string => {
  const chars: string[] = [];
  new Uint8Array(buffer).forEach((byte) => {
    chars.push(String.fromCharCode(byte));
  });
  return btoa(chars.join(''));
};
|
||||
|
||||
/**
 * Joins two byte chunks into one, `left` first.
 *
 * When `left` is empty, `right` itself is returned (no copy), so the result
 * may alias the caller's array; in every other case a new array is allocated.
 *
 * @param left - Leading bytes.
 * @param right - Bytes appended after `left`.
 * @returns The combined bytes.
 */
export const concatBytes = (left: Uint8Array, right: Uint8Array): Uint8Array => {
  const leftSize = left.length;
  if (leftSize > 0) {
    const combined = new Uint8Array(leftSize + right.length);
    combined.set(left);
    combined.set(right, leftSize);
    return combined;
  }
  return right;
};
|
||||
Reference in New Issue
Block a user