# ISISeg/Tools_scripts_XunFei-Ubuntu/xfyun_tts_ubuntu.py

#!/usr/bin/env python3
"""Generate XFYUN TTS voice files on Ubuntu.

This script supports both the normal XFYUN online TTS endpoint and the
super-realistic TTS endpoint used by the PowerShell workflow.
"""

from __future__ import annotations

import argparse
import base64
import email.utils
import hashlib
import hmac
import json
import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote, urlparse


# WebSocket endpoint that synthesize_normal() signs and connects to.
NORMAL_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts"
# WebSocket endpoint that synthesize_super() signs and connects to.
SUPER_TTS_URL = "wss://cbm01.cn-huabei-1.xf-yun.com/v1/private/mcd9m97e6"


@dataclass(frozen=True)
class ScriptSegment:
    """One numbered section of the Markdown script, ready for synthesis."""

    # Section number captured from the "## N." heading, kept as a string.
    index: str
    # Heading text that follows the number, surrounding whitespace removed.
    title: str
    # Narration lines of the section, joined with newlines.
    text: str


def safe_filename(value: str) -> str:
    """Return *value* with characters that are awkward in file names removed.

    Backslash, slash, colon, asterisk, question mark, double quote, angle
    brackets and the pipe character are deleted, then surrounding whitespace
    is trimmed.  If nothing is left, ``"segment"`` is returned so the caller
    always receives a non-empty name.
    """
    forbidden = str.maketrans("", "", '\\/:*?"<>|')
    stripped = value.translate(forbidden).strip()
    if stripped:
        return stripped
    return "segment"


def find_default_script(cwd: Path) -> Path:
    """Pick the Markdown script to use when ``--script`` is not given.

    The ``*.md`` files directly inside *cwd* are examined in sorted order and
    the first file of the best non-empty tier is returned:

    1. names starting with "配音稿" or, ignoring case, with "voice";
    2. names that neither start with "配音生成工作流" nor equal
       "readme.md" (ignoring case);
    3. any Markdown file at all.

    Raises:
        FileNotFoundError: If *cwd* contains no ``*.md`` file.
    """
    markdown_files = sorted(cwd.glob("*.md"))

    def is_preferred(path: Path) -> bool:
        name = path.name
        return name.startswith("配音稿") or name.lower().startswith("voice")

    def is_fallback(path: Path) -> bool:
        name = path.name
        return not name.startswith("配音生成工作流") and name.lower() != "readme.md"

    # Tiers are tried from most to least specific; the last one accepts anything.
    for accepts in (is_preferred, is_fallback, lambda _path: True):
        for path in markdown_files:
            if accepts(path):
                return path
    raise FileNotFoundError("Cannot find script Markdown file. Use --script to specify one.")


def load_segments(script_path: Path) -> list[ScriptSegment]:
    """Split the Markdown script into numbered narration segments.

    A segment starts at a heading of the form ``## N. title``, where ``N`` is
    a positive integer of any length, and extends to the next such heading or
    the end of the file.  Inside a section body, blank lines, lines starting
    with ``#`` and lines that begin with one of the production-note labels in
    the metadata pattern (followed by an ASCII or full-width colon) are
    dropped; the remaining lines are stripped and joined with newlines, and
    tabs are replaced by spaces.

    Args:
        script_path: Markdown file, read as UTF-8 with an optional BOM.

    Returns:
        The segments in document order.

    Raises:
        ValueError: If no numbered section is found, or if a section has no
            narration text left after filtering.
    """
    content = script_path.read_text(encoding="utf-8-sig")
    # The section number may have several digits.  Accepting a single digit
    # only would leave "## 10. ..." unrecognised, and because "#" lines are
    # skipped below, its narration would be silently appended to section 9.
    pattern = re.compile(
        r"(?ms)^##\s+([1-9][0-9]*)\.\s+(.+?)\r?\n(.*?)(?=^##\s+[1-9][0-9]*\.\s+|\Z)"
    )
    matches = pattern.findall(content)
    if not matches:
        raise ValueError("Cannot find sections like '## 1. title' in script Markdown.")

    segments: list[ScriptSegment] = []
    metadata = re.compile(r"^(说明|时长|备注|镜头|画面|音色|语速|输出|提示)[:：]")
    for index, title, body in matches:
        stripped_lines = (raw_line.strip() for raw_line in body.splitlines())
        lines = [
            line
            for line in stripped_lines
            if line and not line.startswith("#") and not metadata.match(line)
        ]
        text = "\n".join(lines).replace("\t", " ").strip()
        if not text:
            raise ValueError(f"Section {index} has no readable text.")
        segments.append(ScriptSegment(index=index, title=title.strip(), text=text))
    return segments


def build_auth_url(request_url: str, api_key: str, api_secret: str) -> str:
    """Return *request_url* extended with HMAC-SHA256 authentication parameters.

    A three-line string made of the host, the current GMT date and the
    ``GET <path> HTTP/1.1`` request line is signed with *api_secret*.  The
    Base64 signature is embedded, together with *api_key*, in an authorization
    string that is itself Base64-encoded.  The result is *request_url*
    followed by the percent-quoted ``authorization``, ``date`` and ``host``
    query parameters.

    Args:
        request_url: Endpoint URL; only its host name and path are signed.
        api_key: Key placed in the authorization string.
        api_secret: Secret used as the HMAC key.
    """
    parsed = urlparse(request_url)
    host = parsed.hostname or ""
    request_path = parsed.path or "/"
    timestamp = email.utils.formatdate(usegmt=True)

    to_sign = f"host: {host}\ndate: {timestamp}\nGET {request_path} HTTP/1.1"
    mac = hmac.new(api_secret.encode("utf-8"), to_sign.encode("utf-8"), hashlib.sha256)
    signature = base64.b64encode(mac.digest()).decode("ascii")

    auth_plain = (
        f'api_key="{api_key}", algorithm="hmac-sha256", '
        f'headers="host date request-line", signature="{signature}"'
    )
    auth_token = base64.b64encode(auth_plain.encode("utf-8")).decode("ascii")

    # Parameter order is part of the produced URL, so it is kept explicit.
    query_items = (("authorization", auth_token), ("date", timestamp), ("host", host))
    query = "&".join(f"{key}={quote(item)}" for key, item in query_items)
    return f"{request_url}?{query}"


def require_websocket():
    """Import and return the ``websocket`` module, or exit with install help.

    The import is done lazily so that code paths which never open a
    connection do not need the package.

    Raises:
        SystemExit: If the module cannot be imported; the message names the
            ``websocket-client`` package and the pip command to install it.
    """
    try:
        import websocket as ws_module  # type: ignore
    except ImportError as exc:
        install_hint = (
            "Missing dependency: websocket-client. Install it with:\n"
            "  python3 -m pip install -r Tools_scripts_XunFei-Ubuntu/requirements-ubuntu.txt"
        )
        raise SystemExit(install_hint) from exc
    else:
        return ws_module


def recv_json(socket: Any) -> dict[str, Any]:
    """Receive one message from *socket* and decode it as JSON.

    Args:
        socket: Object whose ``recv()`` returns the next message as ``str``
            or as UTF-8 encoded ``bytes``.

    Returns:
        The parsed JSON document.

    Raises:
        RuntimeError: If ``recv()`` yields an empty message.
        json.JSONDecodeError: If a non-empty message is not valid JSON.
    """
    message = socket.recv()
    if isinstance(message, bytes):
        message = message.decode("utf-8")
    if not message:
        # NOTE(review): websocket-client appears to return "" when the peer
        # sends a close frame - confirm.  Without this check the empty string
        # reaches json.loads() and surfaces as an opaque "Expecting value"
        # decode error instead of a clear failure.
        raise RuntimeError(
            "XFYUN returned an empty message; the connection was probably "
            "closed before synthesis finished."
        )
    return json.loads(message)


def synthesize_normal(
    *,
    text: str,
    out_file: Path,
    app_id: str,
    api_key: str,
    api_secret: str,
    voice: str,
    speed: int,
    volume: int,
    pitch: int,
) -> None:
    """Synthesize *text* through ``NORMAL_TTS_URL`` and write the audio to *out_file*.

    A signed WebSocket connection is opened (30 second timeout), a single
    request carrying the whole Base64-encoded text is sent, and the
    Base64-decoded ``data.audio`` chunks of every reply are accumulated until
    a reply reports ``data.status == 2``.  The connection is always closed,
    and the file is written only after the exchange finished.

    Args:
        text: Text to speak.
        out_file: Destination file for the received audio bytes.
        app_id: Application id sent in the ``common`` section.
        api_key: Key used to sign the connection URL.
        api_secret: Secret used to sign the connection URL.
        voice: Value sent as ``vcn``.
        speed: Value sent as ``speed``.
        volume: Value sent as ``volume``.
        pitch: Value sent as ``pitch``.

    Raises:
        RuntimeError: If a reply carries a non-zero ``code`` or no audio was
            received at all.
    """
    ws_module = require_websocket()
    signed_url = build_auth_url(NORMAL_TTS_URL, api_key, api_secret)
    connection = ws_module.create_connection(signed_url, timeout=30)
    collected = bytearray()
    try:
        encoded_text = base64.b64encode(text.encode("utf-8")).decode("ascii")
        business = {
            "aue": "lame",
            "sfl": 1,
            "auf": "audio/L16;rate=16000",
            "vcn": voice,
            "speed": speed,
            "volume": volume,
            "pitch": pitch,
            "bgs": 0,
            "tte": "UTF8",
            "reg": "2",
            "rdn": "0",
        }
        request = {
            "common": {"app_id": app_id},
            "business": business,
            # status 2 is sent with the only frame of this request.
            "data": {"status": 2, "text": encoded_text},
        }
        connection.send(json.dumps(request, ensure_ascii=False, separators=(",", ":")))

        finished = False
        while not finished:
            reply = recv_json(connection)
            code = reply.get("code", 0)
            if code != 0:
                raise RuntimeError(
                    f"XFYUN normal TTS failed: code={code}, "
                    f"message={reply.get('message')}"
                )
            data = reply.get("data") or {}
            chunk = data.get("audio")
            if chunk:
                collected.extend(base64.b64decode(chunk))
            # A reply with data.status == 2 ends the exchange.
            finished = data.get("status") == 2
    finally:
        connection.close()

    if not collected:
        raise RuntimeError("No audio data returned by XFYUN normal TTS.")
    out_file.write_bytes(collected)


def synthesize_super(
    *,
    text: str,
    out_file: Path,
    app_id: str,
    api_key: str,
    api_secret: str,
    voice: str,
    speed: int,
    volume: int,
    pitch: int,
    raw_text: bool,
) -> None:
    """Synthesize *text* through ``SUPER_TTS_URL`` and write the audio to *out_file*.

    A signed WebSocket connection is opened (30 second timeout) and one
    request is sent.  The Base64-decoded ``payload.audio.audio`` chunks of
    every reply are accumulated until either ``header.status`` or
    ``payload.audio.status`` equals 2.  The connection is always closed, and
    the file is written only after the exchange finished.

    Args:
        text: Text to speak.
        out_file: Destination file for the received audio bytes.
        app_id: Application id sent in the request header.
        api_key: Key used to sign the connection URL.
        api_secret: Secret used to sign the connection URL.
        voice: Value sent as ``vcn``.
        speed: Value sent as ``speed``.
        volume: Value sent as ``volume``.
        pitch: Value sent as ``pitch``.
        raw_text: Send *text* unchanged instead of Base64-encoding it.

    Raises:
        RuntimeError: If a reply carries a non-zero ``code`` (in its header
            or at top level) or no audio was received at all.
    """
    ws_module = require_websocket()
    signed_url = build_auth_url(SUPER_TTS_URL, api_key, api_secret)
    connection = ws_module.create_connection(signed_url, timeout=30)
    collected = bytearray()
    if raw_text:
        request_text = text
    else:
        request_text = base64.b64encode(text.encode("utf-8")).decode("ascii")
    try:
        oral_options = {
            "oral_level": "mid",
            "spark_assist": 1,
            "remain": 1,
        }
        audio_format = {
            "encoding": "lame",
            "sample_rate": 24000,
            "channels": 1,
            "bit_depth": 16,
            "frame_size": 0,
        }
        tts_options = {
            "vcn": voice,
            "speed": speed,
            "volume": volume,
            "pitch": pitch,
            "bgs": 0,
            "reg": 0,
            "rdn": 0,
            "rhy": 0,
            "watermask": 0,
            "implicit_watermark": False,
            "audio": audio_format,
        }
        text_part = {
            "encoding": "utf8",
            "compress": "raw",
            "format": "plain",
            "status": 2,
            "seq": 0,
            "text": request_text,
        }
        request = {
            "header": {"app_id": app_id, "status": 2},
            "parameter": {"oral": oral_options, "tts": tts_options},
            "payload": {"text": text_part},
        }
        connection.send(json.dumps(request, ensure_ascii=False, separators=(",", ":")))

        finished = False
        while not finished:
            reply = recv_json(connection)
            header = reply.get("header") or {}
            # An absent header becomes {}, whose code defaults to 0 (no error).
            if header.get("code", 0) != 0:
                raise RuntimeError(
                    f"XFYUN super TTS failed: code={header.get('code')}, "
                    f"message={header.get('message')}, sid={header.get('sid')}"
                )
            top_level_code = reply.get("code", 0)
            if top_level_code != 0:
                raise RuntimeError(
                    f"XFYUN super TTS failed: code={top_level_code}, "
                    f"message={reply.get('message')}"
                )
            audio_part = (reply.get("payload") or {}).get("audio") or {}
            chunk = audio_part.get("audio")
            if chunk:
                collected.extend(base64.b64decode(chunk))
            # Either status field reaching 2 ends the exchange.
            finished = header.get("status") == 2 or audio_part.get("status") == 2
    finally:
        connection.close()

    if not collected:
        raise RuntimeError("No audio data returned by XFYUN super TTS.")
    out_file.write_bytes(collected)


def validate_range(name: str, value: int) -> None:
    """Check that *value* lies in the inclusive range 0 to 100.

    Args:
        name: Option name used in the error message.
        value: Number to check.

    Raises:
        ValueError: If *value* is below 0 or above 100.
    """
    if 0 <= value <= 100:
        return
    raise ValueError(f"{name} must be between 0 and 100.")


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script from ``sys.argv``.

    Returns:
        Namespace with ``script``, ``output_dir``, ``mode``, ``voice``,
        ``speed``, ``volume``, ``pitch``, ``raw_text``, ``overwrite`` and
        ``dry_run``.
    """
    parser = argparse.ArgumentParser(description="Generate XFYUN TTS audio on Ubuntu.")
    # Registration order is kept because it determines the --help layout.
    option_specs = (
        ("--script", {"type": Path, "default": None, "help": "Markdown script path."}),
        ("--output-dir", {"type": Path, "default": Path("02_audio/xfyun_tts")}),
        ("--mode", {"choices": ["normal", "super"], "default": "super"}),
        ("--voice", {"default": None, "help": "XFYUN vcn voice name."}),
        ("--speed", {"type": int, "default": 50}),
        ("--volume", {"type": int, "default": 70}),
        ("--pitch", {"type": int, "default": 50}),
        ("--raw-text", {"action": "store_true", "help": "Use raw text for super TTS."}),
        ("--overwrite", {"action": "store_true", "help": "Overwrite existing mp3 files."}),
        ("--dry-run", {"action": "store_true", "help": "Only parse script and print plan."}),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser.parse_args()


def main() -> int:
    """Run the command-line workflow and return the process exit code.

    Steps: parse and range-check the options, load the script segments,
    print the plan, and stop there for ``--dry-run``.  Otherwise read the
    credentials from ``XF_APPID``, ``XF_APIKEY`` and ``XF_APISECRET``, create
    the output directory and synthesize one ``<index>-<title>.mp3`` per
    segment.  Existing files are skipped unless ``--overwrite`` is given.

    Returns:
        0 on success.

    Raises:
        SystemExit: If any of the three credential variables is unset or empty.
    """
    args = parse_args()
    for option in ("speed", "volume", "pitch"):
        validate_range(option, getattr(args, option))

    script_path = args.script or find_default_script(Path.cwd())
    segments = load_segments(script_path)
    # An explicit --voice wins; otherwise each mode has its own default voice.
    if args.voice:
        voice = args.voice
    elif args.mode == "normal":
        voice = "xiaoyan"
    else:
        voice = "x5_lingfeiyi_flow"

    print(f"script={script_path}")
    print(f"mode={args.mode}")
    print(f"voice={voice}")
    print(f"segments={len(segments)}")
    for planned in segments:
        print(f"  {planned.index}. {planned.title} ({len(planned.text)} chars)")

    if args.dry_run:
        return 0

    credentials = {
        "app_id": os.environ.get("XF_APPID"),
        "api_key": os.environ.get("XF_APIKEY"),
        "api_secret": os.environ.get("XF_APISECRET"),
    }
    if not all(credentials.values()):
        raise SystemExit("Please set XF_APPID, XF_APIKEY and XF_APISECRET first.")

    # Keyword arguments that are the same for every segment.
    shared_options = dict(
        credentials,
        voice=voice,
        speed=args.speed,
        volume=args.volume,
        pitch=args.pitch,
    )
    if args.mode == "normal":
        synthesize = synthesize_normal
    else:
        synthesize = synthesize_super
        # Only the super endpoint takes the raw-text switch.
        shared_options["raw_text"] = args.raw_text

    args.output_dir.mkdir(parents=True, exist_ok=True)
    for segment in segments:
        out_file = args.output_dir / f"{segment.index}-{safe_filename(segment.title)}.mp3"
        if not args.overwrite and out_file.exists():
            print(f"skip existing: {out_file}")
            continue
        print(f"synthesizing {segment.index}: {segment.title}")
        synthesize(text=segment.text, out_file=out_file, **shared_options)
        print(f"generated: {out_file}")

    print("all voice files generated")
    return 0


if __name__ == "__main__":
    # Exit with main()'s return value; an interrupt (Ctrl-C) exits with
    # status 130 instead of printing a traceback.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        exit_code = 130
    raise SystemExit(exit_code)