feat: 建立 SAM2 标注闭环基线

- 打通工作区真实标注闭环:支持手工多边形、矩形、圆形、点区域和线段生成 mask,并可保存、回显、更新和删除后端 annotation。

- 增强 polygon 编辑器:支持顶点拖动、顶点删除、边中点插入、多 polygon 子区域选择编辑,以及区域合并和区域去除。

- 接入 GT mask 导入:后端支持二值/多类别 mask 拆分、contour 转 polygon、distance transform seed point,前端支持导入、回显和 seed point 拖动编辑。

- 完善导出能力:COCO JSON 导出对齐前端,PNG mask ZIP 同时包含单标注 mask、按 zIndex 融合的 semantic_frame 和 semantic_classes.json。

- 打通异步任务管理:新增任务取消、重试、失败详情接口与 Dashboard 控件,worker 支持取消状态检查并通过 Redis/WebSocket 推送 cancelled 事件。

- 对接 Dashboard 后端数据:概览统计、解析队列和实时流转记录从 FastAPI 聚合接口与 WebSocket 更新。

- 增强 AI 推理参数:前端发送 crop_to_prompt、auto_filter_background 和 min_score,后端支持点/框 prompt 局部裁剪推理、结果回映射和负向点/低分过滤。

- 接入 SAM3 基础设施:新增独立 Python 3.12 sam3 环境安装脚本、外部 worker helper、后端桥接和真实 Python/CUDA/包/HF checkpoint access 状态检测。

- 保留 SAM3 授权边界:当前官方 facebook/sam3 gated 权重未授权时状态接口会返回不可用,不伪装成可推理。

- 增强前端状态管理:新增 mask undo/redo 历史栈、AI 模型选择状态、保存状态 dirty/draft/saved 流转和项目状态归一化。

- 更新前端 API 封装:补充 annotation CRUD、GT mask import、mask ZIP export、task cancel/retry/detail、AI runtime status 和 prediction options。

- 更新 UI 控件:ToolsPalette、AISegmentation、VideoWorkspace 和 CanvasArea 接入真实操作、导入导出、撤销重做、任务控制和模型状态。

- 新增 polygon-clipping 依赖,用于前端区域 union/difference 几何运算。

- 完善后端 schemas/status/progress:补充 AI 模型外部状态字段、任务 cancelled 状态和进度事件 payload。

- 补充测试覆盖:新增后端任务控制、SAM3 桥接、GT mask、导出融合、AI options 测试;补充前端 Canvas、Dashboard、VideoWorkspace、ToolsPalette、API 和 store 测试。

- 更新 README、AGENTS 和 doc 文档:冻结当前需求/设计/测试计划,标注真实功能、剩余 Mock、SAM3 授权边界和后续实施顺序。
This commit is contained in:
2026-05-01 15:26:25 +08:00
parent f020ff3b4f
commit 689a9ba283
48 changed files with 3280 additions and 176 deletions

View File

@@ -0,0 +1,190 @@
"""Standalone SAM 3 helper for the dedicated Python 3.12 runtime.
The main FastAPI backend can keep running in the existing Python 3.11/SAM 2
environment while this helper is executed with a separate conda env that meets
SAM 3's stricter runtime requirements.
"""
from __future__ import annotations
import argparse
import importlib.util
import json
import os
import sys
from pathlib import Path
from typing import Any
import numpy as np
from PIL import Image
def _torch_status() -> tuple[bool, str | None, str | None, str | None]:
try:
import torch
cuda_available = bool(torch.cuda.is_available())
return (
cuda_available,
getattr(torch, "__version__", None),
getattr(torch.version, "cuda", None),
torch.cuda.get_device_name(0) if cuda_available else None,
)
except Exception: # noqa: BLE001
return False, None, None, None
def _compact_error(exc: Exception) -> str:
lines = [line.strip() for line in str(exc).splitlines() if line.strip()]
for line in lines:
if "Access to model" in line or "Cannot access gated repo" in line:
return line
return lines[0] if lines else exc.__class__.__name__
def _checkpoint_access(model_version: str) -> tuple[bool, str | None]:
try:
from huggingface_hub import hf_hub_download
repo_id = "facebook/sam3.1" if model_version == "sam3.1" else "facebook/sam3"
hf_hub_download(repo_id=repo_id, filename="config.json")
return True, None
except Exception as exc: # noqa: BLE001
return False, _compact_error(exc)
def runtime_status() -> dict[str, Any]:
model_version = os.environ.get("SAM3_MODEL_VERSION", "sam3")
package_error = None
package_available = importlib.util.find_spec("sam3") is not None
if package_available:
try:
import sam3 # noqa: F401
except Exception as exc: # noqa: BLE001
package_available = False
package_error = str(exc)
cuda_available, torch_version, cuda_version, device_name = _torch_status()
python_ok = sys.version_info >= (3, 12)
checkpoint_access = False
checkpoint_error = None
if package_available:
checkpoint_access, checkpoint_error = _checkpoint_access(model_version)
available = bool(package_available and python_ok and cuda_available and checkpoint_access)
missing = []
if not python_ok:
missing.append("Python 3.12+ runtime")
if not package_available:
missing.append(f"sam3 package ({package_error})" if package_error else "sam3 package")
if torch_version is None:
missing.append("PyTorch")
if not cuda_available:
missing.append("CUDA GPU")
if package_available and not checkpoint_access:
missing.append(f"Hugging Face checkpoint access ({checkpoint_error})")
return {
"available": available,
"package_available": package_available,
"checkpoint_access": checkpoint_access,
"python_ok": python_ok,
"torch_ok": torch_version is not None,
"torch_version": torch_version,
"cuda_version": cuda_version,
"cuda_available": cuda_available,
"device": "cuda" if cuda_available else "unavailable",
"device_name": device_name,
"message": (
"SAM 3 external runtime is ready."
if available
else f"SAM 3 external runtime unavailable: missing {', '.join(missing)}."
),
}
def _mask_to_polygon(mask: np.ndarray) -> list[list[float]]:
import cv2
if mask.dtype != np.uint8:
mask = (mask > 0).astype(np.uint8)
contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
height, width = mask.shape[:2]
largest = []
for contour in contours:
if len(contour) > len(largest):
largest = contour
if len(largest) < 3:
return []
return [[float(point[0][0]) / width, float(point[0][1]) / height] for point in largest]
def _to_numpy(value: Any) -> np.ndarray:
if hasattr(value, "detach"):
value = value.detach().cpu().numpy()
elif hasattr(value, "cpu"):
value = value.cpu().numpy()
return np.asarray(value)
def predict(request_path: Path) -> dict[str, Any]:
import torch
from sam3.model.sam3_image_processor import Sam3Processor
from sam3.model_builder import build_sam3_image_model
payload = json.loads(request_path.read_text(encoding="utf-8"))
image_path = Path(payload["image_path"])
text = str(payload["text"]).strip()
threshold = float(payload.get("confidence_threshold", 0.5))
if not text:
raise ValueError("SAM 3 semantic prompt requires non-empty text.")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
image = Image.open(image_path).convert("RGB")
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
model = build_sam3_image_model()
processor = Sam3Processor(model, confidence_threshold=threshold)
state = processor.set_image(image)
output = processor.set_text_prompt(state=state, prompt=text)
masks = _to_numpy(output.get("masks", []))
scores = _to_numpy(output.get("scores", []))
if masks.ndim == 4:
masks = masks[:, 0]
elif masks.ndim == 3 and masks.shape[0] == 1:
masks = masks[None, 0]
polygons = []
for mask in masks:
polygon = _mask_to_polygon(mask)
if polygon:
polygons.append(polygon)
return {
"polygons": polygons,
"scores": scores.astype(float).tolist() if scores.size else [],
}
def main() -> int:
parser = argparse.ArgumentParser(description="SAM 3 external runtime helper")
parser.add_argument("--status", action="store_true")
parser.add_argument("--request", type=Path)
args = parser.parse_args()
try:
if args.status:
print(json.dumps(runtime_status(), ensure_ascii=False))
return 0
if args.request:
print(json.dumps(predict(args.request), ensure_ascii=False))
return 0
parser.error("Use --status or --request")
except Exception as exc: # noqa: BLE001
print(json.dumps({"error": str(exc)}, ensure_ascii=False), file=sys.stderr)
return 1
return 2
if __name__ == "__main__":
raise SystemExit(main())