feat: 完善视频传播、标注编辑和拆帧闭环

- 接入 SAM2 视频传播能力：新增 /api/ai/propagate，支持用当前帧 mask/polygon/bbox 作为 seed，通过 SAM2 video predictor 向前、向后或双向传播，并可保存为真实 annotation。
- 接入 SAM3 video tracker：通过独立 Python 3.12 external worker 调用 SAM3 video predictor/tracker，使用本地 checkpoint 与 bbox seed 执行视频级跟踪，并在模型状态中标记 video_track 能力。
- 完善 SAM 模型分发：sam_registry 按 model_id 明确区分 sam2 propagation 与 sam3 video_track，避免两个模型链路混用。
- 打通前端“传播片段”：VideoWorkspace 使用当前选中 mask 和当前 AI 模型调用后端传播接口，传播结果回写并刷新工作区已保存标注。
- 增强 SAM3 本地 checkpoint 配置：新增 sam3_checkpoint_path 配置和 .env.example 示例，状态检查改为基于本地 checkpoint/独立环境/模型包可用性。
- 完善视频拆帧参数：/api/media/parse 支持 parse_fps、max_frames、target_width，后端任务保存帧时间戳、源帧号和 frame_sequence 元数据。
- 增加运行时 schema 兼容处理：启动时为旧 frames 表补充 timestamp_ms 和 source_frame_number 列，避免旧库升级后缺字段。
- 强化 Canvas 标注编辑：补齐多边形闭合、点工具、顶点拖拽、边中点插入、Delete/Backspace 删除、区域合并和重叠去除等交互。
- 增强语义分类联动：选中 mask 后可通过右侧语义分类树更新标签、颜色和 class metadata，并同步到保存/导出链路。
- 增加关键帧时间轴体验：FrameTimeline 显示具体时间信息，并支持键盘左右方向键切换关键帧。
- 完善 AI 交互分割参数：前端保留正向点、反向点、框选和 interactive prompt 的调用状态，支持 SAM2 细化候选区域与 SAM3 bbox 入口。
- 扩展后端/前端 API 类型：新增 propagateMasks、传播请求/响应 schema，并补齐 annotation、导出、模型状态和任务接口的测试覆盖。
- 更新项目文档：同步 README、AGENTS、接口契约、需求冻结、设计冻结、前端元素审计、实施计划和测试计划，标明真实功能边界与剩余风险。
- 增加测试覆盖：补充 SAM2/SAM3 传播、SAM3 状态、媒体拆帧参数、Canvas 编辑、语义标签切换、时间轴、工作区传播和 API 合约测试。
- 加强仓库安全边界：将 sam3权重/ 加入 .gitignore，避免本地模型权重被误提交。

验证：npm run test:run；pytest backend/tests；npm run lint；npm run build；python -m py_compile；git diff --check。
2026-05-01 20:27:33 +08:00
parent 689a9ba283
commit 5ab4602535
43 changed files with 2722 additions and 216 deletions
--- a/backend/services/sam3_external_worker.py
+++ b/backend/services/sam3_external_worker.py
@@ -43,6 +43,13 @@ def _compact_error(exc: Exception) -> str:


 def _checkpoint_access(model_version: str) -> tuple[bool, str | None]:
+    checkpoint_path = os.environ.get("SAM3_CHECKPOINT_PATH", "").strip()
+    if checkpoint_path:
+        path = Path(checkpoint_path)
+        if path.is_file():
+            return True, None
+        return False, f"local checkpoint not found: {checkpoint_path}"
+
    try:
        from huggingface_hub import hf_hub_download

@@ -55,6 +62,7 @@ def _checkpoint_access(model_version: str) -> tuple[bool, str | None]:

 def runtime_status() -> dict[str, Any]:
    model_version = os.environ.get("SAM3_MODEL_VERSION", "sam3")
+    checkpoint_path = os.environ.get("SAM3_CHECKPOINT_PATH", "").strip() or None
    package_error = None
    package_available = importlib.util.find_spec("sam3") is not None
    if package_available:
@@ -85,6 +93,7 @@ def runtime_status() -> dict[str, Any]:
        "available": available,
        "package_available": package_available,
        "checkpoint_access": checkpoint_access,
+        "checkpoint_path": checkpoint_path or f"official/HuggingFace ({model_version})",
        "python_ok": python_ok,
        "torch_ok": torch_version is not None,
        "torch_version": torch_version,
@@ -118,34 +127,67 @@ def _mask_to_polygon(mask: np.ndarray) -> list[list[float]]:

 def _to_numpy(value: Any) -> np.ndarray:
+    """Convert a tensor-like or array-like value to a NumPy array.
+
+    Objects exposing ``detach`` or ``cpu`` are detached / moved with
+    ``cpu()`` and converted with ``numpy()``; floating-point values are
+    cast with ``float()`` before that conversion. Anything else is handed
+    directly to ``np.asarray``.
+    """
    if hasattr(value, "detach"):
-        value = value.detach().cpu().numpy()
-    elif hasattr(value, "cpu"):
+        value = value.detach()
+        # presumably required because NumPy cannot represent reduced-precision
+        # dtypes such as bfloat16 (inference runs under bfloat16 autocast)
+        # — TODO confirm
+        if hasattr(value, "is_floating_point") and value.is_floating_point():
+            value = value.float()
        value = value.cpu().numpy()
+    elif hasattr(value, "cpu"):
+        # Same conversion for objects that offer ``cpu`` but not ``detach``.
+        value = value.cpu()
+        if hasattr(value, "is_floating_point") and value.is_floating_point():
+            value = value.float()
+        value = value.numpy()
    return np.asarray(value)


-def predict(request_path: Path) -> dict[str, Any]:
-    import torch
-    from sam3.model.sam3_image_processor import Sam3Processor
-    from sam3.model_builder import build_sam3_image_model
+def _xyxy_to_cxcywh(box: list[float]) -> list[float]:
+    """Convert a corner-format box into center/size format.
+
+    Every coordinate is clamped to ``[0.0, 1.0]`` and each axis is ordered,
+    so swapped corners or out-of-range values still produce a usable box.
+
+    Args:
+        box: Four numbers ``[x1, y1, x2, y2]``.
+
+    Returns:
+        ``[center_x, center_y, width, height]``, where width and height are
+        never smaller than ``1e-6``.
+
+    Raises:
+        ValueError: If ``box`` does not contain exactly four values.
+    """
+    if len(box) != 4:
+        raise ValueError("SAM 3 box prompt requires [x1, y1, x2, y2].")
+    clamped = [min(max(float(coord), 0.0), 1.0) for coord in box]
+    # Even slots hold the x corners, odd slots the y corners.
+    x_lo, x_hi = sorted(clamped[0::2])
+    y_lo, y_hi = sorted(clamped[1::2])
+    # Floor the extent so a zero-area box keeps a positive size.
+    box_w = max(x_hi - x_lo, 1e-6)
+    box_h = max(y_hi - y_lo, 1e-6)
+    return [x_lo + box_w / 2, y_lo + box_h / 2, box_w, box_h]

-    payload = json.loads(request_path.read_text(encoding="utf-8"))
-    image_path = Path(payload["image_path"])
-    text = str(payload["text"]).strip()
-    threshold = float(payload.get("confidence_threshold", 0.5))
-    if not text:
-        raise ValueError("SAM 3 semantic prompt requires non-empty text.")

-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
+def _bbox_from_seed(seed: dict[str, Any]) -> list[float]:
+    """Build the box prompt for video tracking from a seed annotation.
+
+    A four-element list under ``seed["bbox"]`` is used directly, with each
+    value clamped to ``[0.0, 1.0]``. Otherwise the box is the bounding
+    rectangle of every point found in ``seed["polygons"]``, returned as
+    ``[left, top, width, height]`` with width and height floored at ``1e-6``.
+
+    Raises:
+        ValueError: If the seed offers neither a four-element bbox nor any
+            polygon point with at least two coordinates.
+    """
+    explicit_box = seed.get("bbox")
+    if isinstance(explicit_box, list) and len(explicit_box) == 4:
+        # NOTE(review): only clamped, not converted — presumably already
+        # [left, top, width, height] like the polygon path; confirm with caller.
+        return [min(max(float(coord), 0.0), 1.0) for coord in explicit_box]

-    image = Image.open(image_path).convert("RGB")
-    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
-        model = build_sam3_image_model()
-        processor = Sam3Processor(model, confidence_threshold=threshold)
-        state = processor.set_image(image)
-        output = processor.set_text_prompt(state=state, prompt=text)
+    vertices: list[Any] = []
+    for ring in seed.get("polygons") or []:
+        # Points with fewer than two coordinates are ignored.
+        vertices.extend(vertex for vertex in ring if len(vertex) >= 2)
+    if not vertices:
+        raise ValueError("SAM 3 video tracking requires seed bbox or polygons.")
+    xs = [min(max(float(vertex[0]), 0.0), 1.0) for vertex in vertices]
+    ys = [min(max(float(vertex[1]), 0.0), 1.0) for vertex in vertices]
+    left, top = min(xs), min(ys)
+    right, bottom = max(xs), max(ys)
+    return [left, top, max(right - left, 1e-6), max(bottom - top, 1e-6)]

+
+def _video_outputs_to_response(outputs: dict[str, Any]) -> dict[str, Any]:
+    """Turn one frame of video-predictor outputs into a JSON-friendly dict.
+
+    Args:
+        outputs: Mapping read for ``out_binary_masks``, ``out_probs`` and
+            ``out_obj_ids``; a missing key is treated as empty.
+
+    Returns:
+        ``{"polygons": [...], "scores": [...], "object_ids": [...]}`` with one
+        entry per mask for which ``_mask_to_polygon`` returned a non-empty
+        polygon. A mask without a matching score gets ``1.0``; a mask without
+        a matching object id gets its 1-based position.
+    """
+    masks = _to_numpy(outputs.get("out_binary_masks", []))
+    # Flatten so 0-d, (N,) and (N, 1) layouts all index to plain scalars;
+    # float()/int() on a non-scalar array element is deprecated in NumPy.
+    scores = _to_numpy(outputs.get("out_probs", [])).reshape(-1)
+    obj_ids = _to_numpy(outputs.get("out_obj_ids", [])).reshape(-1)
+    if masks.ndim == 4:
+        # Drop the second axis, keeping its first slice per mask.
+        masks = masks[:, 0]
+    elif masks.ndim == 2:
+        # A single 2-D mask: add a leading axis so the loop sees one entry.
+        masks = masks[None, ...]
+
+    polygons = []
+    out_scores = []
+    out_ids = []
+    for index, mask in enumerate(masks):
+        polygon = _mask_to_polygon(mask)
+        if not polygon:
+            continue
+        polygons.append(polygon)
+        out_scores.append(float(scores[index]) if index < scores.size else 1.0)
+        out_ids.append(int(obj_ids[index]) if index < obj_ids.size else index + 1)
+    return {"polygons": polygons, "scores": out_scores, "object_ids": out_ids}
+
+
+def _prediction_to_response(output: dict[str, Any]) -> dict[str, Any]:
    masks = _to_numpy(output.get("masks", []))
    scores = _to_numpy(output.get("scores", []))
    if masks.ndim == 4:
@@ -165,6 +207,115 @@ def predict(request_path: Path) -> dict[str, Any]:
    }


+def predict_video(request_path: Path) -> dict[str, Any]:
+    """Track a seeded object through a frame directory with the SAM 3 video predictor.
+
+    The JSON file at ``request_path`` is read for:
+
+    * ``frame_dir`` (required): passed to the predictor as ``resource_path``.
+    * ``source_frame_index`` (default ``0``): frame that receives the prompt
+      and where propagation starts.
+    * ``seed``: annotation turned into a box prompt by ``_bbox_from_seed``.
+    * ``direction`` (default ``"forward"``): ``"forward"``, ``"backward"``
+      or ``"both"``.
+    * ``max_frames``: optional limit forwarded as ``max_frame_num_to_track``;
+      a missing or falsy value means no limit is sent.
+    * ``checkpoint_path``: falls back to the ``SAM3_CHECKPOINT_PATH``
+      environment variable.
+    * ``confidence_threshold`` (default ``0.5``).
+
+    Returns:
+        ``{"frames": [...]}`` where each entry is the dict produced by
+        ``_video_outputs_to_response`` plus a ``frame_index`` key.
+
+    Raises:
+        ValueError: For an unsupported ``direction`` or a seed without a
+            usable bbox or polygon points.
+        KeyError: If the request lacks ``frame_dir``.
+    """
+    import torch
+    from sam3.model_builder import build_sam3_video_predictor
+
+    payload = json.loads(request_path.read_text(encoding="utf-8"))
+    frame_dir = Path(payload["frame_dir"])
+    source_frame_index = int(payload.get("source_frame_index", 0))
+    seed = payload.get("seed") or {}
+    direction = str(payload.get("direction") or "forward").lower()
+    max_frames = payload.get("max_frames")
+    max_frames = int(max_frames) if max_frames else None
+    checkpoint_path = str(payload.get("checkpoint_path") or os.environ.get("SAM3_CHECKPOINT_PATH", "")).strip()
+    threshold = float(payload.get("confidence_threshold", 0.5))
+    if direction not in {"forward", "backward", "both"}:
+        raise ValueError(f"Unsupported propagation direction: {direction}")
+    # Resolve the seed box up front so a malformed seed fails before the
+    # predictor is built.
+    seed_box = _bbox_from_seed(seed)
+
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+    predictor = build_sam3_video_predictor(
+        checkpoint_path=checkpoint_path or None,
+        async_loading_frames=False,
+    )
+    session_id = None
+    frames = []
+    try:
+        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+            session = predictor.handle_request(
+                {
+                    "type": "start_session",
+                    "resource_path": str(frame_dir),
+                    "offload_video_to_cpu": True,
+                    "offload_state_to_cpu": True,
+                }
+            )
+            session_id = session["session_id"]
+            predictor.handle_request(
+                {
+                    "type": "add_prompt",
+                    "session_id": session_id,
+                    "frame_index": source_frame_index,
+                    "bounding_boxes": [seed_box],
+                    "bounding_box_labels": [1],
+                    "output_prob_thresh": threshold,
+                    "rel_coordinates": True,
+                }
+            )
+            for item in predictor.handle_stream_request(
+                {
+                    "type": "propagate_in_video",
+                    "session_id": session_id,
+                    "propagation_direction": direction,
+                    "start_frame_index": source_frame_index,
+                    "max_frame_num_to_track": max_frames,
+                    "output_prob_thresh": threshold,
+                }
+            ):
+                frame_response = _video_outputs_to_response(item.get("outputs") or {})
+                frame_response["frame_index"] = int(item["frame_index"])
+                frames.append(frame_response)
+    finally:
+        # Compare against None so a falsy but valid session id is still closed.
+        if session_id is not None:
+            predictor.handle_request({"type": "close_session", "session_id": session_id})
+
+    return {"frames": frames}
+
+
+def predict(request_path: Path) -> dict[str, Any]:
+    """Run the SAM 3 request described by the JSON file at ``request_path``.
+
+    ``prompt_type`` selects the path: ``"video_track"`` is delegated to
+    ``predict_video``; ``"semantic"`` (the default) segments by ``text``;
+    ``"box"`` segments by ``box`` given as ``[x1, y1, x2, y2]``. The
+    checkpoint comes from ``checkpoint_path`` in the request, else from the
+    ``SAM3_CHECKPOINT_PATH`` environment variable; when neither is set the
+    image model is built with ``load_from_HF=True``.
+
+    Returns:
+        For ``"video_track"``, the result of ``predict_video``; otherwise the
+        dict built by ``_prediction_to_response`` from the processor output.
+
+    Raises:
+        ValueError: For an empty semantic text, an unsupported prompt type,
+            or a box that does not hold exactly four values.
+        KeyError: If an image request lacks ``image_path``.
+    """
+    payload = json.loads(request_path.read_text(encoding="utf-8"))
+    if str(payload.get("prompt_type") or "").strip().lower() == "video_track":
+        # Video tracking loads its own predictor; skip the image-model imports.
+        return predict_video(request_path)
+
+    import torch
+    from sam3.model.sam3_image_processor import Sam3Processor
+    from sam3.model_builder import build_sam3_image_model
+
+    image_path = Path(payload["image_path"])
+    prompt_type = str(payload.get("prompt_type") or "semantic").strip().lower()
+    text = str(payload.get("text") or "").strip()
+    threshold = float(payload.get("confidence_threshold", 0.5))
+    checkpoint_path = str(payload.get("checkpoint_path") or os.environ.get("SAM3_CHECKPOINT_PATH", "")).strip()
+    if prompt_type == "semantic" and not text:
+        raise ValueError("SAM 3 semantic prompt requires non-empty text.")
+    if prompt_type not in {"semantic", "box"}:
+        raise ValueError(f"Unsupported SAM 3 prompt type: {prompt_type}")
+    # Convert (and thereby validate) the box before the model is built so a
+    # malformed request fails fast.
+    box_prompt = _xyxy_to_cxcywh(payload.get("box") or []) if prompt_type == "box" else None
+
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+    # Close the source file as soon as the RGB copy exists.
+    with Image.open(image_path) as source_image:
+        image = source_image.convert("RGB")
+    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+        model = build_sam3_image_model(
+            checkpoint_path=checkpoint_path or None,
+            load_from_HF=not bool(checkpoint_path),
+        )
+        processor = Sam3Processor(model, confidence_threshold=threshold)
+        state = processor.set_image(image)
+        if box_prompt is not None:
+            output = processor.add_geometric_prompt(
+                state=state,
+                box=box_prompt,
+                label=True,
+            )
+        else:
+            output = processor.set_text_prompt(state=state, prompt=text)
+
+    return _prediction_to_response(output)
+
+
 def main() -> int:
    parser = argparse.ArgumentParser(description="SAM 3 external runtime helper")
    parser.add_argument("--status", action="store_true")