In this tutorial, we’ll use the Perceptron SDK to run Isaac 0.1 frame by frame over an MP4 of a surfing scene and stitch the annotated frames back into a video. You can download the Jupyter notebook here.
1. Install dependencies and configure the SDK

Install the SDK, OpenCV, Pillow, and tqdm. Export an API key so configure() can pick it up.
uv pip install perceptron opencv-python pillow tqdm
export PERCEPTRON_API_KEY="sk_live_..."
Create frame_by_frame.py and add the imports + configuration block:
frame_by_frame.py
import os
from pathlib import Path

import cv2
from PIL import Image, ImageDraw
from tqdm import tqdm

from perceptron import configure, image, perceive, text

configure(
    provider="perceptron",
    api_key=os.getenv("PERCEPTRON_API_KEY", "<your_api_key_here>"),
)

VIDEO_PATH = Path("surf.mp4")
FRAMES_DIR = Path("frames")
ANNOTATIONS_DIR = Path("frames_annotated")
OUTPUT_VIDEO = Path("surf_annotated.mp4")

FRAMES_DIR.mkdir(exist_ok=True)
ANNOTATIONS_DIR.mkdir(exist_ok=True)
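
Optionally, fail fast when the key is missing instead of letting the first API call reject the placeholder. A minimal guard right after configure() (an extra safety check, not part of the original script):

if not os.getenv("PERCEPTRON_API_KEY"):
    raise SystemExit("PERCEPTRON_API_KEY is not set; export it before running.")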
2. Download the sample video

Grab the shared surfing clip (or point the script at your own MP4).
curl -L -o surf.mp4 \
  https://raw.githubusercontent.com/perceptron-ai-inc/perceptron/main/cookbook/_shared/assets/tutorials/isaac_0.1_frame_by_frame/surf.mp4
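This isn't part of the tutorial script, but a quick probe confirms OpenCV can read the clip and reports its frame count and FPS (the FPS is useful again in step 5):

probe = cv2.VideoCapture(str(VIDEO_PATH))
if not probe.isOpened():
    raise RuntimeError(f"Cannot open {VIDEO_PATH}; re-check the download.")
src_fps = probe.get(cv2.CAP_PROP_FPS)
total = int(probe.get(cv2.CAP_PROP_FRAME_COUNT))
probe.release()
print(f"{VIDEO_PATH}: {total} frames at {src_fps:.1f} fps")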
3. Extract frames (tune the stride)

We sample one JPG every stride frames so long clips stay manageable.
def extract_frames(video_path: Path, dest_dir: Path, stride: int = 3) -> list[Path]:
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise RuntimeError(f"Failed to open video: {video_path}")

    saved: list[Path] = []
    idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if idx % stride == 0:
            frame_path = dest_dir / f"frame_{idx:05d}.jpg"
            cv2.imwrite(str(frame_path), frame)
            saved.append(frame_path)
        idx += 1
    cap.release()
    print(f"Extracted {len(saved)} frames (stride={stride})")
    return saved

FRAME_PATHS = extract_frames(VIDEO_PATH, FRAMES_DIR, stride=3)
if not FRAME_PATHS:
    raise RuntimeError("No frames extracted; check the video and stride settings.")
Tuning tip: decrease stride for smoother playback; increase it when you just need periodic samples.
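If you prefer to think in samples per second rather than raw frame counts, derive the stride from the source FPS. A small helper sketch (target_per_sec is a hypothetical knob, not part of the script above):

def stride_for_rate(video_path: Path, target_per_sec: float = 5.0) -> int:
    cap = cv2.VideoCapture(str(video_path))
    src_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if FPS is unreported
    cap.release()
    # e.g. a 30 fps clip with target_per_sec=5.0 gives stride=6
    return max(1, round(src_fps / target_per_sec))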
4. Detect surfers in every frame

The @perceive helper wraps Isaac 0.1 so we can send each frame plus a natural-language instruction. The loop captures the raw answer, counts boxes, converts them to pixel coordinates, and draws overlays.
PROMPT = "Find every surfer and surfboard in the frame. Return one bounding box per item."

@perceive(expects="box", allow_multiple=True)
def detect_surfers(frame_path: str):
    frame = image(frame_path)
    return frame + text(PROMPT)

all_detections = []

for frame_path in tqdm(FRAME_PATHS, desc="Detecting frames"):
    result = detect_surfers(str(frame_path))

    img = Image.open(frame_path).convert("RGB")
    draw = ImageDraw.Draw(img)
    pixel_boxes = result.points_to_pixels(width=img.width, height=img.height) or []

    all_detections.append(
        {
            "frame": frame_path.name,
            "boxes_count": len(pixel_boxes),
            "text": result.text,
        }
    )

    for box in pixel_boxes:
        draw.rectangle(
            [
                int(box.top_left.x),
                int(box.top_left.y),
                int(box.bottom_right.x),
                int(box.bottom_right.y),
            ],
            outline="lime",
            width=3,
        )
        label = box.mention or getattr(box, "label", None) or "surfer"
        draw.text(
            (int(box.top_left.x), max(int(box.top_left.y) - 18, 0)),
            label,
            fill="lime",
        )

    img.save(ANNOTATIONS_DIR / frame_path.name)
  • result.points_to_pixels() keeps the normalized → pixel conversion consistent.
  • The all_detections list becomes a quick audit trail (counts + captions per frame); the sketch below shows how to persist it.
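If you want that audit trail on disk rather than only in memory, dumping all_detections to JSON after the loop is enough. A minimal sketch (detections.json is an arbitrary filename, and we assume result.text is a plain string):

import json

with open("detections.json", "w") as f:
    json.dump(all_detections, f, indent=2)
print(f"Wrote {len(all_detections)} per-frame records to detections.json")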
5. Stitch annotated frames back to MP4

OpenCV writes the annotated JPGs back into a video using the original resolution and your preferred FPS.
def stitch_video(frame_dir: Path, output_path: Path, fps: int = 10) -> None:
    frames = sorted(frame_dir.glob("frame_*.jpg"))
    if not frames:
        raise RuntimeError("No annotated frames found to stitch.")

    sample = cv2.imread(str(frames[0]))
    height, width = sample.shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

    for frame_path in frames:
        frame = cv2.imread(str(frame_path))
        writer.write(frame)
    writer.release()
    print(f"Saved annotated video to {output_path}")

stitch_video(ANNOTATIONS_DIR, OUTPUT_VIDEO, fps=10)
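fps=10 plays the sampled frames at a fixed clock. To make the output run at the original clip's speed instead, divide the source FPS by the stride used in step 3; with stride=3 on a 30 fps source this also comes out to 10. A sketch that replaces the call above:

cap = cv2.VideoCapture(str(VIDEO_PATH))
src_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # fall back if FPS is unreported
cap.release()

stitch_video(ANNOTATIONS_DIR, OUTPUT_VIDEO, fps=max(1, round(src_fps / 3)))  # 3 = extraction stride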
6. Run the pipeline

Execute the script end-to-end and inspect the outputs. You should see:
  • frames/ and frames_annotated/ populated with numbered JPGs.
  • surf_annotated.mp4 in the project root.
python frame_by_frame.py
open surf_annotated.mp4  # macOS (use your OS viewer)
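As an optional last check, you can scan all_detections for frames where the model returned no boxes. Appended to the end of frame_by_frame.py, a sketch might look like:

# Frames with zero boxes are worth eyeballing in frames_annotated/.
empty = [d["frame"] for d in all_detections if d["boxes_count"] == 0]
if empty:
    print(f"{len(empty)} frames had no detections, e.g. {empty[:5]}")
else:
    print("Every sampled frame produced at least one box.")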
Feel free to tweak the prompt and target classes to match your production objects. We’d be delighted to hear about your project on Discord.