diff --git a/.ci b/.ci new file mode 160000 index 00000000..c602c96a --- /dev/null +++ b/.ci @@ -0,0 +1 @@ +Subproject commit c602c96a0370cb3288f1cd0cbcdd0816dfb0621e diff --git a/.ci/README.md b/.ci/README.md deleted file mode 100644 index bfc28ea1..00000000 --- a/.ci/README.md +++ /dev/null @@ -1,388 +0,0 @@ -# .ci — CI Images and Pipeline - -``` -.ci/ -├── config.yaml # Unified config (images, jobs, agent definitions) -├── utils.py # Shared utilities (load_config, normalize_config, get_git_commit) -├── agent.py # Runner Agent (scheduler, webhooks, remote dispatch) -├── build.py # Image builder -├── run.py # CI pipeline runner (Docker layer) -├── ci_resource.py # GPU/memory detection and allocation -├── github_status.py # GitHub Commit Status reporting -├── images/ -│ ├── nvidia/Dockerfile -│ ├── iluvatar/Dockerfile -│ ├── metax/Dockerfile -│ ├── moore/Dockerfile -│ ├── cambricon/Dockerfile -│ └── ascend/Dockerfile -└── tests/ # Unit tests - ├── conftest.py - ├── test_agent.py - ├── test_build.py - ├── test_run.py - ├── test_resource.py - ├── test_github_status.py - └── test_utils.py -``` - -**Prerequisites**: Docker, Python 3.10+, `pip install pyyaml` - ---- - -## Configuration `config.yaml` - -Config uses a **platform-centric** top-level structure. Each platform defines its image, platform-level defaults, and job list. -At load time, jobs are flattened to `{platform}_{job}` format (e.g., `nvidia_gpu`). - -```yaml -repo: - url: https://github.com/InfiniTensor/InfiniOps.git - branch: master - -github: - status_context_prefix: "ci/infiniops" - -agents: # Remote agent URLs (used by CLI for cross-machine dispatch) - nvidia: - url: http://nvidia-host:8080 - iluvatar: - url: http://iluvatar-host:8080 - -platforms: - nvidia: - image: # Image definition - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 - setup: pip install .[dev] --no-build-isolation - jobs: - gpu: # Flattened as `nvidia_gpu`. 
- resources: - ngpus: 1 # Scheduler auto-picks this many free GPUs. - memory: 32GB - shm_size: 16g - timeout: 3600 - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - - iluvatar: - image: - dockerfile: .ci/images/iluvatar/ - build_args: - BASE_IMAGE: corex:qs_pj20250825 - APT_MIRROR: http://archive.ubuntu.com/ubuntu - PIP_INDEX_URL: https://pypi.org/simple - docker_args: # Platform-level docker args, inherited by all jobs - - "--privileged" - - "--cap-add=ALL" - - "--pid=host" - - "--ipc=host" - volumes: - - /dev:/dev - - /lib/firmware:/lib/firmware - - /usr/src:/usr/src - - /lib/modules:/lib/modules - setup: pip install .[dev] --no-build-isolation - jobs: - gpu: # Flattened as `iluvatar_gpu`. - resources: - ngpus: 1 - gpu_ids: auto - memory: 32GB - shm_size: 16g - timeout: 3600 - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml -``` - -### Config hierarchy - -| Level | Field | Description | -|---|---|---| -| **Platform** | `image` | Image definition (dockerfile, build_args) | -| | `image_tag` | Default image tag (defaults to `latest`) | -| | `docker_args` | Extra `docker run` args (e.g., `--privileged`) | -| | `volumes` | Extra volume mounts | -| | `setup` | In-container setup command | -| | `env` | Injected container env vars | -| **Job** | `resources.ngpus` | Number of GPUs to allocate (default: 1). Used with `gpu_ids: auto` for dynamic allocation | -| | `resources.gpu_ids` | `auto`: scheduler picks `ngpus` least-loaded GPUs. Static: pin to specific IDs (e.g., `"0"`, `"0,2"`). 
`all`: use all GPUs | -| | `resources.memory` | Container memory limit | -| | `resources.shm_size` | Shared memory size | -| | `resources.timeout` | Max run time in seconds | -| | `stages` | Execution stage list | -| | Any platform field | Jobs can override any platform-level default | - ---- - -## Image builder `build.py` - -| Flag | Description | -|---|---| -| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | Target platform (default: `all`) | -| `--commit` | Use specific commit ref as image tag (default: HEAD) | -| `--force` | Skip Dockerfile change detection | -| `--dry-run` | Print commands without executing | - -```bash -# Build with change detection (skips if no Dockerfile changes) -python .ci/build.py --platform nvidia - -# Build Iluvatar image -python .ci/build.py --platform iluvatar --force - -# Force build all platforms -python .ci/build.py --force -``` - -Build artifacts are stored as local Docker image tags: `infiniops-ci/:` and `:latest`. -Proxy and `no_proxy` env vars are forwarded from the host to `docker build` automatically. - -> `--push` is reserved for future use; requires a `registry` section in `config.yaml`. - ---- - -## Pipeline runner `run.py` - -Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnmon`/`npu-smi` on PATH), no manual specification needed. - -| Flag | Description | -|---|---| -| `--config` | Config file path (default: `.ci/config.yaml`) | -| `--job` | Job name (e.g., `nvidia_gpu`, `ascend_npu`). 
Defaults to all jobs for the current platform | -| `--branch` | Override clone branch (default: config `repo.branch`) | -| `--stage` | Run only the specified stage | -| `--image-tag` | Override image tag | -| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via platform-specific env var) | -| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) | -| `--results-dir` | Host directory mounted to `/workspace/results` inside the container | -| `--local` | Mount current directory (read-only) instead of cloning from git | -| `--dry-run` | Print docker command without executing | - -```bash -# Simplest usage: auto-detect platform, run all jobs, use config default branch -python .ci/run.py - -# Run a specific job -python .ci/run.py --job nvidia_gpu - -# Run only the test stage, preview mode -python .ci/run.py --job nvidia_gpu --stage test --dry-run - -# Test local uncommitted changes without pushing -python .ci/run.py --local -``` - -Container execution flow: `git clone` → `checkout` → `setup` → stages (fail-fast: first failure breaks the loop and preserves the real exit code). -With `--local`, the current directory is mounted read-only at `/workspace/repo` and copied to a writable temp directory inside the container before setup runs — host files are never modified. -Proxy vars are forwarded from the host. Test results are written to `--results-dir` (each run gets a unique directory with timestamp + UUID suffix). Each run uses a clean environment (no host pip cache mounted). 
- ---- - -## Platform differences - -| Platform | GPU passthrough | Device env var | Base image | Detection tool | -|---|---|---|---|---| -| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | — (uses Docker flag) | `nvcr.io/nvidia/pytorch:25.12-py3` | `nvidia-smi` | -| Iluvatar | `--privileged` + `/dev` mount | `CUDA_VISIBLE_DEVICES` | `corex:qs_pj20250825` | `ixsmi` | -| MetaX | `--privileged` | `CUDA_VISIBLE_DEVICES` | `maca-pytorch:3.2.1.4-...` | `mx-smi` | -| Moore | `--privileged` | `MTHREADS_VISIBLE_DEVICES` | `vllm_musa:20251112_hygon` | `mthreads-gmi` | -| Cambricon | `--privileged` | `MLU_VISIBLE_DEVICES` | `cambricon/pytorch:v1.25.3` | `cnmon` | -| Ascend | `--privileged` + device mounts | `ASCEND_VISIBLE_DEVICES` | `vllm-ascend:v0.18.0rc1-openeuler` | `npu-smi` | - -Device visibility is derived from the platform name (see `PLATFORM_DEVICE_ENV` in `ci_resource.py`). NVIDIA uses Docker's `--gpus` flag; all other platforms use `--privileged` and control visibility via a platform-specific environment variable. - ---- - -## Runner Agent `agent.py` - -The Runner Agent supports CLI manual dispatch, GitHub webhook triggers, resource-aware dynamic scheduling, and cross-machine remote dispatch. 
- -### CLI manual execution - -```bash -# Run all jobs (dispatched to remote agents, using config default branch) -python .ci/agent.py run - -# Specify branch -python .ci/agent.py run --branch feat/xxx - -# Run a specific job -python .ci/agent.py run --job nvidia_gpu - -# Filter by platform -python .ci/agent.py run --platform nvidia - -# Preview mode -python .ci/agent.py run --dry-run -``` - -| Flag | Description | -|---|---| -| `--branch` | Test branch (default: config `repo.branch`) | -| `--job` | Specific job name | -| `--platform` | Filter jobs by platform | -| `--commit` | Override commit SHA used for GitHub status reporting | -| `--image-tag` | Override image tag | -| `--dry-run` | Preview mode | - -### Webhook server - -Deploy one Agent instance per platform machine (platform is auto-detected). On each machine: - -```bash -python .ci/agent.py serve --port 8080 -``` - -Additional `serve` flags: - -| Flag | Description | -|---|---| -| `--port` | Listen port (default: 8080) | -| `--host` | Listen address (default: `0.0.0.0`) | -| `--webhook-secret` | GitHub webhook signing secret (or `WEBHOOK_SECRET` env var) | -| `--api-token` | `/api/run` Bearer auth token (or `AGENT_API_TOKEN` env var) | -| `--results-dir` | Results directory (default: `ci-results`) | -| `--utilization-threshold` | GPU idle threshold percentage (default: 10) | - -| Endpoint | Method | Description | -|---|---|---| -| `/webhook` | POST | GitHub webhook (push/pull_request) | -| `/api/run` | POST | Remote job trigger | -| `/api/job/{id}` | GET | Query job status | -| `/api/job/{id}/log` | GET | Full job log (text/plain) | -| `/health` | GET | Health check | -| `/status` | GET | Queue + resource status | - -Webhook supports `X-Hub-Signature-256` signature verification via `--webhook-secret` or `WEBHOOK_SECRET` env var. 
- -### Remote agent configuration - -Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote jobs to the corresponding agents: - -```yaml -agents: - nvidia: - url: http://:8080 - iluvatar: - url: http://:8080 - metax: - url: http://:8080 - moore: - url: http://:8080 -``` - -### Resource scheduling - -The Agent auto-detects GPU utilization and system memory to dynamically determine parallelism: -- GPUs with utilization < threshold (default 10%) and not already allocated → available -- Allocation picks the **least-loaded** GPUs first (sorted by utilization ascending) -- When `gpu_ids: auto` (default), the scheduler allocates `ngpus` GPUs per job -- When resources are insufficient, jobs are queued automatically (max 100 pending); completed jobs release resources and trigger scheduling of queued tasks -- Docker execution has a Python-level timeout fallback (job timeout + 120s) to prevent stuck containers - -### GitHub Status - -Set the `GITHUB_TOKEN` env var and the Agent will automatically report commit status: -- `pending` — job started -- `success` / `failure` — job completed - -Status context format: `ci/infiniops/{job_name}` - ---- - -## Multi-machine deployment guide - -### Per-platform setup - -Each machine needs Docker installed, the platform runtime, and the base CI image built. 
- -| Platform | Runtime check | Base image | Build command | -|---|---|---|---| -| NVIDIA | `nvidia-smi` (+ [Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) | `nvcr.io/nvidia/pytorch:25.12-py3` (public) | `python .ci/build.py --platform nvidia` | -| Iluvatar | `ixsmi` | `corex:qs_pj20250825` (import in advance) | `python .ci/build.py --platform iluvatar` | -| MetaX | `mx-smi` | `maca-pytorch:3.2.1.4-...` (import in advance) | `python .ci/build.py --platform metax` | -| Moore | `mthreads-gmi` | `vllm_musa:20251112_hygon` (import in advance) | `python .ci/build.py --platform moore` | -| Cambricon | `cnmon` | `cambricon/pytorch:v1.25.3` (import in advance) | `python .ci/build.py --platform cambricon` | -| Ascend | `npu-smi` (+ Ascend driver + CANN toolkit) | `vllm-ascend:v0.18.0rc1-openeuler` (import in advance) | `python .ci/build.py --platform ascend` | - -### Start Agent services - -On each machine (platform is auto-detected): - -```bash -python .ci/agent.py serve --port 8080 -``` - -### Configure remote agent URLs - -On the trigger machine, add the `agents` section to `config.yaml` (see [Remote agent configuration](#remote-agent-configuration) above for the format). 
- -### Trigger cross-platform tests - -```bash -# Run all platform jobs at once (using config default branch) -python .ci/agent.py run - -# Preview mode (no actual execution) -python .ci/agent.py run --dry-run - -# Run only a specific platform -python .ci/agent.py run --platform nvidia -``` - -### Optional configuration - -#### GitHub Status reporting - -Set the env var on all machines so each reports its own platform's test status: - -```bash -export GITHUB_TOKEN=ghp_xxxxxxxxxxxx -``` - -#### API Token authentication - -When agents are exposed on untrusted networks, enable token auth: - -```bash -python .ci/agent.py serve --port 8080 --api-token -# Or: export AGENT_API_TOKEN= -``` - -#### GitHub Webhook auto-trigger - -In GitHub repo → Settings → Webhooks, add a webhook for each machine: - -| Field | Value | -|---|---| -| Payload URL | `http://:8080/webhook` | -| Content type | `application/json` | -| Secret | Must match `--webhook-secret` | -| Events | `push` and `pull_request` | - -```bash -python .ci/agent.py serve --port 8080 --webhook-secret -# Or: export WEBHOOK_SECRET= -``` - -### Verification checklist - -```bash -# 1. Dry-run each machine individually -for platform in nvidia iluvatar metax moore cambricon ascend; do - python .ci/agent.py run --platform $platform --dry-run -done - -# 2. Health and resource checks -for ip in ; do - curl http://$ip:8080/health - curl http://$ip:8080/status -done - -# 3. Cross-platform test -python .ci/agent.py run --branch master -``` diff --git a/.ci/agent.py b/.ci/agent.py deleted file mode 100644 index 9e8899b5..00000000 --- a/.ci/agent.py +++ /dev/null @@ -1,1088 +0,0 @@ -#!/usr/bin/env python3 -"""CI Runner Agent: webhook server, resource-aware scheduler, GitHub status reporting. 
- -Usage: - # Run jobs locally (or dispatch to remote agents) - python .ci/agent.py run - python .ci/agent.py run --branch master --job nvidia_gpu --dry-run - - # Start webhook server (auto-detects platform) - python .ci/agent.py serve --port 8080 -""" - -import argparse -import collections -import hashlib -import hmac -import json -import os -import shlex -import subprocess -import sys -import threading -import time -import urllib.error -import urllib.request -import uuid -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime -from http.server import BaseHTTPRequestHandler, HTTPServer -from pathlib import Path - -import ci_resource as res -import github_status as gh -import run - -# Maximum POST body size (1 MB) to prevent memory exhaustion -MAX_CONTENT_LENGTH = 1 * 1024 * 1024 - -# Job states -STATE_QUEUED = "queued" -STATE_RUNNING = "running" -STATE_PENDING = "pending" -STATE_SUCCESS = "success" -STATE_FAILURE = "failure" -STATE_ERROR = "error" - -TAIL_LINES = 50 -MAX_QUEUE_SIZE = 100 - -# urllib helpers (module-level for easier mocking in tests) -urllib_request = urllib.request.Request -urllib_urlopen = urllib.request.urlopen - - -class QueueFullError(Exception): - """Raised when the job queue has reached its maximum size.""" - - -# --------------------------------------------------------------------------- -# Data classes -# --------------------------------------------------------------------------- - - -class JobRequest: - """Describes a CI job to be executed.""" - - def __init__( - self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None - ): - self.job_id = str(uuid.uuid4())[:8] - self.job_name = job_name - self.branch = branch - self.commit_sha = commit_sha - self.config = config - self.image_tag = image_tag - self.results_dir = results_dir or Path("ci-results") - self.created_at = datetime.now().isoformat() - - job = config["jobs"][job_name] - self.platform = job.get("platform", "nvidia") - - 
def to_dict(self): - return { - "job_id": self.job_id, - "job_name": self.job_name, - "branch": self.branch, - "commit_sha": self.commit_sha, - "platform": self.platform, - "created_at": self.created_at, - } - - -class JobResult: - """Outcome of a completed job.""" - - def __init__( - self, - job_id, - job_name, - commit_sha, - returncode, - results_dir, - duration, - error_tail=None, - log_file=None, - ): - self.job_id = job_id - self.job_name = job_name - self.commit_sha = commit_sha - self.returncode = returncode - self.results_dir = results_dir - self.duration = duration - self.error_tail = error_tail or [] - self.log_file = log_file - - self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE - - def to_dict(self): - d = { - "job_id": self.job_id, - "job_name": self.job_name, - "commit_sha": self.commit_sha, - "state": self.state, - "returncode": self.returncode, - "results_dir": str(self.results_dir), - "duration_seconds": round(self.duration, 1), - } - - if self.error_tail: - d["error_tail"] = self.error_tail - - if self.log_file: - d["log_file"] = str(self.log_file) - - return d - - -# --------------------------------------------------------------------------- -# Scheduler -# --------------------------------------------------------------------------- - - -class Scheduler: - """Resource-aware job scheduler with dynamic parallelism.""" - - def __init__( - self, - config, - platform, - resource_pool, - results_dir=None, - max_workers=4, - no_status=False, - dry_run=False, - ): - self._config = config - self._platform = platform - self._resource_pool = resource_pool - self._results_dir = results_dir or Path("ci-results") - self._no_status = no_status - self._dry_run = dry_run - self._queue = collections.deque() - self._jobs: dict[str, dict] = {} # job_id -> {request, result, state, gpu_ids} - self._executor = ThreadPoolExecutor(max_workers=max_workers) - self._lock = threading.Lock() - self._done_event = threading.Event() - - # GitHub config - 
github_cfg = config.get("github", {}) - self._status_prefix = github_cfg.get("status_context_prefix", "ci/infiniops") - repo = config.get("repo", {}) - repo_url = repo.get("url", "") - self._owner, self._repo = gh.parse_repo_url(repo_url) - - def submit(self, job_request): - """Add a job to the queue and attempt to schedule it. - - Returns the job_id. Raises ``QueueFullError`` if the queue is at - capacity. - """ - with self._lock: - if len(self._queue) >= MAX_QUEUE_SIZE: - raise QueueFullError( - f"queue full ({MAX_QUEUE_SIZE} jobs), try again later" - ) - - self._jobs[job_request.job_id] = { - "request": job_request, - "result": None, - "state": STATE_QUEUED, - "gpu_ids": [], - } - self._queue.append(job_request) - - self._try_schedule() - return job_request.job_id - - def get_job(self, job_id): - """Get job info by ID.""" - with self._lock: - entry = self._jobs.get(job_id) - - if not entry: - return None - - info = entry["request"].to_dict() - info["state"] = entry["state"] - - if entry["result"]: - info.update(entry["result"].to_dict()) - - return info - - def get_job_log_file(self, job_id): - """Return the log file path for a completed job, or None.""" - with self._lock: - entry = self._jobs.get(job_id) - - if not entry or not entry["result"]: - return None - - return entry["result"].log_file - - def get_status(self): - """Return scheduler status for the /status endpoint.""" - with self._lock: - queued = [self._jobs[r.job_id]["request"].to_dict() for r in self._queue] - running = [] - completed = [] - - for entry in self._jobs.values(): - state = entry["state"] - - if state == STATE_RUNNING: - running.append( - {**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]} - ) - elif state in (STATE_SUCCESS, STATE_FAILURE): - completed.append(entry["result"].to_dict()) - - return { - "queued": queued, - "running": running, - "completed": completed[-20:], # Last 20 - "resources": self._resource_pool.get_status(), - } - - def wait_all(self): - """Block until all 
submitted jobs are done. Returns list of JobResult.""" - while True: - with self._lock: - pending = any( - e["state"] in (STATE_QUEUED, STATE_RUNNING) - for e in self._jobs.values() - ) - - if not pending: - break - - self._done_event.wait(timeout=2.0) - self._done_event.clear() - - with self._lock: - return [e["result"] for e in self._jobs.values() if e["result"] is not None] - - def _try_schedule(self): - """Try to run queued jobs that have enough resources. - - Resource allocation and job submission are split: allocation decisions - are made under the lock, but executor.submit() happens outside to - prevent deadlock when the thread pool is saturated. - """ - to_launch = [] # [(req, gpu_ids), ...] - - with self._lock: - remaining = collections.deque() - - while self._queue: - req = self._queue.popleft() - job_cfg = self._config["jobs"].get(req.job_name, {}) - gpu_count = res.parse_gpu_requirement(job_cfg) - memory_mb = res.parse_memory_requirement(job_cfg) - - if self._dry_run: - # In dry-run mode, skip resource checks - gpu_ids, ok = [], True - else: - gpu_ids, ok = self._resource_pool.allocate(gpu_count, memory_mb) - - if ok: - self._jobs[req.job_id]["state"] = STATE_RUNNING - self._jobs[req.job_id]["gpu_ids"] = gpu_ids - to_launch.append((req, gpu_ids)) - else: - remaining.append(req) - - self._queue = remaining - - # Submit outside the lock to avoid deadlock with ThreadPoolExecutor - for req, gpu_ids in to_launch: - self._executor.submit(self._run_job, req, gpu_ids) - - def _run_job(self, req, gpu_ids): - """Execute a single job in a worker thread. - - Wrapped in try/finally to guarantee GPU resources are always released - and job state is updated even on unexpected exceptions. 
- """ - context = gh.build_status_context(self._status_prefix, req.job_name) - result = None - - try: - # Post pending status - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - STATE_PENDING, - context, - f"Running {req.job_name}...", - ) - - job_cfg = self._config["jobs"][req.job_name] - all_stages = job_cfg.get("stages", []) - repo_url = self._config.get("repo", {}).get("url", "") - commit_short = ( - req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha - ) - results_dir = run.build_results_dir( - req.results_dir, req.platform, all_stages, commit_short - ) - - gpu_id_str = ",".join(str(g) for g in gpu_ids) if gpu_ids else None - docker_args = run.build_docker_args( - self._config, - req.job_name, - repo_url, - req.branch, - all_stages, - "/workspace", - req.image_tag, - gpu_id_override=gpu_id_str, - results_dir=results_dir, - ) - - start = time.monotonic() - - if self._dry_run: - print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") - returncode = 0 - error_tail = [] - log_file = None - else: - results_dir.mkdir(parents=True, exist_ok=True) - log_file = results_dir / "job.log" - proc = subprocess.Popen( - docker_args, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - tail_buf = collections.deque(maxlen=TAIL_LINES) - - with open(log_file, "wb") as lf: - for line in proc.stdout: - sys.stdout.buffer.write(line) - lf.write(line) - tail_buf.append(line) - - proc.stdout.close() - - # Python-level timeout as fallback for the in-container timeout. 
- job_timeout = job_cfg.get("resources", {}).get("timeout") - fallback_timeout = (job_timeout + 120) if job_timeout else 7200 - - try: - returncode = proc.wait(timeout=fallback_timeout) - except subprocess.TimeoutExpired: - proc.kill() - proc.wait() - returncode = -9 - timeout_msg = f"Job killed: exceeded {fallback_timeout}s timeout\n" - tail_buf.append(timeout_msg.encode()) - - with open(log_file, "ab") as lf: - lf.write(timeout_msg.encode()) - - if returncode != 0: - error_tail = [ - raw.decode("utf-8", errors="replace").rstrip("\n") - for raw in tail_buf - ] - else: - error_tail = [] - - duration = time.monotonic() - start - - result = JobResult( - job_id=req.job_id, - job_name=req.job_name, - commit_sha=req.commit_sha, - returncode=returncode, - results_dir=results_dir, - duration=duration, - error_tail=error_tail, - log_file=log_file, - ) - - # Post final status - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - result.state, - context, - f"{req.job_name}: {result.state} in {duration:.0f}s", - ) - except Exception as e: - print( - f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr - ) - - if result is None: - result = JobResult( - job_id=req.job_id, - job_name=req.job_name, - commit_sha=req.commit_sha, - returncode=-1, - results_dir=req.results_dir, - duration=0, - error_tail=[str(e)], - ) - - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - STATE_ERROR, - context, - f"{req.job_name}: internal error", - ) - finally: - # Always release resources and update state - self._resource_pool.release(gpu_ids) - - with self._lock: - self._jobs[req.job_id]["result"] = result - self._jobs[req.job_id]["state"] = ( - result.state if result else STATE_FAILURE - ) - - self._done_event.set() - # Safe outside lock: `_try_schedule` acquires `self._lock` internally. 
- self._try_schedule() - - return result - - -# --------------------------------------------------------------------------- -# Webhook server -# --------------------------------------------------------------------------- - - -def verify_signature(secret, body, signature_header): - """Verify GitHub webhook HMAC-SHA256 signature.""" - if not signature_header: - return False - - expected = ( - "sha256=" + hmac.new(secret.encode("utf-8"), body, hashlib.sha256).hexdigest() - ) - return hmac.compare_digest(expected, signature_header) - - -def _verify_api_token(handler): - """Check Bearer token for /api/run authentication. - - Returns True if authenticated, False (and sends 401) if not. - When no api_token is configured on the server, all requests are allowed. - """ - api_token = getattr(handler.server, "api_token", None) - - if not api_token: - return True - - auth_header = handler.headers.get("Authorization", "") - - if auth_header == f"Bearer {api_token}": - return True - - handler._respond_json(401, {"error": "unauthorized"}) - return False - - -class WebhookHandler(BaseHTTPRequestHandler): - """HTTP handler for GitHub webhooks and API endpoints.""" - - def log_message(self, format, *args): - msg = format % args if args else format - print(f"[agent] {msg}", file=sys.stderr) - - def do_GET(self): - if self.path == "/health": - self._respond_json(200, {"status": "ok", "platform": self.server.platform}) - elif self.path == "/status": - status = self.server.scheduler.get_status() - self._respond_json(200, status) - elif self.path.startswith("/api/job/"): - self._handle_api_job() - else: - self._respond_json(404, {"error": "not found"}) - - def do_POST(self): - content_length = int(self.headers.get("Content-Length", 0)) - - if content_length > MAX_CONTENT_LENGTH: - self._respond_json(413, {"error": "payload too large"}) - return - - body = self.rfile.read(content_length) - - if self.path == "/webhook": - self._handle_webhook(body) - elif self.path == "/api/run": - 
self._handle_api_run(body) - else: - self._respond_json(404, {"error": "not found"}) - - def _handle_webhook(self, body): - # Verify signature if secret is configured - if self.server.webhook_secret: - sig = self.headers.get("X-Hub-Signature-256", "") - - if not verify_signature(self.server.webhook_secret, body, sig): - self._respond_json(401, {"error": "invalid signature"}) - return - - event_type = self.headers.get("X-GitHub-Event", "") - - if event_type == "ping": - self._respond_json(200, {"msg": "pong"}) - return - - try: - payload = json.loads(body) - except json.JSONDecodeError: - self._respond_json(400, {"error": "invalid JSON"}) - return - - if event_type == "push": - branch, sha = self._parse_push(payload) - elif event_type == "pull_request": - action = payload.get("action", "") - - if action not in ("opened", "synchronize"): - self._respond_json(200, {"msg": f"ignored PR action: {action}"}) - return - - branch, sha = self._parse_pull_request(payload) - else: - self._respond_json(200, {"msg": f"ignored event: {event_type}"}) - return - - if not branch or not sha: - self._respond_json(400, {"error": "could not extract branch/sha"}) - return - - job_ids = self._submit_jobs(branch, sha) - self._respond_json(200, {"accepted": True, "job_ids": job_ids}) - - def _handle_api_run(self, body): - """Handle /api/run: remote job trigger (requires Bearer token auth).""" - if not _verify_api_token(self): - return - - try: - payload = json.loads(body) - except json.JSONDecodeError: - self._respond_json(400, {"error": "invalid JSON"}) - return - - branch = payload.get("branch", "") - sha = payload.get("commit_sha", "") - job_name = payload.get("job") - image_tag = payload.get("image_tag") - - if not branch: - self._respond_json(400, {"error": "branch is required"}) - return - - if not sha: - sha = run.get_git_commit() - - job_ids = self._submit_jobs(branch, sha, job_name=job_name, image_tag=image_tag) - self._respond_json(200, {"accepted": True, "job_ids": job_ids}) - - 
    def _handle_api_job(self):
        """Handle `GET /api/job/{id}` and `GET /api/job/{id}/log`."""
        # Path splits as ['', 'api', 'job', '{id}', 'log'?].
        parts = self.path.rstrip("/").split("/")

        if len(parts) < 4:
            self._respond_json(400, {"error": "missing job_id"})
            return

        job_id = parts[3]

        # `GET /api/job/{id}/log` — return full log file.
        if len(parts) >= 5 and parts[4] == "log":
            self._handle_job_log(job_id)
            return

        info = self.server.scheduler.get_job(job_id)

        if info is None:
            self._respond_json(404, {"error": f"job {job_id} not found"})
        else:
            self._respond_json(200, info)

    def _handle_job_log(self, job_id):
        """Return the full log file for a completed job.

        Responds with raw text/plain bytes rather than JSON so the log can
        be streamed or saved directly by the caller.
        """
        log_file = self.server.scheduler.get_job_log_file(job_id)

        if log_file is None or not Path(log_file).is_file():
            self._respond_json(404, {"error": f"log not available for job {job_id}"})
            return

        try:
            data = Path(log_file).read_bytes()
        except OSError as e:
            self._respond_json(500, {"error": f"failed to read log: {e}"})
            return

        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def _parse_push(self, payload):
        """Extract (branch, sha) from a GitHub `push` webhook payload."""
        branch = payload.get("ref", "").removeprefix("refs/heads/")
        sha = payload.get("after", "")
        return branch, sha

    def _parse_pull_request(self, payload):
        """Extract (branch, sha) of the PR head from a `pull_request` payload."""
        pr = payload.get("pull_request", {})
        head = pr.get("head", {})
        branch = head.get("ref", "")
        sha = head.get("sha", "")
        return branch, sha

    def _submit_jobs(self, branch, sha, job_name=None, image_tag=None):
        """Resolve job names for this agent's platform and enqueue them.

        Returns the list of scheduler job IDs submitted so far; on a full
        queue the 503 response is sent and the partial list returned.
        """
        config = self.server.config

        try:
            job_names = run.resolve_job_names(
                config.get("jobs", {}),
                platform=self.server.platform,
                job=job_name,
            )
        except ValueError as e:
            self._respond_json(400, {"error": str(e)})
            return []

        job_ids = []

        for name in job_names:
            req = JobRequest(
                job_name=name,
                branch=branch,
                commit_sha=sha,
                config=config,
                image_tag=image_tag,
                results_dir=self.server.results_dir,
            )

            try:
                jid = self.server.scheduler.submit(req)
            except QueueFullError as e:
                # Queue saturated: report 503 and stop submitting the rest.
                self._respond_json(503, {"error": str(e)})
                return job_ids

            job_ids.append(jid)

        return job_ids

    def _respond_json(self, status_code, data):
        """Serialize `data` as JSON and send it with the given status code."""
        body = json.dumps(data, indent=2).encode("utf-8")
        self.send_response(status_code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)


class AgentServer(HTTPServer):
    """HTTP server with scheduler and config context.

    Subclasses HTTPServer only to attach per-agent state (config, scheduler,
    platform, auth secrets) that WebhookHandler reads via `self.server`.
    """

    def __init__(
        self,
        host,
        port,
        config,
        scheduler,
        platform,
        webhook_secret=None,
        api_token=None,
        results_dir=None,
    ):
        super().__init__((host, port), WebhookHandler)
        self.config = config
        self.scheduler = scheduler
        self.platform = platform
        self.webhook_secret = webhook_secret
        self.api_token = api_token
        self.results_dir = results_dir or Path("ci-results")


# ---------------------------------------------------------------------------
# Remote job dispatch (for CLI triggering remote agents)
# ---------------------------------------------------------------------------


def dispatch_remote_job(
    agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None
):
    """Send a job to a remote agent via HTTP API. Returns job_id or None."""
    url = f"{agent_url.rstrip('/')}/api/run"
    body = {
        "branch": branch,
        "commit_sha": commit_sha,
        "job": job_name,
    }

    if image_tag:
        body["image_tag"] = image_tag

    data = json.dumps(body).encode("utf-8")
    headers = {"Content-Type": "application/json"}

    if api_token:
        headers["Authorization"] = f"Bearer {api_token}"

    req = urllib_request(url, data=data, headers=headers, method="POST")

    try:
        with urllib_urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read())
            job_ids = result.get("job_ids", [])
            # A dispatch names one job, so the first (only) ID is returned.
            return job_ids[0] if job_ids else None
    except Exception as e:
        # Best-effort: report and let the caller record a dispatch failure.
        print(f"error: failed to dispatch to {agent_url}: {e}", file=sys.stderr)
        return None


def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200):
    """Poll a remote agent for job completion. Returns final state dict or None.

    Transient network errors are tolerated; a warning is printed on the first
    failure and then every 20th, to avoid log spam during long outages.
    Returns None if the deadline passes without a terminal state.
    """
    url = f"{agent_url.rstrip('/')}/api/job/{job_id}"
    deadline = time.monotonic() + timeout
    consecutive_failures = 0

    while time.monotonic() < deadline:
        try:
            req = urllib_request(url)

            with urllib_urlopen(req, timeout=10) as resp:
                info = json.loads(resp.read())

            consecutive_failures = 0
            state = info.get("state", "")

            if state in (STATE_SUCCESS, STATE_FAILURE):
                return info
        except Exception as e:
            consecutive_failures += 1

            if consecutive_failures == 1 or consecutive_failures % 20 == 0:
                print(
                    f"warning: polling {url} failed ({consecutive_failures}x): {e}",
                    file=sys.stderr,
                )

        time.sleep(interval)

    return None
def fetch_remote_log(agent_url, job_id):
    """Fetch the full log for a completed remote job. Returns text or None."""
    url = f"{agent_url.rstrip('/')}/api/job/{job_id}/log"

    try:
        req = urllib_request(url)

        with urllib_urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception:
        # Logs are advisory; any failure degrades to the `error_tail` fallback.
        return None


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def cmd_run(args):
    """Handle 'run' subcommand: dispatch jobs to platform agents via HTTP.

    Resolves the job list from config, maps each job to its platform's agent
    URL, dispatches all jobs, then polls them concurrently. Exits non-zero if
    resolution fails, any agent URL is missing, or any job does not succeed.
    """
    config = run.load_config(args.config)
    agents = config.get("agents", {})
    branch = args.branch or config.get("repo", {}).get("branch", "master")
    commit_sha = args.commit or run.get_git_commit(short=False)

    # Determine which jobs to run
    try:
        job_names = run.resolve_job_names(
            config.get("jobs", {}), platform=args.platform, job=args.job
        )
    except ValueError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)

    if not job_names:
        print("error: no matching jobs found", file=sys.stderr)
        sys.exit(1)

    # Resolve agent URL for each job
    jobs_to_dispatch = []  # [(name, agent_url)]

    for name in job_names:
        job = config.get("jobs", {}).get(name, {})
        platform = job.get("platform", "")
        agent_url = agents.get(platform, {}).get("url", "")

        if not agent_url:
            print(
                f"error: no agent URL configured for platform {platform!r} (job {name})",
                file=sys.stderr,
            )
            sys.exit(1)

        jobs_to_dispatch.append((name, agent_url))

    api_token = os.environ.get("AGENT_API_TOKEN", "")
    results = []

    if args.dry_run:
        for name, agent_url in jobs_to_dispatch:
            # Job names are flattened as `{platform}_{job}`; split for display.
            platform, _, job = name.partition("_")
            print(f"[dry-run] dispatch {platform} {job} job to {agent_url}")
    else:
        # Dispatch all jobs, then poll concurrently.
        dispatched = []  # [(name, agent_url, job_id)]

        for name, agent_url in jobs_to_dispatch:
            platform, _, job = name.partition("_")
            print(
                f"==> dispatching {platform} {job} job to {agent_url}",
                file=sys.stderr,
            )
            job_id = dispatch_remote_job(
                agent_url,
                name,
                branch,
                commit_sha,
                args.image_tag,
                api_token=api_token or None,
            )

            if job_id:
                print(f" job_id: {job_id}", file=sys.stderr)
                dispatched.append((name, agent_url, job_id))
            else:
                print(f" failed to dispatch {name}", file=sys.stderr)
                results.append({"job_name": name, "state": "error"})

        if dispatched:
            # One polling thread per dispatched job.
            with ThreadPoolExecutor(max_workers=len(dispatched)) as executor:
                futures = {
                    executor.submit(poll_remote_job, url, jid): (name, url, jid)
                    for name, url, jid in dispatched
                }

                # Collect name lengths for column alignment.
                name_width = max(len(n) for n, _, _ in dispatched)

                for future in as_completed(futures):
                    name, agent_url, job_id = futures[future]
                    result = future.result()

                    if result:
                        state = result.get("state", "unknown")
                        duration = result.get("duration_seconds", 0)
                        tag = "PASS" if state == STATE_SUCCESS else "FAIL"
                        print(
                            f"<== {tag} {name:<{name_width}} ({duration:.0f}s)",
                            file=sys.stderr,
                        )

                        if state != STATE_SUCCESS:
                            full_log = fetch_remote_log(agent_url, job_id)

                            if full_log:
                                print(
                                    f"--- full log ({name}) ---",
                                    file=sys.stderr,
                                )
                                print(full_log, file=sys.stderr)
                                print("---", file=sys.stderr)
                            else:
                                # Fall back to `error_tail` if full log unavailable.
                                error_tail = result.get("error_tail", [])

                                if error_tail:
                                    print(
                                        f"--- error output (last {len(error_tail)} lines) ---",
                                        file=sys.stderr,
                                    )

                                    for line in error_tail:
                                        print(f" {line}", file=sys.stderr)

                                    print("---", file=sys.stderr)

                        results.append(result)
                    else:
                        # poll_remote_job returned None: polling deadline hit.
                        print(
                            f"<== TIMEOUT {name:<{name_width}}",
                            file=sys.stderr,
                        )
                        results.append({"job_name": name, "state": "timeout"})

    # Summary: only print when there are failures.
    failed = [r for r in results if r.get("state") != STATE_SUCCESS]

    if failed:
        print("\n========== Failed ==========", file=sys.stderr)
        name_width = max(len(r.get("job_name", "?")) for r in failed)

        for r in failed:
            name = r.get("job_name", "?")
            state = r.get("state", "unknown")
            duration = r.get("duration_seconds", 0)
            print(
                f" FAIL {name:<{name_width}} {state} ({duration:.0f}s)",
                file=sys.stderr,
            )

        sys.exit(1)


def cmd_serve(args):
    """Handle 'serve' subcommand: start webhook server.

    Detects the local platform, validates that config defines jobs for it,
    builds the resource pool + scheduler, and serves until interrupted.
    Missing webhook/API secrets only warn (useful for local development).
    """
    config = run.load_config(args.config)

    platform = res.detect_platform()

    if not platform:
        print(
            "error: could not detect platform (no nvidia-smi or ixsmi found)",
            file=sys.stderr,
        )
        sys.exit(1)

    # Fail fast if the config has no jobs for this platform.
    try:
        run.resolve_job_names(config.get("jobs", {}), platform=platform)
    except ValueError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)

    pool = res.ResourcePool(
        platform,
        utilization_threshold=args.utilization_threshold,
    )
    scheduler = Scheduler(
        config,
        platform,
        pool,
        results_dir=args.results_dir,
    )

    # CLI flags take precedence over environment variables.
    webhook_secret = args.webhook_secret or os.environ.get("WEBHOOK_SECRET", "")
    api_token = args.api_token or os.environ.get("AGENT_API_TOKEN", "")

    if not webhook_secret:
        print(
            "WARNING: No webhook secret configured. Webhook endpoint accepts "
            "unsigned requests. Set --webhook-secret or WEBHOOK_SECRET for production.",
            file=sys.stderr,
        )

    if not api_token:
        print(
            "WARNING: No API token configured. /api/run endpoint is unauthenticated. "
            "Set --api-token or AGENT_API_TOKEN for production.",
            file=sys.stderr,
        )

    server = AgentServer(
        args.host,
        args.port,
        config,
        scheduler,
        platform,
        webhook_secret=webhook_secret or None,
        api_token=api_token or None,
        results_dir=args.results_dir,
    )

    print(
        f"Agent serving on {args.host}:{args.port} (platform={platform})",
        file=sys.stderr,
    )
    print(" POST /webhook — GitHub webhook", file=sys.stderr)
    print(" POST /api/run — remote job trigger", file=sys.stderr)
    print(" GET /health — health check", file=sys.stderr)
    print(" GET /status — queue & resource status", file=sys.stderr)
    print(" GET /api/job/{id} — job status", file=sys.stderr)
    print(" GET /api/job/{id}/log — full job log", file=sys.stderr)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nShutting down...", file=sys.stderr)
        server.shutdown()
" - "Set --api-token or AGENT_API_TOKEN for production.", - file=sys.stderr, - ) - - server = AgentServer( - args.host, - args.port, - config, - scheduler, - platform, - webhook_secret=webhook_secret or None, - api_token=api_token or None, - results_dir=args.results_dir, - ) - - print( - f"Agent serving on {args.host}:{args.port} (platform={platform})", - file=sys.stderr, - ) - print(" POST /webhook — GitHub webhook", file=sys.stderr) - print(" POST /api/run — remote job trigger", file=sys.stderr) - print(" GET /health — health check", file=sys.stderr) - print(" GET /status — queue & resource status", file=sys.stderr) - print(" GET /api/job/{id} — job status", file=sys.stderr) - print(" GET /api/job/{id}/log — full job log", file=sys.stderr) - - try: - server.serve_forever() - except KeyboardInterrupt: - print("\nShutting down...", file=sys.stderr) - server.shutdown() - - -def main(): - parser = argparse.ArgumentParser( - description="CI Runner Agent: run jobs locally, dispatch remotely, or serve webhooks", - ) - subparsers = parser.add_subparsers(dest="command") - - # --- run subcommand --- - run_parser = subparsers.add_parser("run", help="Run CI jobs") - run_parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - ) - run_parser.add_argument( - "--branch", type=str, help="Branch to test (default: config repo.branch)" - ) - run_parser.add_argument("--job", type=str, help="Specific job name") - run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") - run_parser.add_argument("--image-tag", type=str, help="Override image tag") - run_parser.add_argument("--commit", type=str, help="Override commit SHA") - run_parser.add_argument("--dry-run", action="store_true") - - # --- serve subcommand --- - serve_parser = subparsers.add_parser("serve", help="Start webhook server") - serve_parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - ) - 
serve_parser.add_argument("--port", type=int, default=8080) - serve_parser.add_argument("--host", type=str, default="0.0.0.0") - serve_parser.add_argument("--webhook-secret", type=str) - serve_parser.add_argument( - "--api-token", - type=str, - help="Bearer token for /api/run authentication (or AGENT_API_TOKEN env var)", - ) - serve_parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - ) - serve_parser.add_argument( - "--utilization-threshold", - type=int, - default=10, - ) - - args = parser.parse_args() - - if args.command == "run": - cmd_run(args) - elif args.command == "serve": - cmd_serve(args) - else: - parser.print_help() - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/.ci/build.py b/.ci/build.py deleted file mode 100644 index b373cb7d..00000000 --- a/.ci/build.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env python3 -"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" - -import argparse -import json -import os -import shlex -import subprocess -import sys -from pathlib import Path - -from utils import get_git_commit, load_config - - -def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): - """Check if any file under `dockerfile_dir` changed since `base_ref`.""" - result = subprocess.run( - ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print( - "warning: git diff failed (shallow clone or initial commit?);" - " assuming Dockerfile changed", - file=sys.stderr, - ) - return True - - return bool(result.stdout.strip()) - - -def docker_login(registry_cfg, dry_run): - """Log in to the registry using `credentials_env` token. - - Returns True on success. - - NOTE: Registry support is currently unused (`config.yaml` has no registry - section). Retained for future integration with an external image management - system. 
- """ - credentials_env = registry_cfg.get("credentials_env") - registry_url = registry_cfg.get("url", "") - - if not credentials_env or not registry_url: - return True - - token = os.environ.get(credentials_env) - - if not token: - print( - f"error: {credentials_env} not set, cannot login", - file=sys.stderr, - ) - return False - - if dry_run: - print( - f"[dry-run] echo | docker login {registry_url}" - " --username token --password-stdin" - ) - return True - - result = subprocess.run( - ["docker", "login", registry_url, "--username", "token", "--password-stdin"], - input=token, - text=True, - ) - - if result.returncode != 0: - print("error: docker login failed", file=sys.stderr) - return False - - return True - - -def build_image_tag(registry_url, project, platform, tag): - if registry_url: - return f"{registry_url}/{project}/{platform}:{tag}" - - return f"{project}-ci/{platform}:{tag}" - - -def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run, logged_in): - """Build a single platform image. 
Returns True on success.""" - registry_url = registry_cfg.get("url", "") - project = registry_cfg.get("project", "infiniops") - dockerfile_dir = platform_cfg["dockerfile"] - commit_tag = build_image_tag(registry_url, project, platform, commit) - latest_tag = build_image_tag(registry_url, project, platform, "latest") - - build_args_cfg = platform_cfg.get("build_args", {}) - build_cmd = ["docker", "build", "--network", "host"] - - for key, value in build_args_cfg.items(): - build_cmd.extend(["--build-arg", f"{key}={value}"]) - - for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) - - if proxy_val: - build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) - build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) - - private_sdk = platform_cfg.get("private_sdk", {}) - - if private_sdk: - source_env = private_sdk.get("source_env", "") - sdk_url = os.environ.get(source_env, "") if source_env else "" - - if sdk_url: - build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) - - build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) - - if dry_run: - print(f"[dry-run] {shlex.join(build_cmd)}") - - if push: - if not logged_in: - print("[dry-run] (skipping push: docker login failed)") - else: - print(f"[dry-run] docker push {commit_tag}") - print(f"[dry-run] docker push {latest_tag}") - - return True - - print(f"==> building {platform}: {commit_tag}", file=sys.stderr) - result = subprocess.run(build_cmd) - - if result.returncode != 0: - error = { - "stage": "build", - "platform": platform, - "tag": commit_tag, - "exit_code": result.returncode, - } - print(json.dumps(error), file=sys.stderr) - - return False - - if push: - if not logged_in: - print("error: docker login failed, cannot push", file=sys.stderr) - return False - - for tag in (commit_tag, latest_tag): - print(f"==> pushing {tag}", file=sys.stderr) - push_result = subprocess.run(["docker", 
"push", tag]) - - if push_result.returncode != 0: - error = { - "stage": "push", - "platform": platform, - "tag": tag, - "exit_code": push_result.returncode, - } - print(json.dumps(error), file=sys.stderr) - - return False - - return True - - -def main(): - parser = argparse.ArgumentParser(description="Build CI Docker images") - parser.add_argument( - "--platform", - type=str, - default="all", - help="Platform to build (nvidia, iluvatar, metax, moore, cambricon, ascend, or all). Default: all", - ) - parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - help="Path to config.yaml", - ) - parser.add_argument( - "--commit", - type=str, - default="HEAD", - help="Git ref for tagging the image (default: HEAD)", - ) - parser.add_argument( - "--push", - action="store_true", - help="Push images to registry after building (requires registry in config)", - ) - parser.add_argument( - "--force", - action="store_true", - help="Skip change detection and force build", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print commands without executing", - ) - args = parser.parse_args() - - config = load_config(args.config) - registry_cfg = config.get("registry", {}) - images_cfg = config.get("images", {}) - - if not images_cfg: - print("error: no `images` section in config", file=sys.stderr) - sys.exit(1) - - if args.platform == "all": - platforms = list(images_cfg.keys()) - else: - if args.platform not in images_cfg: - print( - f"error: platform `{args.platform}` not found in config", - file=sys.stderr, - ) - sys.exit(1) - platforms = [args.platform] - - commit = get_git_commit(args.commit) - logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True - failed = False - - for platform in platforms: - platform_cfg = images_cfg[platform] - dockerfile_dir = platform_cfg["dockerfile"] - - if not Path(dockerfile_dir).is_dir(): - print( - f"warning: dockerfile directory `{dockerfile_dir}` does 
not exist," - f" skipping {platform}", - file=sys.stderr, - ) - continue - - if not args.force and not has_dockerfile_changed(dockerfile_dir): - print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) - continue - - ok = build_image( - platform, - platform_cfg, - registry_cfg, - commit, - args.push, - args.dry_run, - logged_in=logged_in, - ) - - if not ok: - failed = True - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py deleted file mode 100644 index b23fee73..00000000 --- a/.ci/ci_resource.py +++ /dev/null @@ -1,575 +0,0 @@ -#!/usr/bin/env python3 -"""Resource detection and allocation for CI Runner Agent.""" - -import json -import operator -import os -import re -import shutil -import subprocess -import sys -import threading -from dataclasses import dataclass - -# Platform-to-device-env mapping for non-NVIDIA platforms. -# NVIDIA uses Docker's --gpus flag instead of an environment variable. -PLATFORM_DEVICE_ENV = { - "iluvatar": "CUDA_VISIBLE_DEVICES", - "metax": "CUDA_VISIBLE_DEVICES", - "moore": "MTHREADS_VISIBLE_DEVICES", - "cambricon": "MLU_VISIBLE_DEVICES", - "ascend": "ASCEND_VISIBLE_DEVICES", -} - - -@dataclass -class GpuInfo: - index: int - memory_used_mb: float - memory_total_mb: float - utilization_pct: float - - -@dataclass -class SystemResources: - total_memory_mb: float - available_memory_mb: float - cpu_count: int - - -class ResourcePool: - """Thread-safe GPU and system resource manager. - - Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi, mthreads-gmi) - and tracks allocations to enable dynamic parallel scheduling. 
- """ - - GPU_QUERY_TOOLS = { - "nvidia": "nvidia-smi", - "iluvatar": "ixsmi", - "metax": "mx-smi", - "moore": "mthreads-gmi", - "cambricon": "cnmon", - "ascend": "npu-smi", - } - - def __init__(self, platform, utilization_threshold=10): - self._platform = platform - self._utilization_threshold = utilization_threshold - self._allocated: set[int] = set() - self._lock = threading.Lock() - - @property - def platform(self): - return self._platform - - @property - def allocated(self): - with self._lock: - return set(self._allocated) - - def detect_gpus(self) -> list[GpuInfo]: - """Query GPU status via platform-specific CLI tool.""" - if self._platform == "metax": - return self._detect_gpus_metax() - - if self._platform == "moore": - return self._detect_gpus_moore() - - if self._platform == "cambricon": - return self._detect_gpus_cambricon() - - if self._platform == "ascend": - return self._detect_gpus_ascend() - - tool = self.GPU_QUERY_TOOLS.get(self._platform) - - if not tool: - return [] - - try: - result = subprocess.run( - [ - tool, - "--query-gpu=index,memory.used,memory.total,utilization.gpu", - "--format=csv,noheader,nounits", - ], - capture_output=True, - text=True, - timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - if result.returncode != 0: - return [] - - gpus = [] - - for line in result.stdout.strip().splitlines(): - parts = [p.strip() for p in line.split(",")] - - if len(parts) < 4: - continue - - try: - gpus.append( - GpuInfo( - index=int(parts[0]), - memory_used_mb=float(parts[1]), - memory_total_mb=float(parts[2]), - utilization_pct=float(parts[3]), - ) - ) - except (ValueError, IndexError): - continue - - return gpus - - def _detect_gpus_metax(self) -> list[GpuInfo]: - """Parse mx-smi output for MetaX GPUs. - - Runs --show-memory and --show-usage separately and merges results. 
- Output format example: - GPU#0 MXC550 0000:1a:00.0 - Memory - vis_vram total : 67108864 KB - vis_vram used : 879032 KB - Utilization - GPU : 0 % - """ - - def run_mxsmi(flag): - try: - r = subprocess.run( - ["mx-smi", flag], - capture_output=True, - text=True, - timeout=10, - ) - return r.stdout if r.returncode == 0 else "" - except (FileNotFoundError, subprocess.TimeoutExpired): - return "" - - mem_out = run_mxsmi("--show-memory") - util_out = run_mxsmi("--show-usage") - - # Parse memory: collect {index: (used_kb, total_kb)} - mem = {} - current = None - for line in mem_out.splitlines(): - m = re.match(r"GPU#(\d+)", line.strip()) - if m: - current = int(m.group(1)) - mem[current] = [0.0, 0.0] - continue - if current is None: - continue - m = re.search(r"vis_vram total\s*:\s*([\d.]+)\s*KB", line) - if m: - mem[current][1] = float(m.group(1)) / 1024 # KB -> MB - m = re.search(r"vis_vram used\s*:\s*([\d.]+)\s*KB", line) - if m: - mem[current][0] = float(m.group(1)) / 1024 # KB -> MB - - # Parse utilization: collect {index: utilization_pct} - util = {} - current = None - in_util = False - for line in util_out.splitlines(): - m = re.match(r"GPU#(\d+)", line.strip()) - if m: - current = int(m.group(1)) - in_util = False - continue - if current is None: - continue - if "Utilization" in line: - in_util = True - continue - if in_util: - m = re.match(r"\s*GPU\s*:\s*([\d.]+)\s*%", line) - if m: - util[current] = float(m.group(1)) - in_util = False - - gpus = [] - for idx in sorted(mem): - used_mb, total_mb = mem[idx] - gpus.append( - GpuInfo( - index=idx, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util.get(idx, 0.0), - ) - ) - return gpus - - def _detect_gpus_moore(self) -> list[GpuInfo]: - """Parse mthreads-gmi JSON output for Moore Threads GPUs. 
    def _detect_gpus_moore(self) -> list[GpuInfo]:
        """Parse mthreads-gmi JSON output for Moore Threads GPUs.

        Uses: mthreads-gmi -q --json
        Expected JSON structure:
            {
              "Attached GPUs": {
                "GPU 00000000:3B:00.0": {
                  "Minor Number": "0",
                  "Memory Usage": {"Total": "24576 MiB", "Used": "512 MiB"},
                  "Utilization": {"Gpu": "5 %"}
                }
              }
            }
        """

        def extract_number(s):
            # Pull the leading numeric value out of strings like "512 MiB".
            m = re.search(r"([\d.]+)", str(s))
            return float(m.group(1)) if m else 0.0

        try:
            result = subprocess.run(
                ["mthreads-gmi", "-q", "--json"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return []

        if result.returncode != 0:
            return []

        try:
            data = json.loads(result.stdout)
        except json.JSONDecodeError:
            return []

        gpus = []
        attached = data.get("Attached GPUs", {})

        for gpu_data in attached.values():
            try:
                # Fall back to the running count if "Minor Number" is missing.
                index = int(gpu_data.get("Minor Number", len(gpus)))

                mem = gpu_data.get("Memory Usage", {})
                total_mb = extract_number(mem.get("Total", "0 MiB"))
                used_mb = extract_number(mem.get("Used", "0 MiB"))
                util_pct = extract_number(
                    gpu_data.get("Utilization", {}).get("Gpu", "0 %")
                )

                gpus.append(
                    GpuInfo(
                        index=index,
                        memory_used_mb=used_mb,
                        memory_total_mb=total_mb,
                        utilization_pct=util_pct,
                    )
                )
            except (ValueError, AttributeError):
                continue

        return sorted(gpus, key=operator.attrgetter("index"))

    def _detect_gpus_cambricon(self) -> list[GpuInfo]:
        """Parse cnmon output for Cambricon MLU cards.

        Each card appears as two consecutive data rows:
            Row 1: | {card} {vf} {name} {fw} | {bus_id} | {util}% {ecc} |
            Row 2: | {fan}% {temp} {pwr} | {mem_used} MiB/ {mem_total} MiB | ... |
        """
        try:
            result = subprocess.run(
                ["cnmon"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return []

        if result.returncode != 0:
            return []

        gpus = []
        lines = result.stdout.splitlines()
        i = 0

        while i < len(lines):
            line = lines[i]
            # Row 1: "| {index} ... | {bus_id} | {util}% {ecc} |"
            m1 = re.match(r"^\|\s+(\d+)\s+.*\|\s*([\d.]+)%", line)

            if m1 and i + 1 < len(lines):
                try:
                    card_index = int(m1.group(1))
                    util_pct = float(m1.group(2))
                    row2 = lines[i + 1]
                    mem_m = re.search(r"([\d.]+)\s+MiB/\s*([\d.]+)\s+MiB", row2)

                    if mem_m:
                        used_mb = float(mem_m.group(1))
                        total_mb = float(mem_m.group(2))
                    else:
                        used_mb, total_mb = 0.0, 0.0

                    gpus.append(
                        GpuInfo(
                            index=card_index,
                            memory_used_mb=used_mb,
                            memory_total_mb=total_mb,
                            utilization_pct=util_pct,
                        )
                    )
                except (ValueError, AttributeError):
                    pass
                # Consume both rows of the card entry.
                i += 2
                continue

            i += 1

        return sorted(gpus, key=operator.attrgetter("index"))

    def _detect_gpus_ascend(self) -> list[GpuInfo]:
        """Parse npu-smi info output for Huawei Ascend NPUs.

        Output format (pipe-delimited table, two rows per NPU):
            | 0 910B4 | OK | 86.5 41 ...
            | 0 | 0000:C1:00.0 | 0 0 / 0 2789 / 32768 |
        Row 1: index, name, health, power, temp, hugepages.
        Row 2: chip_id, bus_id, aicore_util, memory_usage, hbm_usage.
        """
        try:
            result = subprocess.run(
                ["npu-smi", "info"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return []

        if result.returncode != 0:
            return []

        gpus = []
        lines = result.stdout.splitlines()
        i = 0

        while i < len(lines):
            line = lines[i]
            # Match row 1: `| {index} {name} ...`.
            m1 = re.match(r"^\|\s+(\d+)\s+", line)

            if m1 and i + 1 < len(lines):
                try:
                    npu_index = int(m1.group(1))
                    aicore_m = re.match(
                        r"^\|\s+\d+\s+\|\s+[\da-f:.]+\s+\|\s*([\d.]+)\s", lines[i + 1]
                    )

                    util_pct = float(aicore_m.group(1)) if aicore_m else 0.0

                    # Parse HBM usage from row 2. Row contains both DDR
                    # ("0 / 0") and HBM ("2789 / 32768"); HBM is always last.
                    hbm_matches = re.findall(r"([\d.]+)\s*/\s*([\d.]+)", lines[i + 1])

                    if hbm_matches:
                        used_mb = float(hbm_matches[-1][0])
                        total_mb = float(hbm_matches[-1][1])
                    else:
                        used_mb, total_mb = 0.0, 0.0

                    gpus.append(
                        GpuInfo(
                            index=npu_index,
                            memory_used_mb=used_mb,
                            memory_total_mb=total_mb,
                            utilization_pct=util_pct,
                        )
                    )
                except (ValueError, AttributeError):
                    pass

                i += 2
                continue

            i += 1

        return sorted(gpus, key=operator.attrgetter("index"))
    def detect_system_resources(self) -> SystemResources:
        """Read system memory from /proc/meminfo and CPU count.

        Missing /proc/meminfo (non-Linux) yields zeros, which callers treat
        as "no memory information" rather than an error.
        """
        total_mb = 0.0
        available_mb = 0.0

        try:
            with open("/proc/meminfo", encoding="utf-8") as f:
                for line in f:
                    # Values in /proc/meminfo are reported in kB.
                    if line.startswith("MemTotal:"):
                        total_mb = float(line.split()[1]) / 1024
                    elif line.startswith("MemAvailable:"):
                        available_mb = float(line.split()[1]) / 1024
        except OSError:
            pass

        return SystemResources(
            total_memory_mb=total_mb,
            available_memory_mb=available_mb,
            cpu_count=os.cpu_count() or 1,
        )

    def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]:
        """Try to allocate GPUs and check memory.

        Returns (allocated_gpu_ids, success). On failure returns ([], False).
        GPUs are selected by ascending utilization (least loaded first).
        Detection runs outside the lock to avoid blocking other threads.
        """
        if gpu_count <= 0:
            # GPU-less request: only the memory check applies.
            if memory_mb > 0:
                sys_res = self.detect_system_resources()

                if sys_res.available_memory_mb < memory_mb:
                    return ([], False)

            return ([], True)

        # Detect GPUs and memory outside the lock (subprocess.run can block).
        gpus = self.detect_gpus()
        sys_res = self.detect_system_resources() if memory_mb > 0 else None

        with self._lock:
            # A GPU is free if not reserved by us and below the busy threshold.
            available = [
                g
                for g in gpus
                if g.index not in self._allocated
                and g.utilization_pct < self._utilization_threshold
            ]

            if len(available) < gpu_count:
                return ([], False)

            if sys_res is not None and sys_res.available_memory_mb < memory_mb:
                return ([], False)

            # Pick least loaded GPUs.
            available.sort(key=lambda g: g.utilization_pct)
            selected = [g.index for g in available[:gpu_count]]
            self._allocated.update(selected)
            return (selected, True)

    def release(self, gpu_ids):
        """Return GPUs to the free pool."""
        with self._lock:
            self._allocated -= set(gpu_ids)

    def get_status(self) -> dict:
        """Return current resource status for API endpoints."""
        gpus = self.detect_gpus()
        sys_res = self.detect_system_resources()

        with self._lock:
            allocated = sorted(self._allocated)

        return {
            "platform": self._platform,
            "gpus": [
                {
                    "index": g.index,
                    "memory_used_mb": g.memory_used_mb,
                    "memory_total_mb": g.memory_total_mb,
                    "utilization_pct": g.utilization_pct,
                    "allocated_by_agent": g.index in allocated,
                }
                for g in gpus
            ],
            "allocated_gpu_ids": allocated,
            "system": {
                "total_memory_mb": round(sys_res.total_memory_mb, 1),
                "available_memory_mb": round(sys_res.available_memory_mb, 1),
                "cpu_count": sys_res.cpu_count,
            },
            "utilization_threshold": self._utilization_threshold,
        }
def parse_gpu_requirement(job_config) -> int:
    """Extract GPU count required by a job.

    Resolution rules:

    - ``gpu_ids: "auto"`` (or omitted) — dynamic allocation; returns ``ngpus``
      (default 1).
    - ``gpu_ids: "all"`` — use every available GPU; returns 0 (no reservation).
    - ``gpu_ids: "0,2"`` — static pinning; returns the count of listed IDs.
      When ``ngpus`` is also present the two must agree.

    The platform name determines how GPUs are exposed to Docker (see
    ``PLATFORM_DEVICE_ENV``) but does **not** affect GPU counting here.
    """
    resources = job_config.get("resources", {})
    gpu_ids = str(resources.get("gpu_ids", "auto")).strip()
    ngpus = resources.get("ngpus")

    if gpu_ids == "all":
        return 0

    if gpu_ids == "auto" or not gpu_ids:
        return int(ngpus) if ngpus is not None else 1

    # Static pinning — count explicit IDs.
    count = len(gpu_ids.split(","))

    if ngpus is not None and int(ngpus) != count:
        print(
            f"warning: gpu_ids has {count} device(s) but ngpus={ngpus}; "
            f"using gpu_ids count ({count})",
            file=sys.stderr,
        )

    return count


# Recognized memory unit suffixes -> multiplier to megabytes. Ordered so
# two-letter suffixes are tried before their one-letter prefixes.
_MEMORY_UNIT_FACTORS = {
    "tb": 1024 * 1024,
    "t": 1024 * 1024,
    "gb": 1024,
    "g": 1024,
    "mb": 1,
    "m": 1,
    "kb": 1 / 1024,
    "k": 1 / 1024,
}


def parse_memory_requirement(job_config) -> float:
    """Extract memory requirement in MB from a job config.

    Accepts values like "32GB", "16g", "512MB", "512m", "1TB", "2048KB",
    or a bare number (interpreted as GB). Returns 0 when the value is
    missing or unparseable (with a warning on stderr).

    Fixes over the previous version: a malformed number before a known
    suffix (e.g. "xGB") no longer raises an uncaught ValueError, and the
    kb/k and tb/t suffixes are now recognized instead of parsing as 0.
    """
    resources = job_config.get("resources", {})
    memory = str(resources.get("memory", ""))

    if not memory:
        return 0

    memory = memory.lower().strip()

    for suffix, factor in _MEMORY_UNIT_FACTORS.items():
        if memory.endswith(suffix):
            try:
                return float(memory[: -len(suffix)]) * factor
            except ValueError:
                # Recognized suffix but bad number: fall through to warning.
                break

    try:
        return float(memory) * 1024  # Default: GB
    except ValueError:
        print(
            f"warning: unrecognized memory format {memory!r}, treating as 0",
            file=sys.stderr,
        )

    return 0


def detect_platform():
    """Auto-detect the current platform by probing GPU query tools on PATH.

    Returns the first platform whose query tool exists, or None when no
    known tool is installed.
    """
    for platform, tool in ResourcePool.GPU_QUERY_TOOLS.items():
        if shutil.which(tool):
            return platform

    return None
parse_repo_url(url): - """Extract (owner, repo) from a GitHub URL. - - Handles: - - https://github.com/Owner/Repo.git - - git@github.com:Owner/Repo.git - """ - # HTTPS format - m = re.match(r"https?://[^/]+/([^/]+)/([^/]+?)(?:\.git)?$", url) - - if m: - return m.group(1), m.group(2) - - # SSH format - m = re.match(r"git@[^:]+:([^/]+)/([^/]+?)(?:\.git)?$", url) - - if m: - return m.group(1), m.group(2) - - return "", "" - - -def build_status_context(prefix, job_name): - """Build status context string, e.g. 'ci/infiniops/nvidia_gpu'.""" - return f"{prefix}/{job_name}" - - -def post_commit_status( - owner, - repo, - sha, - state, - context, - description, - target_url=None, - token=None, -): - """Post a commit status to GitHub. - - Args: - state: One of 'pending', 'success', 'failure', 'error'. - Returns True on success, False on failure. - """ - token = token or os.environ.get("GITHUB_TOKEN", "") - - if not token: - print("warning: GITHUB_TOKEN not set, skipping status update", file=sys.stderr) - return False - - if not owner or not repo or not sha: - print( - "warning: missing owner/repo/sha, skipping status update", file=sys.stderr - ) - return False - - url = f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}" - body = { - "state": state, - "context": context, - "description": description[:140], - } - - if target_url: - body["target_url"] = target_url - - data = json.dumps(body).encode("utf-8") - req = urllib.request.Request( - url, - data=data, - headers={ - "Authorization": f"token {token}", - "Accept": "application/vnd.github.v3+json", - "Content-Type": "application/json", - }, - method="POST", - ) - - try: - with urllib.request.urlopen(req, timeout=30) as resp: - return 200 <= resp.status < 300 - except urllib.error.HTTPError as e: - print( - f"warning: GitHub status API returned {e.code}: {e.reason}", - file=sys.stderr, - ) - return False - except urllib.error.URLError as e: - print(f"warning: GitHub status API error: {e.reason}", file=sys.stderr) - 
return False diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile deleted file mode 100644 index a542b99e..00000000 --- a/.ci/images/ascend/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG PIP_INDEX_URL=https://pypi.org/simple - -RUN pip install --no-cache-dir --progress off \ - scikit-build-core \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - ruff - -ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest -ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH} -ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH} -ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH} -ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} -ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp -ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit - -WORKDIR /workspace diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile deleted file mode 100644 index 138f3cb4..00000000 --- a/.ci/images/cambricon/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -# Python 3.10 executables (`pip`-installed tools) live under `/usr/local/python3.10/bin`. -ENV PATH=/usr/local/python3.10/bin:${PATH} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -# `git` and `cmake` are pre-installed; `coreutils-single` covers coreutils needs. 
-RUN dnf install -y ninja-build && dnf clean all - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - ruff==0.15.7 - -# Pin pre-installed Cambricon `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile deleted file mode 100644 index 79afc858..00000000 --- a/.ci/images/iluvatar/Dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# CoreX runtime environment (base image sets these in `/etc/bash.bashrc`, -# but `docker build` `RUN` uses `/bin/sh` which doesn't source it). 
-ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages -ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - ninja-build \ - coreutils \ - && rm -rf /var/lib/apt/lists/* - -RUN ln -sf $(which python3) /usr/local/bin/python 2>/dev/null || true - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -RUN pip config set global.index-url https://pypi.org/simple - -# Pin pre-installed CoreX `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile deleted file mode 100644 index 540bc9d5..00000000 --- a/.ci/images/metax/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# `conda` Python is used in this image. 
-ENV PATH=/opt/conda/bin:${PATH} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - cmake \ - ninja-build \ - coreutils \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -# Pin pre-installed MetaX `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile deleted file mode 100644 index a95d9bd1..00000000 --- a/.ci/images/moore/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# `MUSA_HOME`, `PATH`, `LD_LIBRARY_PATH` already set by base image. - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - ninja-build \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - libclang \ - pytest-cov \ - pytest-xdist \ - ruff==0.15.7 - -# Pin pre-installed `torch` to prevent `pip` from replacing it with upstream version. 
-RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile deleted file mode 100644 index b4984dac..00000000 --- a/.ci/images/nvidia/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - cmake \ - ninja-build \ - coreutils \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -# Pin pre-installed `torch` to prevent `pip` from replacing it with a different version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/restart-agent.sh b/.ci/restart-agent.sh deleted file mode 100755 index efe0a900..00000000 --- a/.ci/restart-agent.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# Usage: bash .ci/restart-agent.sh [port] [webhook-secret] -# -# Restart the CI agent with proxy configured. 
-# Edit the HTTPS_PROXY line below for your environment, then: -# bash .ci/restart-agent.sh -# bash .ci/restart-agent.sh 8080 my-webhook-secret - -set -euo pipefail - -PORT="${1:-8080}" -WEBHOOK_SECRET="${2:-}" - -# --- Proxy config (edit this) --- -export HTTPS_PROXY="http://your-proxy:port" -export HTTP_PROXY="$HTTPS_PROXY" -export NO_PROXY="localhost,127.0.0.1" -export https_proxy="$HTTPS_PROXY" -export http_proxy="$HTTP_PROXY" -export no_proxy="$NO_PROXY" - -# --- Kill existing agent --- -if pgrep -f "agent.py serve" > /dev/null 2>&1; then - echo "Stopping existing agent..." - pkill -f "agent.py serve" || true - sleep 2 -fi - -# --- Start agent --- -CI_DIR="$(cd "$(dirname "$0")" && pwd)" - -if [ ! -f "$CI_DIR/agent.py" ]; then - echo "error: $CI_DIR/agent.py not found" - exit 1 -fi - -ARGS="serve --port $PORT" -if [ -n "$WEBHOOK_SECRET" ]; then - ARGS="$ARGS --webhook-secret $WEBHOOK_SECRET" -fi - -echo "Starting CI agent on port $PORT..." -nohup python "$CI_DIR/agent.py" $ARGS > /tmp/ci-agent.log 2>&1 & - -HOST_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || hostname) - -echo "PID: $!" -echo "Listen: http://${HOST_IP}:${PORT}" -echo "Log: /tmp/ci-agent.log" -echo "Proxy: $HTTPS_PROXY" diff --git a/.ci/run.py b/.ci/run.py deleted file mode 100644 index e293b4a2..00000000 --- a/.ci/run.py +++ /dev/null @@ -1,499 +0,0 @@ -#!/usr/bin/env python3 -"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" - -import argparse -import os -import re -import shlex -import subprocess -import sys -import uuid -import xml.etree.ElementTree as ET -from datetime import datetime -from pathlib import Path - -from ci_resource import ( - PLATFORM_DEVICE_ENV, - ResourcePool, - detect_platform, - parse_gpu_requirement, - parse_memory_requirement, -) -from utils import get_git_commit, load_config - -# Flags that consume the next token as their value (e.g. -n 4, -k expr). 
-_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"} - - -def _junit_xml_indicates_pass(results_dir): - """Return True if `pytest` junit XML under `results_dir` reports no failures/errors. - - Used to distinguish a real CI failure from the docker 18.09 - container-teardown `SIGKILL` (exit code 137) that occurs on this host - after a child process exits successfully — bash returns 0 from inside - the container, but the docker daemon reports 137 due to a race in its - `--rm` cleanup path. The junit XML is written by pytest before that - teardown and reliably captures the real outcome of the test stage. - """ - for junit in Path(results_dir).rglob("test-results.xml"): - try: - root = ET.parse(junit).getroot() - except ET.ParseError: - continue - - suites = root.findall("testsuite") if root.tag == "testsuites" else [root] - - if not suites: - continue - - for suite in suites: - try: - if int(suite.get("failures", 0)) > 0: - return False - - if int(suite.get("errors", 0)) > 0: - return False - except ValueError: - return False - - return True - - return False - - -def apply_test_override(run_cmd, test_path): - """Replace positional test path(s) in a pytest stage command. - - For example: ``pytest tests/ -n 4 ...`` becomes - ``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is - ``tests/test_gemm.py``. - """ - parts = shlex.split(run_cmd) - - if not parts or parts[0] != "pytest": - return run_cmd - - result = ["pytest", test_path] - skip_next = False - - for p in parts[1:]: - if skip_next: - result.append(p) - skip_next = False - continue - - if p.startswith("-"): - result.append(p) - if p in _PYTEST_VALUE_FLAGS: - skip_next = True - continue - - # Skip existing test paths; the override is already in result[1]. 
- if not ("/" in p or p.endswith(".py") or "::" in p): - result.append(p) - - return shlex.join(result) - - -def build_results_dir(base, platform, stages, commit): - """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}_{id}`.""" - stage_names = "+".join(s["name"] for s in stages) - safe_commit = re.sub(r"[^a-zA-Z0-9._-]", "", commit) or "unknown" - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - short_id = uuid.uuid4().hex[:6] - dirname = f"{platform}_{stage_names}_{safe_commit}_{timestamp}_{short_id}" - - return Path(base) / dirname - - -def resolve_image(config, platform, image_tag): - """Resolve an image reference to a full image name. - - Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config - contains a registry section, returns a registry-prefixed URL. Otherwise - returns a local tag (current default). - """ - registry = config.get("registry", {}) - registry_url = registry.get("url", "") - project = registry.get("project", "infiniops") - - if not registry_url: - return f"{project}-ci/{platform}:{image_tag}" - - return f"{registry_url}/{project}/{platform}:{image_tag}" - - -def build_runner_script(): - return r""" -set -e -cd /workspace -mkdir -p /workspace/results -if [ -n "$LOCAL_SRC" ]; then - cp -r "$LOCAL_SRC" /tmp/src - cd /tmp/src -else - git clone "$REPO_URL" repo - cd repo - git checkout "$BRANCH" -fi -echo "========== Setup ==========" -eval "$SETUP_CMD" -set +e -rc=0 -for i in $(seq 1 "$NUM_STAGES"); do - name_var="STAGE_${i}_NAME" - cmd_var="STAGE_${i}_CMD" - name="${!name_var}" - cmd="${!cmd_var}" - echo "========== Stage: $name ==========" - if [ -n "$cmd" ]; then - eval "$cmd" - rc=$? 
- if [ $rc -ne 0 ]; then - echo "Stage '$name' failed with exit code $rc" - break - fi - fi -done -echo "========== Summary ==========" -if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then - chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true -fi -exit $rc -""" - - -def build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - workdir, - image_tag_override, - gpu_id_override=None, - results_dir=None, - local_path=None, -): - job = config["jobs"][job_name] - platform = job.get("platform", "nvidia") - image_tag = image_tag_override or job.get("image", "latest") - image = resolve_image(config, platform, image_tag) - resources = job.get("resources", {}) - setup_raw = job.get("setup", "pip install .[dev]") - - if isinstance(setup_raw, list): - setup_cmd = "\n".join(setup_raw) - else: - setup_cmd = setup_raw - - args = [ - "docker", - "run", - "--rm", - "--network", - "host", - "-i", - "-w", - workdir, - "-e", - f"REPO_URL={repo_url}", - "-e", - f"BRANCH={branch}", - "-e", - f"SETUP_CMD={setup_cmd}", - "-e", - f"NUM_STAGES={len(stages)}", - "-e", - f"HOST_UID={os.getuid()}", - "-e", - f"HOST_GID={os.getgid()}", - ] - - for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) - - if proxy_val: - args.extend(["-e", f"{proxy_var}={proxy_val}"]) - args.extend(["-e", f"{proxy_var.lower()}={proxy_val}"]) - - for key, value in job.get("env", {}).items(): - args.extend(["-e", f"{key}={value}"]) - - if results_dir: - args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) - - if local_path: - args.extend(["-v", f"{local_path}:/workspace/repo:ro"]) - args.extend(["-e", "LOCAL_SRC=/workspace/repo"]) - - for i, s in enumerate(stages): - args.append("-e") - args.append(f"STAGE_{i + 1}_NAME={s['name']}") - args.append("-e") - args.append(f"STAGE_{i + 1}_CMD={s.get('run', '')}") - - # Platform-specific device access - for flag in job.get("docker_args", []): - 
args.append(flag) - - for vol in job.get("volumes", []): - args.extend(["-v", vol]) - - raw_gpu_ids = str(resources.get("gpu_ids", "auto")).strip() - gpu_id = gpu_id_override or ("" if raw_gpu_ids == "auto" else raw_gpu_ids) - - if gpu_id: - if platform == "nvidia": - args.extend(["--gpus", "all" if gpu_id == "all" else f"device={gpu_id}"]) - elif gpu_id != "all": - device_env = PLATFORM_DEVICE_ENV.get(platform) - - if device_env: - args.extend(["-e", f"{device_env}={gpu_id}"]) - - memory = resources.get("memory") - - if memory: - mem = str(memory).lower().replace("gb", "g").replace("mb", "m") - - if not mem.endswith("g") and not mem.endswith("m"): - mem = f"{mem}g" - - args.extend(["--memory", mem]) - - shm_size = resources.get("shm_size") - - if shm_size: - args.extend(["--shm-size", str(shm_size)]) - - timeout_sec = resources.get("timeout") - args.append(image) - - if timeout_sec: - # Requires coreutils `timeout` inside the container image. - args.extend(["timeout", str(timeout_sec)]) - - args.extend(["bash", "-c", build_runner_script().strip()]) - - return args - - -def resolve_job_names(jobs, platform=None, job=None): - """Resolve job names for a platform. - - - ``job=None`` — all jobs for the platform. - - ``job="nvidia_gpu"`` — direct lookup by full name. - - Raises ``ValueError`` if no matching jobs are found. 
- """ - if job: - if job not in jobs: - raise ValueError(f"job {job!r} not found in config") - - return [job] - - if not platform: - return list(jobs.keys()) - - matches = [name for name, cfg in jobs.items() if cfg.get("platform") == platform] - - if not matches: - raise ValueError(f"no jobs for platform {platform!r}") - - return matches - - -def main(): - parser = argparse.ArgumentParser(description="Run Docker CI pipeline") - parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - help="Path to config.yaml", - ) - parser.add_argument( - "--branch", type=str, help="Override repo branch (default: config repo.branch)" - ) - parser.add_argument( - "--job", - type=str, - help="Job name (e.g. nvidia_gpu, ascend_npu). Default: all jobs for detected platform", - ) - parser.add_argument( - "--stage", - type=str, - help="Run only this stage name (still runs setup first)", - ) - parser.add_argument( - "--image-tag", - type=str, - help="Override image tag (stable, latest, or commit hash)", - ) - parser.add_argument( - "--gpu-id", - type=str, - help='GPU device IDs to use, e.g. "0", "0,2", "all"', - ) - parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - help="Base directory for test results (default: ./ci-results)", - ) - parser.add_argument( - "--test", - type=str, - help='Override pytest test path, e.g. 
"tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', - ) - parser.add_argument( - "--local", - action="store_true", - help="Mount current directory (read-only) into the container instead of cloning from git", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print docker command and exit", - ) - args = parser.parse_args() - - config = load_config(args.config) - repo = config.get("repo", {}) - repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") - branch = args.branch or repo.get("branch", "master") - - platform = detect_platform() - - if not platform: - tools = ", ".join(ResourcePool.GPU_QUERY_TOOLS.values()) - print(f"error: could not detect platform (no {tools} found)", file=sys.stderr) - sys.exit(1) - - print(f"platform: {platform}", file=sys.stderr) - - jobs = config.get("jobs", {}) - - if not jobs: - print("error: no jobs in config", file=sys.stderr) - sys.exit(1) - - try: - job_names = resolve_job_names(jobs, platform, job=args.job) - except ValueError as e: - print(f"error: {e}", file=sys.stderr) - sys.exit(1) - - pool = ResourcePool(platform) - failed = 0 - - for job_name in job_names: - job = jobs[job_name] - all_stages = job.get("stages", []) - - if args.stage: - stages = [s for s in all_stages if s["name"] == args.stage] - - if not stages: - print( - f"error: stage {args.stage!r} not found in {job_name}", - file=sys.stderr, - ) - sys.exit(1) - else: - stages = all_stages - - if args.test: - stages = [ - {**s, "run": apply_test_override(s.get("run", ""), args.test)} - for s in stages - ] - - # Resolve GPU assignment: CLI override > auto-allocate > static config. 
- gpu_id_override = args.gpu_id - allocated_ids = [] - raw_gpu_ids = str(job.get("resources", {}).get("gpu_ids", "auto")).strip() - - if not gpu_id_override and raw_gpu_ids == "auto": - gpu_count = parse_gpu_requirement(job) - memory_mb = parse_memory_requirement(job) - allocated_ids, ok = pool.allocate(gpu_count, memory_mb) - - if not ok: - detected = pool.detect_gpus() - if not detected: - hint = ( - f"error: cannot allocate {gpu_count} GPU(s) for {job_name}" - f" — GPU detection returned no devices" - f" (is {ResourcePool.GPU_QUERY_TOOLS.get(platform, '?')} working?)" - f"\nhint: use --gpu-id 0 to bypass auto-allocation" - ) - else: - hint = ( - f"error: cannot allocate {gpu_count} GPU(s) for {job_name}" - f" — {len(detected)} GPU(s) detected but none available" - f" (utilization threshold: {pool._utilization_threshold}%)" - f"\nhint: use --gpu-id 0 to bypass auto-allocation" - ) - print(hint, file=sys.stderr) - failed += 1 - continue - - if allocated_ids: - gpu_id_override = ",".join(str(g) for g in allocated_ids) - - job_platform = job.get("platform", platform) - commit = get_git_commit() - results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) - - local_path = Path.cwd().resolve() if args.local else None - docker_args = build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - "/workspace", - args.image_tag, - gpu_id_override=gpu_id_override, - results_dir=results_dir, - local_path=local_path, - ) - - if args.dry_run: - print(shlex.join(docker_args)) - pool.release(allocated_ids) - continue - - print(f"==> running job: {job_name}", file=sys.stderr) - results_dir.mkdir(parents=True, exist_ok=True) - - try: - returncode = subprocess.run(docker_args).returncode - finally: - pool.release(allocated_ids) - - if returncode != 0: - # Docker 18.09 on this host occasionally SIGKILLs containers - # during `--rm` cleanup after the inner process already exited - # cleanly, producing exit code 137. 
Fall back to the pytest - # junit XML to recover the real outcome in that case. - if returncode == 137 and _junit_xml_indicates_pass(results_dir): - print( - f"[warn] job {job_name}: container exited with 137 " - f"(likely docker teardown SIGKILL after clean pytest); " - f"junit XML reports no failures — treating as success", - file=sys.stderr, - ) - else: - print( - f"job {job_name} failed (exit code {returncode})", - file=sys.stderr, - ) - failed += 1 - - sys.exit(1 if failed else 0) - - -if __name__ == "__main__": - main() diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py deleted file mode 100644 index 7b028764..00000000 --- a/.ci/tests/conftest.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys -from pathlib import Path - -# Allow `import run` and `import build` directly. -sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) - -import pytest - -from utils import normalize_config - - -@pytest.fixture -def minimal_config(): - """Minimal platform-centric config, normalized to flat format.""" - raw = { - "repo": { - "url": "https://github.com/InfiniTensor/InfiniOps.git", - "branch": "master", - }, - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [ - { - "name": "test", - "run": "pytest tests/ -v", - } - ], - } - }, - } - }, - } - return normalize_config(raw) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py deleted file mode 100644 index a0c8cccc..00000000 --- a/.ci/tests/test_agent.py +++ /dev/null @@ -1,724 +0,0 @@ -import hashlib -import hmac -import json -import threading -from pathlib import Path -from unittest.mock import MagicMock - -import pytest - -import 
agent -import ci_resource as res -import run -from utils import normalize_config - - -# --------------------------------------------------------------------------- -# Test fixtures. -# --------------------------------------------------------------------------- - - -@pytest.fixture -def agent_config(): - raw = { - "repo": { - "url": "https://github.com/InfiniTensor/InfiniOps.git", - "branch": "master", - }, - "github": { - "status_context_prefix": "ci/infiniops", - }, - "agents": { - "nvidia": {"url": "http://nvidia-host:8080"}, - "iluvatar": {"url": "http://iluvatar-host:8080"}, - }, - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - }, - }, - }, - "iluvatar": { - "image": { - "dockerfile": ".ci/images/iluvatar/", - "build_args": {"BASE_IMAGE": "corex:qs_pj20250825"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - }, - }, - }, - }, - } - return normalize_config(raw) - - -@pytest.fixture -def mock_resource_pool(): - pool = MagicMock(spec=res.ResourcePool) - pool.platform = "nvidia" - pool.allocate.return_value = ([0], True) - pool.release.return_value = None - pool.get_status.return_value = { - "platform": "nvidia", - "gpus": [], - "allocated_gpu_ids": [], - "system": {}, - } - return pool - - -# --------------------------------------------------------------------------- -# Tests for `resolve_job_names`. 
-# --------------------------------------------------------------------------- - - -def test_resolve_job_names_by_name(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], job="nvidia_gpu") - assert jobs == ["nvidia_gpu"] - - -def test_resolve_job_names_by_platform(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], platform="nvidia") - assert jobs == ["nvidia_gpu"] - - -def test_resolve_job_names_by_platform_iluvatar(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], platform="iluvatar") - assert jobs == ["iluvatar_gpu"] - - -def test_resolve_job_names_all(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"]) - assert set(jobs) == {"nvidia_gpu", "iluvatar_gpu"} - - -def test_resolve_job_names_invalid(agent_config): - with pytest.raises(ValueError, match="not_exist"): - run.resolve_job_names(agent_config["jobs"], job="not_exist") - - -# --------------------------------------------------------------------------- -# Tests for `verify_signature`. -# --------------------------------------------------------------------------- - - -def test_verify_signature_valid(): - secret = "my-secret" - body = b'{"action": "push"}' - sig = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() - assert agent.verify_signature(secret, body, sig) is True - - -def test_verify_signature_invalid(): - assert agent.verify_signature("secret", b"body", "sha256=wrong") is False - - -def test_verify_signature_empty(): - assert agent.verify_signature("secret", b"body", "") is False - - -# --------------------------------------------------------------------------- -# Tests for `JobRequest` and `JobResult`. 
-# --------------------------------------------------------------------------- - - -def test_job_request_fields(agent_config): - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - assert req.job_name == "nvidia_gpu" - assert req.platform == "nvidia" - assert req.commit_sha == "abc123" - assert len(req.job_id) == 8 - d = req.to_dict() - assert d["job_name"] == "nvidia_gpu" - - -def test_job_result_success(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 42.5) - assert r.state == "success" - - -def test_job_result_failure(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 1, Path("/tmp/res"), 10.0) - assert r.state == "failure" - - -# --------------------------------------------------------------------------- -# Tests for the `Scheduler` class. -# --------------------------------------------------------------------------- - - -def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch): - monkeypatch.setattr("subprocess.run", lambda cmd, **kw: MagicMock(returncode=0)) - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - results_dir=Path("/tmp/test-results"), - no_status=True, - dry_run=True, - ) - req = agent.JobRequest( - "nvidia_gpu", - "master", - "abc123", - agent_config, - results_dir=Path("/tmp/test-results"), - ) - scheduler.submit(req) - results = scheduler.wait_all() - assert len(results) == 1 - assert results[0].state == "success" - - -def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): - pool = MagicMock(spec=res.ResourcePool) - pool.allocate.return_value = ([], False) - pool.get_status.return_value = { - "platform": "nvidia", - "gpus": [], - "allocated_gpu_ids": [], - "system": {}, - } - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - pool, - no_status=True, - dry_run=False, - ) - - req = agent.JobRequest("nvidia_gpu", "master", "abc123", 
agent_config) - scheduler.submit(req) - - info = scheduler.get_job(req.job_id) - assert info["state"] == "queued" - - -def test_scheduler_get_status(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - - status = scheduler.get_status() - assert "queued" in status - assert "running" in status - assert "completed" in status - assert "resources" in status - - -# --------------------------------------------------------------------------- -# Tests for `WebhookHandler` push event parsing. -# --------------------------------------------------------------------------- - - -def test_webhook_parse_push(): - handler = agent.WebhookHandler.__new__(agent.WebhookHandler) - payload = {"ref": "refs/heads/feat/test", "after": "abc123def456"} - branch, sha = handler._parse_push(payload) - assert branch == "feat/test" - assert sha == "abc123def456" - - -def test_webhook_parse_pr(): - handler = agent.WebhookHandler.__new__(agent.WebhookHandler) - payload = { - "pull_request": { - "head": { - "ref": "feat/pr-branch", - "sha": "def789", - } - } - } - branch, sha = handler._parse_pull_request(payload) - assert branch == "feat/pr-branch" - assert sha == "def789" - - -# --------------------------------------------------------------------------- -# Integration-style webhook HTTP tests. 
-# --------------------------------------------------------------------------- - - -def _urlopen_no_proxy(url_or_req, **kwargs): - """`urlopen` mock that bypasses any `HTTP_PROXY`.""" - import urllib.request - - opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) - return opener.open(url_or_req, **kwargs) - - -def test_health_endpoint(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - try: - resp = _urlopen_no_proxy(f"http://127.0.0.1:{port}/health", timeout=5) - data = json.loads(resp.read()) - assert data["status"] == "ok" - assert data["platform"] == "nvidia" - finally: - server.server_close() - - -def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={"Content-Type": "application/json"}, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - assert len(data["job_ids"]) >= 1 - finally: - server.server_close() - - -def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): - 
monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - secret = "test-secret" - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - webhook_secret=secret, - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - payload = json.dumps( - { - "ref": "refs/heads/master", - "after": "abc123def456", - } - ).encode() - sig = "sha256=" + hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() - - req = urllib.request.Request( - f"http://127.0.0.1:{port}/webhook", - data=payload, - headers={ - "Content-Type": "application/json", - "X-GitHub-Event": "push", - "X-Hub-Signature-256": sig, - }, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - finally: - server.server_close() - - -def test_webhook_invalid_signature(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - webhook_secret="real-secret", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - payload = b'{"ref": "refs/heads/master", "after": "abc"}' - req = urllib.request.Request( - f"http://127.0.0.1:{port}/webhook", - data=payload, - headers={ - "Content-Type": "application/json", - "X-GitHub-Event": "push", - "X-Hub-Signature-256": "sha256=invalid", - }, - ) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req, timeout=5) - - assert exc_info.value.code == 401 - finally: - server.server_close() - - -# 
--------------------------------------------------------------------------- -# Tests for API token authentication. -# --------------------------------------------------------------------------- - - -def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): - """When `api_token` is set, `/api/run` rejects requests without a valid token.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - api_token="my-secret-token", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={"Content-Type": "application/json"}, - ) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req, timeout=5) - - assert exc_info.value.code == 401 - finally: - server.server_close() - - -def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): - """When `api_token` is set, `/api/run` accepts requests with a correct Bearer token.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - api_token="my-secret-token", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - body = json.dumps({"branch": 
"master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer my-secret-token", - }, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - finally: - server.server_close() - - -# --------------------------------------------------------------------------- -# Tests for queue backpressure. -# --------------------------------------------------------------------------- - - -def test_scheduler_rejects_when_queue_full(agent_config, monkeypatch): - """Scheduler raises QueueFullError when queue is at capacity.""" - pool = MagicMock(spec=res.ResourcePool) - pool.allocate.return_value = ([], False) # Never allocate → jobs stay queued. - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - pool, - no_status=True, - dry_run=False, - ) - - # Fill queue to capacity. - monkeypatch.setattr(agent, "MAX_QUEUE_SIZE", 3) - - for _ in range(3): - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - scheduler.submit(req) - - # Next submit should fail. - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - - with pytest.raises(agent.QueueFullError): - scheduler.submit(req) - - -# --------------------------------------------------------------------------- -# Tests for `poll_remote_job` error logging. 
-# --------------------------------------------------------------------------- - - -def test_poll_remote_job_logs_errors(monkeypatch, capsys): - """`poll_remote_job` warns on first failure instead of silently swallowing.""" - call_count = 0 - - def fake_urlopen(req, **kwargs): - nonlocal call_count - call_count += 1 - raise ConnectionError("connection refused") - - monkeypatch.setattr(agent, "urllib_urlopen", fake_urlopen) - monkeypatch.setattr(agent, "urllib_request", lambda url: url) - - result = agent.poll_remote_job( - "http://fake:8080", "job1", interval=0.01, timeout=0.05 - ) - assert result is None - - captured = capsys.readouterr() - assert "connection refused" in captured.err - assert "warning:" in captured.err - - -# --------------------------------------------------------------------------- -# Tests for `JobResult` `log_file` field. -# --------------------------------------------------------------------------- - - -def test_job_result_includes_log_file(): - r = agent.JobResult( - "id1", - "nvidia_gpu", - "abc", - 1, - Path("/tmp/res"), - 10.0, - error_tail=["error"], - log_file=Path("/tmp/res/job.log"), - ) - d = r.to_dict() - assert d["log_file"] == "/tmp/res/job.log" - - -def test_job_result_omits_log_file_when_none(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 5.0) - d = r.to_dict() - assert "log_file" not in d - - -# --------------------------------------------------------------------------- -# Tests for `/api/job/{id}/log` endpoint. -# --------------------------------------------------------------------------- - - -def test_job_log_endpoint(agent_config, mock_resource_pool, monkeypatch, tmp_path): - """`GET /api/job/{id}/log` returns the full log file content.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - - # Manually inject a completed job with a log file. 
- log_file = tmp_path / "job.log" - log_file.write_text("line 1\nline 2\nline 3\n") - - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - result = agent.JobResult( - req.job_id, - "nvidia_gpu", - "abc123", - 0, - tmp_path, - 1.0, - log_file=log_file, - ) - - with scheduler._lock: - scheduler._jobs[req.job_id] = { - "request": req, - "result": result, - "state": "success", - "gpu_ids": [], - } - - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - url = f"http://127.0.0.1:{port}/api/job/{req.job_id}/log" - req_http = urllib.request.Request(url) - - try: - resp = _urlopen_no_proxy(req_http, timeout=5) - body = resp.read().decode("utf-8") - assert "line 1" in body - assert "line 2" in body - assert "line 3" in body - assert resp.headers["Content-Type"] == "text/plain; charset=utf-8" - finally: - server.server_close() - - -def test_job_log_endpoint_not_found(agent_config, mock_resource_pool): - """`GET /api/job/{id}/log` returns 404 for unknown job.""" - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - url = f"http://127.0.0.1:{port}/api/job/nonexist/log" - req_http = urllib.request.Request(url) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req_http, timeout=5) - - assert exc_info.value.code == 404 - finally: - server.server_close() diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py deleted file mode 100644 index df25d606..00000000 --- a/.ci/tests/test_build.py +++ /dev/null @@ -1,207 +0,0 @@ -from 
unittest.mock import MagicMock - -import build - - -# --------------------------------------------------------------------------- -# Tests for `build_image_tag`. -# --------------------------------------------------------------------------- - - -def test_build_image_tag_with_registry(): - tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") - assert tag == "localhost:5000/infiniops/nvidia:latest" - - -def test_build_image_tag_without_registry(): - tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") - assert tag == "infiniops-ci/nvidia:abc1234" - - -def test_build_image_tag_commit_hash(): - tag = build.build_image_tag( - "registry.example.com:5000", "proj", "ascend", "deadbeef" - ) - assert tag == "registry.example.com:5000/proj/ascend:deadbeef" - - -# --------------------------------------------------------------------------- -# Tests for `has_dockerfile_changed`. -# --------------------------------------------------------------------------- - - -def test_has_dockerfile_changed_true_when_stdout_nonempty(monkeypatch): - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=0, stdout="Dockerfile\n"), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is True - - -def test_has_dockerfile_changed_false_when_stdout_empty(monkeypatch): - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=0, stdout=""), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is False - - -def test_has_dockerfile_changed_true_on_git_error(monkeypatch): - # Shallow clone or initial commit: `git diff` returns non-zero. - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=128, stdout=""), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is True - - -# --------------------------------------------------------------------------- -# Tests for `docker_login`. 
-# --------------------------------------------------------------------------- - - -def test_docker_login_no_credentials_env(monkeypatch): - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - result = build.docker_login({"url": "localhost:5000"}, dry_run=False) - assert result is True - assert not called - - -def test_docker_login_token_not_set(monkeypatch): - monkeypatch.delenv("REGISTRY_TOKEN", raising=False) - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=False) - assert result is False - assert not called - - -def test_docker_login_dry_run_does_not_call_subprocess(monkeypatch): - monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=True) - assert result is True - assert not called - - -def test_docker_login_success(monkeypatch): - monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") - captured = {} - - def mock_run(cmd, **kwargs): - captured["cmd"] = cmd - return MagicMock(returncode=0) - - monkeypatch.setattr("subprocess.run", mock_run) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=False) - assert result is True - assert "docker" in captured["cmd"] - assert "login" in captured["cmd"] - - -# --------------------------------------------------------------------------- -# Tests for `build_image` dry-run mode and proxy forwarding. 
-# --------------------------------------------------------------------------- - - -def _platform_cfg(): - return { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - } - - -def _registry_cfg(): - return {"url": "localhost:5000", "project": "infiniops"} - - -def test_build_image_dry_run_no_subprocess(monkeypatch, capsys): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=True, - logged_in=True, - ) - assert not called - captured = capsys.readouterr() - assert "[dry-run]" in captured.out - - -def test_build_image_dry_run_output_contains_image_tag(monkeypatch, capsys): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - monkeypatch.setattr("subprocess.run", lambda *a, **kw: MagicMock(returncode=0)) - build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=True, - logged_in=True, - ) - captured = capsys.readouterr() - assert "abc1234" in captured.out - - -def test_build_image_proxy_in_build_args(monkeypatch): - monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") - captured = {} - - def mock_run(cmd, **kwargs): - captured["cmd"] = cmd - return MagicMock(returncode=0) - - monkeypatch.setattr("subprocess.run", mock_run) - build.build_image( - "nvidia", - _platform_cfg(), - 
_registry_cfg(), - "abc1234", - push=False, - dry_run=False, - logged_in=True, - ) - joined = " ".join(captured["cmd"]) - assert "HTTP_PROXY=http://proxy.test:3128" in joined - assert "http_proxy=http://proxy.test:3128" in joined - - -def test_build_image_returns_false_on_docker_error(monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - monkeypatch.setattr("subprocess.run", lambda *a, **kw: MagicMock(returncode=1)) - result = build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=False, - logged_in=True, - ) - assert result is False diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py deleted file mode 100644 index 9e29c792..00000000 --- a/.ci/tests/test_github_status.py +++ /dev/null @@ -1,145 +0,0 @@ -import json -from unittest.mock import MagicMock - - -import github_status as gh - - -# --------------------------------------------------------------------------- -# Tests for `parse_repo_url`. 
-# --------------------------------------------------------------------------- - - -def test_parse_repo_url_https(): - owner, repo = gh.parse_repo_url("https://github.com/InfiniTensor/InfiniOps.git") - assert owner == "InfiniTensor" - assert repo == "InfiniOps" - - -def test_parse_repo_url_https_no_git(): - owner, repo = gh.parse_repo_url("https://github.com/Owner/Repo") - assert owner == "Owner" - assert repo == "Repo" - - -def test_parse_repo_url_ssh(): - owner, repo = gh.parse_repo_url("git@github.com:Owner/Repo.git") - assert owner == "Owner" - assert repo == "Repo" - - -def test_parse_repo_url_invalid(): - owner, repo = gh.parse_repo_url("not-a-url") - assert owner == "" - assert repo == "" - - -# --------------------------------------------------------------------------- -# Tests for `build_status_context`. -# --------------------------------------------------------------------------- - - -def test_build_status_context(): - ctx = gh.build_status_context("ci/infiniops", "nvidia_gpu") - assert ctx == "ci/infiniops/nvidia_gpu" - - -# --------------------------------------------------------------------------- -# Tests for `post_commit_status`. 
-# --------------------------------------------------------------------------- - - -def test_post_status_no_token(monkeypatch): - monkeypatch.delenv("GITHUB_TOKEN", raising=False) - result = gh.post_commit_status("owner", "repo", "abc123", "success", "ctx", "desc") - assert result is False - - -def test_post_status_missing_owner(): - result = gh.post_commit_status( - "", "repo", "abc123", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_success(monkeypatch): - mock_response = MagicMock() - mock_response.status = 201 - mock_response.__enter__ = MagicMock(return_value=mock_response) - mock_response.__exit__ = MagicMock(return_value=False) - - captured_req = {} - - def mock_urlopen(req, **kwargs): - captured_req["url"] = req.full_url - captured_req["data"] = json.loads(req.data) - captured_req["headers"] = dict(req.headers) - return mock_response - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "InfiniTensor", - "InfiniOps", - "abc123def", - "success", - "ci/infiniops/nvidia_gpu", - "Tests passed", - token="ghp_test_token", - ) - - assert result is True - assert "abc123def" in captured_req["url"] - assert captured_req["data"]["state"] == "success" - assert captured_req["data"]["context"] == "ci/infiniops/nvidia_gpu" - assert "ghp_test_token" in captured_req["headers"]["Authorization"] - - -def test_post_status_http_error(monkeypatch): - import urllib.error - - def mock_urlopen(req, **kwargs): - raise urllib.error.HTTPError( - url="", code=422, msg="Unprocessable", hdrs=None, fp=None - ) - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "owner", "repo", "sha", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_url_error(monkeypatch): - import urllib.error - - def mock_urlopen(req, **kwargs): - raise urllib.error.URLError("connection refused") - - 
monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "owner", "repo", "sha", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_truncates_description(monkeypatch): - mock_response = MagicMock() - mock_response.status = 201 - mock_response.__enter__ = MagicMock(return_value=mock_response) - mock_response.__exit__ = MagicMock(return_value=False) - - captured = {} - - def mock_urlopen(req, **kwargs): - captured["data"] = json.loads(req.data) - return mock_response - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - long_desc = "x" * 200 - gh.post_commit_status("o", "r", "sha", "success", "ctx", long_desc, token="tok") - - assert len(captured["data"]["description"]) == 140 diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py deleted file mode 100644 index a7ba8f87..00000000 --- a/.ci/tests/test_resource.py +++ /dev/null @@ -1,434 +0,0 @@ -import threading - - -import ci_resource as res - - -# --------------------------------------------------------------------------- -# Tests for `GpuInfo` and `SystemResources`. -# --------------------------------------------------------------------------- - - -def test_gpu_info_fields(): - g = res.GpuInfo( - index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50 - ) - assert g.index == 0 - assert g.memory_total_mb == 8000 - - -def test_system_resources_fields(): - s = res.SystemResources( - total_memory_mb=32000, available_memory_mb=16000, cpu_count=8 - ) - assert s.cpu_count == 8 - - -# --------------------------------------------------------------------------- -# Tests for `detect_gpus`. 
-# --------------------------------------------------------------------------- - - -def test_detect_gpus_nvidia_parses_csv(monkeypatch): - csv_output = "0, 512, 8192, 5\n1, 1024, 8192, 80\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - gpus = pool.detect_gpus() - assert len(gpus) == 2 - assert gpus[0].index == 0 - assert gpus[0].memory_used_mb == 512 - assert gpus[0].utilization_pct == 5 - assert gpus[1].index == 1 - assert gpus[1].utilization_pct == 80 - - -def test_detect_gpus_empty_on_failure(monkeypatch): - def mock_run(cmd, **kwargs): - class R: - returncode = 1 - stdout = "" - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - assert pool.detect_gpus() == [] - - -def test_detect_gpus_unknown_platform(): - pool = res.ResourcePool("unknown_platform") - assert pool.detect_gpus() == [] - - -def test_detect_gpus_file_not_found(monkeypatch): - def mock_run(cmd, **kwargs): - raise FileNotFoundError("nvidia-smi not found") - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - assert pool.detect_gpus() == [] - - -# --------------------------------------------------------------------------- -# Tests for `detect_system_resources`. 
-# --------------------------------------------------------------------------- - - -def test_detect_system_resources(monkeypatch, tmp_path): - meminfo = tmp_path / "meminfo" - meminfo.write_text( - "MemTotal: 32000000 kB\n" - "MemFree: 10000000 kB\n" - "MemAvailable: 20000000 kB\n" - ) - - _real_open = open - - def fake_open(path, **kw): - if str(path) == "/proc/meminfo": - return _real_open(str(meminfo), **kw) - return _real_open(path, **kw) - - monkeypatch.setattr("builtins.open", fake_open) - - pool = res.ResourcePool("nvidia") - sys_res = pool.detect_system_resources() - assert abs(sys_res.total_memory_mb - 32000000 / 1024) < 1 - assert abs(sys_res.available_memory_mb - 20000000 / 1024) < 1 - assert sys_res.cpu_count > 0 - - -# --------------------------------------------------------------------------- -# Tests for `allocate` picking least-loaded GPUs. -# --------------------------------------------------------------------------- - - -def test_allocate_picks_least_loaded(monkeypatch): - csv_output = "0, 100, 8192, 8\n1, 200, 8192, 2\n2, 300, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(1) - assert ok is True - assert gpu_ids == [1] # GPU 1 has lowest utilization (2%). - - -def test_allocate_picks_two_least_loaded(monkeypatch): - csv_output = "0, 100, 8192, 8\n1, 200, 8192, 2\n2, 300, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is True - assert gpu_ids == [1, 2] # Sorted by utilization: 2% then 5%. 
- - -def test_allocate_skips_busy_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 4000, 8192, 95\n2, 200, 8192, 8\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is True - assert set(gpu_ids) == {0, 2} - assert 1 not in gpu_ids # GPU 1 at 95% is above threshold - - -# --------------------------------------------------------------------------- -# Tests for `allocate` and `release`. -# --------------------------------------------------------------------------- - - -def test_allocate_success(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(1) - assert ok is True - assert len(gpu_ids) == 1 - assert gpu_ids[0] in (0, 1) - - -def test_allocate_insufficient_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(3) - assert ok is False - assert gpu_ids == [] - - -def test_allocate_zero_gpus(): - pool = res.ResourcePool("unknown") - gpu_ids, ok = pool.allocate(0) - assert ok is True - assert gpu_ids == [] - - -def test_release_frees_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is 
True - assert len(gpu_ids) == 2 - - # All GPUs allocated; next allocation should fail. - _, ok2 = pool.allocate(1) - assert ok2 is False - - # Release one GPU. - pool.release([gpu_ids[0]]) - gpu_ids2, ok3 = pool.allocate(1) - assert ok3 is True - assert gpu_ids2 == [gpu_ids[0]] - - -def test_allocate_excludes_allocated(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids1, _ = pool.allocate(1) - gpu_ids2, _ = pool.allocate(1) - - assert gpu_ids1 != gpu_ids2 - assert set(gpu_ids1 + gpu_ids2) == {0, 1} - - -def test_thread_safety(monkeypatch): - csv_output = "0, 0, 8192, 0\n1, 0, 8192, 0\n2, 0, 8192, 0\n3, 0, 8192, 0\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=50) - allocated_all = [] - lock = threading.Lock() - - def allocate_one(): - ids, ok = pool.allocate(1) - - if ok: - with lock: - allocated_all.extend(ids) - - threads = [threading.Thread(target=allocate_one) for _ in range(4)] - - for t in threads: - t.start() - - for t in threads: - t.join() - - assert len(allocated_all) == 4 - assert len(set(allocated_all)) == 4 - - -# --------------------------------------------------------------------------- -# Tests for `get_status`. 
-# --------------------------------------------------------------------------- - - -def test_get_status(monkeypatch): - csv_output = "0, 512, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - status = pool.get_status() - assert status["platform"] == "nvidia" - assert len(status["gpus"]) == 1 - assert "system" in status - - -# --------------------------------------------------------------------------- -# Tests for `parse_gpu_requirement` and `parse_memory_requirement`. -# --------------------------------------------------------------------------- - - -def test_parse_gpu_requirement_auto_default(): - """`gpu_ids` omitted (defaults to `auto`) with `ngpus=1`.""" - job = {"resources": {"ngpus": 1}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_auto_explicit(): - """`gpu_ids=auto` with `ngpus=2`.""" - job = {"resources": {"gpu_ids": "auto", "ngpus": 2}} - assert res.parse_gpu_requirement(job) == 2 - - -def test_parse_gpu_requirement_auto_no_ngpus(): - """`gpu_ids=auto` without `ngpus` defaults to 1.""" - job = {"resources": {"gpu_ids": "auto"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_auto_implicit_no_ngpus(): - """No `gpu_ids` and no `ngpus` defaults to 1.""" - job = {"resources": {}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_static_pinning(): - """Static `gpu_ids` counts explicit device IDs.""" - job = {"resources": {"gpu_ids": "0,1"}} - assert res.parse_gpu_requirement(job) == 2 - - -def test_parse_gpu_requirement_static_single(): - job = {"resources": {"gpu_ids": "0"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_all(): - job = {"resources": {"gpu_ids": "all"}} - assert res.parse_gpu_requirement(job) == 0 - - -def test_parse_gpu_requirement_ngpus_mismatch_warns(capsys): - 
"""Warn when static `gpu_ids` count differs from `ngpus`.""" - job = {"resources": {"gpu_ids": "0,1", "ngpus": 3}} - assert res.parse_gpu_requirement(job) == 2 - - captured = capsys.readouterr() - assert "warning:" in captured.err - assert "ngpus=3" in captured.err - - -def test_parse_gpu_requirement_ignores_unknown_keys(): - """Unknown keys in resources do not affect GPU counting.""" - job = {"resources": {"gpu_ids": "0", "extra_key": "value"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_detect_gpus_ascend_hbm_parsing(monkeypatch): - """`npu-smi` row 2 has DDR (0/0) and HBM (2789/32768); we want HBM.""" - npu_output = ( - "+---------------------------+---------------+-------------------------------+\n" - "| 0 910B4 | OK | 86.5 41 |\n" - "| 0 | 0000:c1:00.0 | 5 0 / 0 2789 / 32768 |\n" - "+---------------------------+---------------+-------------------------------+\n" - ) - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = npu_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("ascend") - gpus = pool.detect_gpus() - assert len(gpus) == 1 - assert gpus[0].index == 0 - assert gpus[0].utilization_pct == 5.0 - assert gpus[0].memory_used_mb == 2789.0 - assert gpus[0].memory_total_mb == 32768.0 - - -def test_parse_memory_requirement_gb(): - assert res.parse_memory_requirement({"resources": {"memory": "32GB"}}) == 32 * 1024 - - -def test_parse_memory_requirement_mb(): - assert res.parse_memory_requirement({"resources": {"memory": "512MB"}}) == 512 - - -def test_parse_memory_requirement_empty(): - assert res.parse_memory_requirement({"resources": {}}) == 0 - - -def test_parse_memory_requirement_invalid_warns(capsys): - result = res.parse_memory_requirement({"resources": {"memory": "abc xyz"}}) - assert result == 0 - - captured = capsys.readouterr() - assert "warning:" in captured.err - assert "abc xyz" in captured.err diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py deleted file 
mode 100644 index 844d941d..00000000 --- a/.ci/tests/test_run.py +++ /dev/null @@ -1,450 +0,0 @@ -from pathlib import Path - -import pytest - -import run - - -# --------------------------------------------------------------------------- -# Tests for `resolve_image`. -# --------------------------------------------------------------------------- - - -def test_resolve_image_with_registry(): - cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} - img = run.resolve_image(cfg, "nvidia", "latest") - assert img == "localhost:5000/infiniops/nvidia:latest" - - -def test_resolve_image_without_registry(minimal_config): - img = run.resolve_image(minimal_config, "nvidia", "abc1234") - assert img == "infiniops-ci/nvidia:abc1234" - - -# --------------------------------------------------------------------------- -# Tests for `build_runner_script`. -# --------------------------------------------------------------------------- - - -def test_runner_script_contains_git_clone(): - script = run.build_runner_script() - assert "git clone" in script - - -def test_runner_script_contains_setup_cmd(): - script = run.build_runner_script() - assert "SETUP_CMD" in script - - -def test_runner_script_exits_on_failure(): - script = run.build_runner_script() - assert "exit $rc" in script - - -def test_runner_script_creates_results_dir(): - script = run.build_runner_script() - assert "mkdir -p /workspace/results" in script - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` basic structure. 
-# --------------------------------------------------------------------------- - - -def test_docker_args_basic_structure(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert args[0] == "docker" - assert "run" in args - assert "--rm" in args - - -def test_docker_args_correct_image(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "infiniops-ci/nvidia:latest" in args - - -def test_docker_args_image_tag_override(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - "abc1234", - ) - assert "infiniops-ci/nvidia:abc1234" in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` proxy passthrough. 
-# --------------------------------------------------------------------------- - - -def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): - monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "-e" in args - assert "HTTP_PROXY=http://proxy.example.com:8080" in args - assert "http_proxy=http://proxy.example.com:8080" in args - - -def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - - for arg in args: - assert not arg.startswith("HTTP_PROXY=") - assert not arg.startswith("http_proxy=") - assert not arg.startswith("HTTPS_PROXY=") - assert not arg.startswith("https_proxy=") - assert not arg.startswith("NO_PROXY=") - assert not arg.startswith("no_proxy=") - - -def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "HTTP_PROXY=http://lowercase.proxy:3128" in args - assert "http_proxy=http://lowercase.proxy:3128" in args - - -# 
--------------------------------------------------------------------------- -# Tests for `build_docker_args` GPU flags. -# --------------------------------------------------------------------------- - - -def _make_args(config, gpu_id_override=None): - return run.build_docker_args( - config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - gpu_id_override=gpu_id_override, - ) - - -def test_docker_args_gpu_auto_no_override(minimal_config): - """`gpu_ids=auto` (default) without override produces no `--gpus` flag.""" - args = _make_args(minimal_config) - assert "--gpus" not in args - - -def test_docker_args_gpu_auto_with_override(minimal_config): - """`gpu_ids=auto` with allocator override sets `--gpus device=...`.""" - args = _make_args(minimal_config, gpu_id_override="2") - idx = args.index("--gpus") - assert args[idx + 1] == "device=2" - - -def test_docker_args_gpu_static(minimal_config): - """Static `gpu_ids` pins to specific devices.""" - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "0" - args = _make_args(minimal_config) - idx = args.index("--gpus") - assert args[idx + 1] == "device=0" - - -def test_docker_args_gpu_all(minimal_config): - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" - args = _make_args(minimal_config) - idx = args.index("--gpus") - assert args[idx + 1] == "all" - - -def test_docker_args_gpu_override_trumps_static(minimal_config): - """CLI `gpu_id_override` takes precedence over static `gpu_ids`.""" - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "0" - args = _make_args(minimal_config, gpu_id_override="2,3") - idx = args.index("--gpus") - assert args[idx + 1] == "device=2,3" - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` platform-specific device env vars. 
-# --------------------------------------------------------------------------- - - -def _make_platform_config(platform, job_suffix="gpu"): - """Build a minimal normalized config for a given platform.""" - from utils import normalize_config - - raw = { - "platforms": { - platform: { - "image": {"dockerfile": f".ci/images/{platform}/"}, - "setup": "pip install .[dev]", - "jobs": { - job_suffix: { - "resources": {"ngpus": 1, "memory": "32GB"}, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - } - }, - } - } - } - - return normalize_config(raw) - - -def _make_platform_args(platform, job_suffix="gpu", gpu_id_override=None): - config = _make_platform_config(platform, job_suffix) - job_name = f"{platform}_{job_suffix}" - - return run.build_docker_args( - config, - job_name, - "https://github.com/example/repo.git", - "master", - config["jobs"][job_name]["stages"], - "/workspace", - None, - gpu_id_override=gpu_id_override, - ) - - -def test_docker_args_moore_mthreads_visible_devices(): - """Moore uses `MTHREADS_VISIBLE_DEVICES`, not `CUDA_VISIBLE_DEVICES`.""" - args = _make_platform_args("moore", gpu_id_override="0") - assert "MTHREADS_VISIBLE_DEVICES=0" in args - assert all("CUDA_VISIBLE_DEVICES" not in a for a in args) - - -def test_docker_args_iluvatar_cuda_visible_devices(): - args = _make_platform_args("iluvatar", gpu_id_override="1,2") - assert "CUDA_VISIBLE_DEVICES=1,2" in args - - -def test_docker_args_cambricon_mlu_visible_devices(): - args = _make_platform_args("cambricon", gpu_id_override="0") - assert "MLU_VISIBLE_DEVICES=0" in args - - -def test_docker_args_ascend_visible_devices(): - args = _make_platform_args("ascend", job_suffix="npu", gpu_id_override="0") - assert "ASCEND_VISIBLE_DEVICES=0" in args - - -def test_docker_args_metax_cuda_visible_devices(): - args = _make_platform_args("metax", gpu_id_override="0,1") - assert "CUDA_VISIBLE_DEVICES=0,1" in args - - -def test_docker_args_non_nvidia_no_gpus_flag(): - """Non-NVIDIA platforms should never 
use `--gpus` Docker flag.""" - for platform in ("iluvatar", "metax", "moore", "cambricon"): - args = _make_platform_args(platform, gpu_id_override="0") - assert "--gpus" not in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` memory format. -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "raw,expected", - [ - ("32GB", "32g"), - ("512MB", "512m"), - ("8", "8g"), - ("16gb", "16g"), - ("256mb", "256m"), - ], -) -def test_docker_args_memory_format(minimal_config, raw, expected): - minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw - args = _make_args(minimal_config) - idx = args.index("--memory") - assert args[idx + 1] == expected - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` stages encoding. -# --------------------------------------------------------------------------- - - -def test_docker_args_num_stages(minimal_config): - args = _make_args(minimal_config) - assert "NUM_STAGES=1" in args - - -def test_docker_args_stage_name_cmd(minimal_config): - args = _make_args(minimal_config) - assert "STAGE_1_NAME=test" in args - assert any(a.startswith("STAGE_1_CMD=") for a in args) - - -def test_docker_args_multiple_stages(minimal_config): - minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ - {"name": "lint", "run": "ruff check ."}, - {"name": "test", "run": "pytest tests/"}, - ] - args = _make_args(minimal_config) - assert "NUM_STAGES=2" in args - assert "STAGE_1_NAME=lint" in args - assert "STAGE_2_NAME=test" in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` `results_dir` mount. 
-# --------------------------------------------------------------------------- - - -def test_docker_args_results_dir(minimal_config, tmp_path): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - results_dir=tmp_path, - ) - joined = " ".join(str(a) for a in args) - assert "-v" in args - assert "/workspace/results" in joined - - -# --------------------------------------------------------------------------- -# Tests for `build_results_dir`. -# --------------------------------------------------------------------------- - - -def test_build_results_dir_contains_platform(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "nvidia" in d.name - - -def test_build_results_dir_contains_commit(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "abc1234" in d.name - - -def test_build_results_dir_contains_stage_names(): - stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "lint+test" in d.name - - -def test_build_results_dir_under_base(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678") - assert d.parent == Path("/tmp/my-results") - - -# --------------------------------------------------------------------------- -# Tests for `apply_test_override`. 
-# --------------------------------------------------------------------------- - - -def test_apply_test_override_replaces_test_path(): - result = run.apply_test_override("pytest tests/ -v", "tests/test_add.py") - assert result == "pytest tests/test_add.py -v" - - -def test_apply_test_override_preserves_flags(): - result = run.apply_test_override( - "pytest tests/ -n 4 -v --tb=short", "tests/test_gemm.py" - ) - assert "tests/test_gemm.py" in result - assert "-n 4" in result - assert "-v" in result - assert "--tb=short" in result - assert "tests/" not in result.split("tests/test_gemm.py")[0] - - -def test_apply_test_override_non_pytest_passthrough(): - """Non-pytest commands are returned unchanged.""" - assert run.apply_test_override("ruff check .", "tests/foo.py") == "ruff check ." - - -def test_apply_test_override_empty_passthrough(): - assert run.apply_test_override("", "tests/foo.py") == "" - - -# --------------------------------------------------------------------------- -# Tests for runner script fail-fast behavior. -# --------------------------------------------------------------------------- - - -def test_runner_script_breaks_on_failure(): - script = run.build_runner_script() - assert "break" in script - - -def test_runner_script_preserves_exit_code(): - script = run.build_runner_script() - assert "rc=$?" in script - - -# --------------------------------------------------------------------------- -# Tests for `build_results_dir` uniqueness and sanitization. 
-# --------------------------------------------------------------------------- - - -def test_build_results_dir_unique(): - stages = [{"name": "test", "run": "pytest"}] - d1 = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - d2 = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert d1 != d2 - - -def test_build_results_dir_sanitizes_commit(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "../../etc/passwd") - # Path separators are stripped; the result stays under the base directory. - assert "/" not in d.name - assert d.parent == Path("ci-results") diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py deleted file mode 100644 index b8fa6d60..00000000 --- a/.ci/tests/test_utils.py +++ /dev/null @@ -1,108 +0,0 @@ -from utils import get_git_commit, normalize_config - - -def test_normalize_creates_flat_jobs(): - raw = { - "repo": {"url": "https://github.com/org/repo.git"}, - "platforms": { - "nvidia": { - "image": {"dockerfile": ".ci/images/nvidia/"}, - "setup": "pip install .", - "docker_args": ["--gpus", "all"], - "jobs": { - "gpu": { - "resources": {"gpu_ids": "0"}, - "stages": [{"name": "test", "run": "pytest"}], - }, - "multi_gpu": { - "resources": {"gpu_ids": "0,1"}, - "stages": [{"name": "test", "run": "pytest"}], - }, - }, - }, - }, - } - config = normalize_config(raw) - - assert "nvidia_gpu" in config["jobs"] - assert "nvidia_multi_gpu" in config["jobs"] - assert config["jobs"]["nvidia_gpu"]["platform"] == "nvidia" - assert config["jobs"]["nvidia_gpu"]["setup"] == "pip install ." 
- assert config["jobs"]["nvidia_gpu"]["docker_args"] == ["--gpus", "all"] - assert config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] == "0" - assert config["jobs"]["nvidia_multi_gpu"]["resources"]["gpu_ids"] == "0,1" - - -def test_normalize_extracts_images(): - raw = { - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "pytorch:latest"}, - }, - "jobs": {}, - }, - }, - } - config = normalize_config(raw) - assert config["images"]["nvidia"]["dockerfile"] == ".ci/images/nvidia/" - assert config["images"]["nvidia"]["build_args"]["BASE_IMAGE"] == "pytorch:latest" - - -def test_normalize_job_overrides_platform_defaults(): - raw = { - "platforms": { - "nvidia": { - "setup": "default setup", - "jobs": { - "special": { - "setup": "custom setup", - "stages": [], - }, - }, - }, - }, - } - config = normalize_config(raw) - assert config["jobs"]["nvidia_special"]["setup"] == "custom setup" - - -def test_normalize_preserves_top_level_keys(): - raw = { - "repo": {"url": "https://github.com/org/repo.git"}, - "github": {"status_context_prefix": "ci/test"}, - "agents": {"nvidia": {"url": "http://host:8080"}}, - "platforms": {}, - } - config = normalize_config(raw) - assert config["repo"]["url"] == "https://github.com/org/repo.git" - assert config["github"]["status_context_prefix"] == "ci/test" - assert config["agents"]["nvidia"]["url"] == "http://host:8080" - - -def test_normalize_passthrough_flat_config(): - """Old flat format without `platforms` key is returned as-is.""" - flat = { - "images": {"nvidia": {}}, - "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, - } - assert normalize_config(flat) is flat - - -# --------------------------------------------------------------------------- -# Tests for `get_git_commit`. 
-# --------------------------------------------------------------------------- - - -def test_get_git_commit_warns_on_failure(monkeypatch, capsys): - from unittest.mock import MagicMock - - monkeypatch.setattr( - "subprocess.run", lambda *a, **kw: MagicMock(returncode=128, stdout="") - ) - result = get_git_commit() - assert result == "unknown" - - captured = capsys.readouterr() - assert "warning:" in captured.err diff --git a/.ci/utils.py b/.ci/utils.py deleted file mode 100644 index 2a3d36fb..00000000 --- a/.ci/utils.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 -"""Shared utilities for the CI toolchain.""" - -import subprocess -import sys - -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def normalize_config(raw): - """Convert platform-centric config to flat images/jobs format. - - Input (new format): - platforms: - nvidia: - image: {dockerfile: ..., build_args: ...} - setup: pip install .[dev] - jobs: - gpu: {resources: ..., stages: ...} - - Output (flat format consumed by run.py / build.py / agent.py): - images: - nvidia: {dockerfile: ..., build_args: ...} - jobs: - nvidia_gpu: {platform: nvidia, setup: ..., resources: ..., stages: ...} - - If the config already uses the flat format (no 'platforms' key), returns as-is. 
- """ - if "platforms" not in raw: - return raw - - config = {} - - for key in ("repo", "github", "agents"): - if key in raw: - config[key] = raw[key] - - config["images"] = {} - config["jobs"] = {} - - for platform, pcfg in raw.get("platforms", {}).items(): - # Image config - if "image" in pcfg: - config["images"][platform] = pcfg["image"] - - # Platform-level defaults inherited by jobs - defaults = {} - - for key in ("image_tag", "docker_args", "volumes", "setup", "env"): - if key in pcfg: - defaults[key] = pcfg[key] - - # Flatten jobs: {platform}_{job_name} - for job_name, job_cfg in pcfg.get("jobs", {}).items(): - full_name = f"{platform}_{job_name}" - flat = { - "platform": platform, - "image": defaults.get("image_tag", "latest"), - } - - # Apply platform defaults - for key in ("docker_args", "volumes", "setup", "env"): - if key in defaults: - flat[key] = defaults[key] - - # Job-level overrides - flat.update(job_cfg) - - config["jobs"][full_name] = flat - - # Warn on mismatched agent/platform keys (catches typos like 'nvdia'). - agent_keys = set(config.get("agents", {}).keys()) - platform_keys = set(raw.get("platforms", {}).keys()) - - for key in agent_keys - platform_keys: - print( - f"warning: agents.{key} has no matching platform in platforms.*", - file=sys.stderr, - ) - - return config - - -def load_config(path): - """Load a YAML config file and normalize to flat format.""" - with open(path, encoding="utf-8") as f: - raw = yaml.safe_load(f) - - return normalize_config(raw) - - -def get_git_commit(ref="HEAD", short=True): - """Get git commit SHA. 
Returns 'unknown' on failure.""" - cmd = ["git", "rev-parse"] - - if short: - cmd.append("--short") - - cmd.append(ref) - result = subprocess.run(cmd, capture_output=True, text=True) - - if result.returncode != 0: - print( - f"warning: git rev-parse failed for {ref!r}, using 'unknown'", - file=sys.stderr, - ) - - return "unknown" - - return result.stdout.strip() diff --git a/.ci/config.yaml b/.github/ci_config.yml similarity index 69% rename from .ci/config.yaml rename to .github/ci_config.yml index ea6a0d48..3f76c16b 100644 --- a/.ci/config.yaml +++ b/.github/ci_config.yml @@ -5,45 +5,29 @@ repo: github: status_context_prefix: "ci/infiniops" -# Uncomment and replace the URLs below with actual host IPs to dispatch jobs to remote -# machines via `agent.py run`. Required on the trigger machine when each platform's -# agent runs on a separate host. See the README for multi-machine deployment details. -# agents: -# nvidia: -# url: http://nvidia-host:8080 -# iluvatar: -# url: http://iluvatar-host:8080 -# metax: -# url: http://metax-host:8080 -# moore: -# url: http://moore-host:8080 -# cambricon: -# url: http://cambricon-host:8080 - platforms: nvidia: image: - dockerfile: .ci/images/nvidia/ + dockerfile: images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: - ngpus: 1 # Scheduler auto-picks this many free GPUs. - gpu_ids: auto # `auto`: dynamic allocation; or pin with `"0"`, `"0,2"`, `"all"`. + ngpus: 1 # Auto allocator picks this many free GPUs memory: 32GB - shm_size: 16g # Prevent PyTorch default 64MB shared memory limit. + shm_size: 16g # Prevent PyTorch default 64MB shared memory limit timeout: 3600 # env: # Uncomment to inject extra env vars into the container. 
# MY_VAR: value stages: - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - + run: pytest tests/ --devices nvidia -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml iluvatar: image: - dockerfile: .ci/images/iluvatar/ + dockerfile: images/iluvatar/ build_args: BASE_IMAGE: corex:qs_pj20250825 APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -61,19 +45,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none memory: 32GB shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices iluvatar -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml metax: image: - dockerfile: .ci/images/metax/ + dockerfile: images/metax/ build_args: BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -85,19 +70,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none # MetaX: passthrough via --privileged, CUDA_VISIBLE_DEVICES controls visibility memory: 32GB shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices metax -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml moore: image: - dockerfile: .ci/images/moore/ + dockerfile: images/moore/ build_args: BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -107,19 +93,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES controls visibility memory: 32GB 
shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py --devices moore -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml cambricon: image: - dockerfile: .ci/images/cambricon/ + dockerfile: images/cambricon/ build_args: BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310 PIP_INDEX_URL: https://pypi.org/simple @@ -128,19 +115,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control memory: 32GB shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/test_gemm.py --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml - ascend: # TODO: Ascend image is not ready yet. 
+ ascend: image: - dockerfile: .ci/images/ascend/ + dockerfile: images/ascend/ build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple @@ -160,9 +148,10 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: npu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none memory: 32GB shm_size: 16g timeout: 3600 diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml new file mode 100644 index 00000000..651cc689 --- /dev/null +++ b/.github/workflows/ci_test.yml @@ -0,0 +1,15 @@ +name: CI + +on: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +jobs: + ci: + uses: InfiniTensor/ci/.github/workflows/infiniops-ci.yml@codex/prune-unused-ci-artifacts + with: + config_path: .github/ci_config.yml + ci_ref: codex/prune-unused-ci-artifacts + secrets: inherit diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..be99e8a8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule ".ci"] + path = .ci + url = https://github.com/InfiniTensor/ci.git diff --git a/tests/conftest.py b/tests/conftest.py index d995459f..564047a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -92,7 +92,8 @@ def skip_unsupported_dtypes(request): # PyTorch device type → InfiniOps platform names. A single torch device type # can map to several platforms (e.g., `cuda` is shared by `nvidia`, `metax`, -# and `iluvatar`); at most one is actually available at runtime. +# and `iluvatar`). CI passes concrete platforms through `--devices`; without +# that explicit platform, tests keep the historical broad mapping. 
_TORCH_DEVICE_TO_PLATFORMS = { "cuda": ("nvidia", "metax", "iluvatar"), "mlu": ("cambricon",), @@ -119,7 +120,7 @@ def skip_op_without_platform_impl(request): if "implementation_index" in params: return - platforms = _TORCH_DEVICE_TO_PLATFORMS.get(params.get("device")) + platforms = _active_platforms_for_torch_device(request.config, params.get("device")) if not platforms: return @@ -151,6 +152,24 @@ def _set_random_seed(seed): } +def _active_platforms_for_torch_device(config, torch_device): + """Return platform names selected for a torch device type.""" + if not torch_device: + return () + + cli_devices = config.getoption("--devices") or () + requested_platforms = tuple( + name + for name in cli_devices + if _PLATFORM_TO_TORCH_DEVICE.get(name) == torch_device + ) + + if requested_platforms: + return requested_platforms + + return _TORCH_DEVICE_TO_PLATFORMS.get(torch_device, ()) + + def _resolve_device(name): """Map a platform name (e.g., `ascend`) to a PyTorch device type (e.g., `npu`).""" return _PLATFORM_TO_TORCH_DEVICE.get(name, name)