diff --git a/.ci b/.ci new file mode 160000 index 00000000..c602c96a --- /dev/null +++ b/.ci @@ -0,0 +1 @@ +Subproject commit c602c96a0370cb3288f1cd0cbcdd0816dfb0621e diff --git a/.ci/README.md b/.ci/README.md deleted file mode 100644 index bfc28ea1..00000000 --- a/.ci/README.md +++ /dev/null @@ -1,388 +0,0 @@ -# .ci — CI Images and Pipeline - -``` -.ci/ -├── config.yaml # Unified config (images, jobs, agent definitions) -├── utils.py # Shared utilities (load_config, normalize_config, get_git_commit) -├── agent.py # Runner Agent (scheduler, webhooks, remote dispatch) -├── build.py # Image builder -├── run.py # CI pipeline runner (Docker layer) -├── ci_resource.py # GPU/memory detection and allocation -├── github_status.py # GitHub Commit Status reporting -├── images/ -│ ├── nvidia/Dockerfile -│ ├── iluvatar/Dockerfile -│ ├── metax/Dockerfile -│ ├── moore/Dockerfile -│ ├── cambricon/Dockerfile -│ └── ascend/Dockerfile -└── tests/ # Unit tests - ├── conftest.py - ├── test_agent.py - ├── test_build.py - ├── test_run.py - ├── test_resource.py - ├── test_github_status.py - └── test_utils.py -``` - -**Prerequisites**: Docker, Python 3.10+, `pip install pyyaml` - ---- - -## Configuration `config.yaml` - -Config uses a **platform-centric** top-level structure. Each platform defines its image, platform-level defaults, and job list. -At load time, jobs are flattened to `{platform}_{job}` format (e.g., `nvidia_gpu`). - -```yaml -repo: - url: https://github.com/InfiniTensor/InfiniOps.git - branch: master - -github: - status_context_prefix: "ci/infiniops" - -agents: # Remote agent URLs (used by CLI for cross-machine dispatch) - nvidia: - url: http://nvidia-host:8080 - iluvatar: - url: http://iluvatar-host:8080 - -platforms: - nvidia: - image: # Image definition - dockerfile: .ci/images/nvidia/ - build_args: - BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 - setup: pip install .[dev] --no-build-isolation - jobs: - gpu: # Flattened as `nvidia_gpu`. 
- resources: - ngpus: 1 # Scheduler auto-picks this many free GPUs. - memory: 32GB - shm_size: 16g - timeout: 3600 - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - - iluvatar: - image: - dockerfile: .ci/images/iluvatar/ - build_args: - BASE_IMAGE: corex:qs_pj20250825 - APT_MIRROR: http://archive.ubuntu.com/ubuntu - PIP_INDEX_URL: https://pypi.org/simple - docker_args: # Platform-level docker args, inherited by all jobs - - "--privileged" - - "--cap-add=ALL" - - "--pid=host" - - "--ipc=host" - volumes: - - /dev:/dev - - /lib/firmware:/lib/firmware - - /usr/src:/usr/src - - /lib/modules:/lib/modules - setup: pip install .[dev] --no-build-isolation - jobs: - gpu: # Flattened as `iluvatar_gpu`. - resources: - ngpus: 1 - gpu_ids: auto - memory: 32GB - shm_size: 16g - timeout: 3600 - stages: - - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml -``` - -### Config hierarchy - -| Level | Field | Description | -|---|---|---| -| **Platform** | `image` | Image definition (dockerfile, build_args) | -| | `image_tag` | Default image tag (defaults to `latest`) | -| | `docker_args` | Extra `docker run` args (e.g., `--privileged`) | -| | `volumes` | Extra volume mounts | -| | `setup` | In-container setup command | -| | `env` | Injected container env vars | -| **Job** | `resources.ngpus` | Number of GPUs to allocate (default: 1). Used with `gpu_ids: auto` for dynamic allocation | -| | `resources.gpu_ids` | `auto`: scheduler picks `ngpus` least-loaded GPUs. Static: pin to specific IDs (e.g., `"0"`, `"0,2"`). 
`all`: use all GPUs | -| | `resources.memory` | Container memory limit | -| | `resources.shm_size` | Shared memory size | -| | `resources.timeout` | Max run time in seconds | -| | `stages` | Execution stage list | -| | Any platform field | Jobs can override any platform-level default | - ---- - -## Image builder `build.py` - -| Flag | Description | -|---|---| -| `--platform nvidia\|iluvatar\|metax\|moore\|ascend\|all` | Target platform (default: `all`) | -| `--commit` | Use specific commit ref as image tag (default: HEAD) | -| `--force` | Skip Dockerfile change detection | -| `--dry-run` | Print commands without executing | - -```bash -# Build with change detection (skips if no Dockerfile changes) -python .ci/build.py --platform nvidia - -# Build Iluvatar image -python .ci/build.py --platform iluvatar --force - -# Force build all platforms -python .ci/build.py --force -``` - -Build artifacts are stored as local Docker image tags: `infiniops-ci/:` and `:latest`. -Proxy and `no_proxy` env vars are forwarded from the host to `docker build` automatically. - -> `--push` is reserved for future use; requires a `registry` section in `config.yaml`. - ---- - -## Pipeline runner `run.py` - -Platform is auto-detected (via `nvidia-smi`/`ixsmi`/`mx-smi`/`mthreads-gmi`/`cnmon`/`npu-smi` on PATH), no manual specification needed. - -| Flag | Description | -|---|---| -| `--config` | Config file path (default: `.ci/config.yaml`) | -| `--job` | Job name (e.g., `nvidia_gpu`, `ascend_npu`). 
Defaults to all jobs for the current platform | -| `--branch` | Override clone branch (default: config `repo.branch`) | -| `--stage` | Run only the specified stage | -| `--image-tag` | Override image tag | -| `--gpu-id` | Override GPU device IDs (nvidia via `--gpus`, others via platform-specific env var) | -| `--test` | Override pytest test path (e.g., `tests/test_gemm.py::test_gemm`) | -| `--results-dir` | Host directory mounted to `/workspace/results` inside the container | -| `--local` | Mount current directory (read-only) instead of cloning from git | -| `--dry-run` | Print docker command without executing | - -```bash -# Simplest usage: auto-detect platform, run all jobs, use config default branch -python .ci/run.py - -# Run a specific job -python .ci/run.py --job nvidia_gpu - -# Run only the test stage, preview mode -python .ci/run.py --job nvidia_gpu --stage test --dry-run - -# Test local uncommitted changes without pushing -python .ci/run.py --local -``` - -Container execution flow: `git clone` → `checkout` → `setup` → stages (fail-fast: first failure breaks the loop and preserves the real exit code). -With `--local`, the current directory is mounted read-only at `/workspace/repo` and copied to a writable temp directory inside the container before setup runs — host files are never modified. -Proxy vars are forwarded from the host. Test results are written to `--results-dir` (each run gets a unique directory with timestamp + UUID suffix). Each run uses a clean environment (no host pip cache mounted). 
- ---- - -## Platform differences - -| Platform | GPU passthrough | Device env var | Base image | Detection tool | -|---|---|---|---|---| -| NVIDIA | `--gpus` (NVIDIA Container Toolkit) | — (uses Docker flag) | `nvcr.io/nvidia/pytorch:25.12-py3` | `nvidia-smi` | -| Iluvatar | `--privileged` + `/dev` mount | `CUDA_VISIBLE_DEVICES` | `corex:qs_pj20250825` | `ixsmi` | -| MetaX | `--privileged` | `CUDA_VISIBLE_DEVICES` | `maca-pytorch:3.2.1.4-...` | `mx-smi` | -| Moore | `--privileged` | `MTHREADS_VISIBLE_DEVICES` | `vllm_musa:20251112_hygon` | `mthreads-gmi` | -| Cambricon | `--privileged` | `MLU_VISIBLE_DEVICES` | `cambricon/pytorch:v1.25.3` | `cnmon` | -| Ascend | `--privileged` + device mounts | `ASCEND_VISIBLE_DEVICES` | `vllm-ascend:v0.18.0rc1-openeuler` | `npu-smi` | - -Device visibility is derived from the platform name (see `PLATFORM_DEVICE_ENV` in `ci_resource.py`). NVIDIA uses Docker's `--gpus` flag; all other platforms use `--privileged` and control visibility via a platform-specific environment variable. - ---- - -## Runner Agent `agent.py` - -The Runner Agent supports CLI manual dispatch, GitHub webhook triggers, resource-aware dynamic scheduling, and cross-machine remote dispatch. 
- -### CLI manual execution - -```bash -# Run all jobs (dispatched to remote agents, using config default branch) -python .ci/agent.py run - -# Specify branch -python .ci/agent.py run --branch feat/xxx - -# Run a specific job -python .ci/agent.py run --job nvidia_gpu - -# Filter by platform -python .ci/agent.py run --platform nvidia - -# Preview mode -python .ci/agent.py run --dry-run -``` - -| Flag | Description | -|---|---| -| `--branch` | Test branch (default: config `repo.branch`) | -| `--job` | Specific job name | -| `--platform` | Filter jobs by platform | -| `--commit` | Override commit SHA used for GitHub status reporting | -| `--image-tag` | Override image tag | -| `--dry-run` | Preview mode | - -### Webhook server - -Deploy one Agent instance per platform machine (platform is auto-detected). On each machine: - -```bash -python .ci/agent.py serve --port 8080 -``` - -Additional `serve` flags: - -| Flag | Description | -|---|---| -| `--port` | Listen port (default: 8080) | -| `--host` | Listen address (default: `0.0.0.0`) | -| `--webhook-secret` | GitHub webhook signing secret (or `WEBHOOK_SECRET` env var) | -| `--api-token` | `/api/run` Bearer auth token (or `AGENT_API_TOKEN` env var) | -| `--results-dir` | Results directory (default: `ci-results`) | -| `--utilization-threshold` | GPU idle threshold percentage (default: 10) | - -| Endpoint | Method | Description | -|---|---|---| -| `/webhook` | POST | GitHub webhook (push/pull_request) | -| `/api/run` | POST | Remote job trigger | -| `/api/job/{id}` | GET | Query job status | -| `/api/job/{id}/log` | GET | Full job log (text/plain) | -| `/health` | GET | Health check | -| `/status` | GET | Queue + resource status | - -Webhook supports `X-Hub-Signature-256` signature verification via `--webhook-secret` or `WEBHOOK_SECRET` env var. 
- -### Remote agent configuration - -Configure agent URLs in `config.yaml`; the CLI automatically dispatches remote jobs to the corresponding agents: - -```yaml -agents: - nvidia: - url: http://:8080 - iluvatar: - url: http://:8080 - metax: - url: http://:8080 - moore: - url: http://:8080 -``` - -### Resource scheduling - -The Agent auto-detects GPU utilization and system memory to dynamically determine parallelism: -- GPUs with utilization < threshold (default 10%) and not already allocated → available -- Allocation picks the **least-loaded** GPUs first (sorted by utilization ascending) -- When `gpu_ids: auto` (default), the scheduler allocates `ngpus` GPUs per job -- When resources are insufficient, jobs are queued automatically (max 100 pending); completed jobs release resources and trigger scheduling of queued tasks -- Docker execution has a Python-level timeout fallback (job timeout + 120s) to prevent stuck containers - -### GitHub Status - -Set the `GITHUB_TOKEN` env var and the Agent will automatically report commit status: -- `pending` — job started -- `success` / `failure` — job completed - -Status context format: `ci/infiniops/{job_name}` - ---- - -## Multi-machine deployment guide - -### Per-platform setup - -Each machine needs Docker installed, the platform runtime, and the base CI image built. 
- -| Platform | Runtime check | Base image | Build command | -|---|---|---|---| -| NVIDIA | `nvidia-smi` (+ [Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) | `nvcr.io/nvidia/pytorch:25.12-py3` (public) | `python .ci/build.py --platform nvidia` | -| Iluvatar | `ixsmi` | `corex:qs_pj20250825` (import in advance) | `python .ci/build.py --platform iluvatar` | -| MetaX | `mx-smi` | `maca-pytorch:3.2.1.4-...` (import in advance) | `python .ci/build.py --platform metax` | -| Moore | `mthreads-gmi` | `vllm_musa:20251112_hygon` (import in advance) | `python .ci/build.py --platform moore` | -| Cambricon | `cnmon` | `cambricon/pytorch:v1.25.3` (import in advance) | `python .ci/build.py --platform cambricon` | -| Ascend | `npu-smi` (+ Ascend driver + CANN toolkit) | `vllm-ascend:v0.18.0rc1-openeuler` (import in advance) | `python .ci/build.py --platform ascend` | - -### Start Agent services - -On each machine (platform is auto-detected): - -```bash -python .ci/agent.py serve --port 8080 -``` - -### Configure remote agent URLs - -On the trigger machine, add the `agents` section to `config.yaml` (see [Remote agent configuration](#remote-agent-configuration) above for the format). 
- -### Trigger cross-platform tests - -```bash -# Run all platform jobs at once (using config default branch) -python .ci/agent.py run - -# Preview mode (no actual execution) -python .ci/agent.py run --dry-run - -# Run only a specific platform -python .ci/agent.py run --platform nvidia -``` - -### Optional configuration - -#### GitHub Status reporting - -Set the env var on all machines so each reports its own platform's test status: - -```bash -export GITHUB_TOKEN=ghp_xxxxxxxxxxxx -``` - -#### API Token authentication - -When agents are exposed on untrusted networks, enable token auth: - -```bash -python .ci/agent.py serve --port 8080 --api-token -# Or: export AGENT_API_TOKEN= -``` - -#### GitHub Webhook auto-trigger - -In GitHub repo → Settings → Webhooks, add a webhook for each machine: - -| Field | Value | -|---|---| -| Payload URL | `http://:8080/webhook` | -| Content type | `application/json` | -| Secret | Must match `--webhook-secret` | -| Events | `push` and `pull_request` | - -```bash -python .ci/agent.py serve --port 8080 --webhook-secret -# Or: export WEBHOOK_SECRET= -``` - -### Verification checklist - -```bash -# 1. Dry-run each machine individually -for platform in nvidia iluvatar metax moore cambricon ascend; do - python .ci/agent.py run --platform $platform --dry-run -done - -# 2. Health and resource checks -for ip in ; do - curl http://$ip:8080/health - curl http://$ip:8080/status -done - -# 3. Cross-platform test -python .ci/agent.py run --branch master -``` diff --git a/.ci/agent.py b/.ci/agent.py deleted file mode 100644 index 9e8899b5..00000000 --- a/.ci/agent.py +++ /dev/null @@ -1,1088 +0,0 @@ -#!/usr/bin/env python3 -"""CI Runner Agent: webhook server, resource-aware scheduler, GitHub status reporting. 
- -Usage: - # Run jobs locally (or dispatch to remote agents) - python .ci/agent.py run - python .ci/agent.py run --branch master --job nvidia_gpu --dry-run - - # Start webhook server (auto-detects platform) - python .ci/agent.py serve --port 8080 -""" - -import argparse -import collections -import hashlib -import hmac -import json -import os -import shlex -import subprocess -import sys -import threading -import time -import urllib.error -import urllib.request -import uuid -from concurrent.futures import ThreadPoolExecutor, as_completed -from datetime import datetime -from http.server import BaseHTTPRequestHandler, HTTPServer -from pathlib import Path - -import ci_resource as res -import github_status as gh -import run - -# Maximum POST body size (1 MB) to prevent memory exhaustion -MAX_CONTENT_LENGTH = 1 * 1024 * 1024 - -# Job states -STATE_QUEUED = "queued" -STATE_RUNNING = "running" -STATE_PENDING = "pending" -STATE_SUCCESS = "success" -STATE_FAILURE = "failure" -STATE_ERROR = "error" - -TAIL_LINES = 50 -MAX_QUEUE_SIZE = 100 - -# urllib helpers (module-level for easier mocking in tests) -urllib_request = urllib.request.Request -urllib_urlopen = urllib.request.urlopen - - -class QueueFullError(Exception): - """Raised when the job queue has reached its maximum size.""" - - -# --------------------------------------------------------------------------- -# Data classes -# --------------------------------------------------------------------------- - - -class JobRequest: - """Describes a CI job to be executed.""" - - def __init__( - self, job_name, branch, commit_sha, config, image_tag=None, results_dir=None - ): - self.job_id = str(uuid.uuid4())[:8] - self.job_name = job_name - self.branch = branch - self.commit_sha = commit_sha - self.config = config - self.image_tag = image_tag - self.results_dir = results_dir or Path("ci-results") - self.created_at = datetime.now().isoformat() - - job = config["jobs"][job_name] - self.platform = job.get("platform", "nvidia") - - 
def to_dict(self): - return { - "job_id": self.job_id, - "job_name": self.job_name, - "branch": self.branch, - "commit_sha": self.commit_sha, - "platform": self.platform, - "created_at": self.created_at, - } - - -class JobResult: - """Outcome of a completed job.""" - - def __init__( - self, - job_id, - job_name, - commit_sha, - returncode, - results_dir, - duration, - error_tail=None, - log_file=None, - ): - self.job_id = job_id - self.job_name = job_name - self.commit_sha = commit_sha - self.returncode = returncode - self.results_dir = results_dir - self.duration = duration - self.error_tail = error_tail or [] - self.log_file = log_file - - self.state = STATE_SUCCESS if returncode == 0 else STATE_FAILURE - - def to_dict(self): - d = { - "job_id": self.job_id, - "job_name": self.job_name, - "commit_sha": self.commit_sha, - "state": self.state, - "returncode": self.returncode, - "results_dir": str(self.results_dir), - "duration_seconds": round(self.duration, 1), - } - - if self.error_tail: - d["error_tail"] = self.error_tail - - if self.log_file: - d["log_file"] = str(self.log_file) - - return d - - -# --------------------------------------------------------------------------- -# Scheduler -# --------------------------------------------------------------------------- - - -class Scheduler: - """Resource-aware job scheduler with dynamic parallelism.""" - - def __init__( - self, - config, - platform, - resource_pool, - results_dir=None, - max_workers=4, - no_status=False, - dry_run=False, - ): - self._config = config - self._platform = platform - self._resource_pool = resource_pool - self._results_dir = results_dir or Path("ci-results") - self._no_status = no_status - self._dry_run = dry_run - self._queue = collections.deque() - self._jobs: dict[str, dict] = {} # job_id -> {request, result, state, gpu_ids} - self._executor = ThreadPoolExecutor(max_workers=max_workers) - self._lock = threading.Lock() - self._done_event = threading.Event() - - # GitHub config - 
github_cfg = config.get("github", {}) - self._status_prefix = github_cfg.get("status_context_prefix", "ci/infiniops") - repo = config.get("repo", {}) - repo_url = repo.get("url", "") - self._owner, self._repo = gh.parse_repo_url(repo_url) - - def submit(self, job_request): - """Add a job to the queue and attempt to schedule it. - - Returns the job_id. Raises ``QueueFullError`` if the queue is at - capacity. - """ - with self._lock: - if len(self._queue) >= MAX_QUEUE_SIZE: - raise QueueFullError( - f"queue full ({MAX_QUEUE_SIZE} jobs), try again later" - ) - - self._jobs[job_request.job_id] = { - "request": job_request, - "result": None, - "state": STATE_QUEUED, - "gpu_ids": [], - } - self._queue.append(job_request) - - self._try_schedule() - return job_request.job_id - - def get_job(self, job_id): - """Get job info by ID.""" - with self._lock: - entry = self._jobs.get(job_id) - - if not entry: - return None - - info = entry["request"].to_dict() - info["state"] = entry["state"] - - if entry["result"]: - info.update(entry["result"].to_dict()) - - return info - - def get_job_log_file(self, job_id): - """Return the log file path for a completed job, or None.""" - with self._lock: - entry = self._jobs.get(job_id) - - if not entry or not entry["result"]: - return None - - return entry["result"].log_file - - def get_status(self): - """Return scheduler status for the /status endpoint.""" - with self._lock: - queued = [self._jobs[r.job_id]["request"].to_dict() for r in self._queue] - running = [] - completed = [] - - for entry in self._jobs.values(): - state = entry["state"] - - if state == STATE_RUNNING: - running.append( - {**entry["request"].to_dict(), "gpu_ids": entry["gpu_ids"]} - ) - elif state in (STATE_SUCCESS, STATE_FAILURE): - completed.append(entry["result"].to_dict()) - - return { - "queued": queued, - "running": running, - "completed": completed[-20:], # Last 20 - "resources": self._resource_pool.get_status(), - } - - def wait_all(self): - """Block until all 
submitted jobs are done. Returns list of JobResult.""" - while True: - with self._lock: - pending = any( - e["state"] in (STATE_QUEUED, STATE_RUNNING) - for e in self._jobs.values() - ) - - if not pending: - break - - self._done_event.wait(timeout=2.0) - self._done_event.clear() - - with self._lock: - return [e["result"] for e in self._jobs.values() if e["result"] is not None] - - def _try_schedule(self): - """Try to run queued jobs that have enough resources. - - Resource allocation and job submission are split: allocation decisions - are made under the lock, but executor.submit() happens outside to - prevent deadlock when the thread pool is saturated. - """ - to_launch = [] # [(req, gpu_ids), ...] - - with self._lock: - remaining = collections.deque() - - while self._queue: - req = self._queue.popleft() - job_cfg = self._config["jobs"].get(req.job_name, {}) - gpu_count = res.parse_gpu_requirement(job_cfg) - memory_mb = res.parse_memory_requirement(job_cfg) - - if self._dry_run: - # In dry-run mode, skip resource checks - gpu_ids, ok = [], True - else: - gpu_ids, ok = self._resource_pool.allocate(gpu_count, memory_mb) - - if ok: - self._jobs[req.job_id]["state"] = STATE_RUNNING - self._jobs[req.job_id]["gpu_ids"] = gpu_ids - to_launch.append((req, gpu_ids)) - else: - remaining.append(req) - - self._queue = remaining - - # Submit outside the lock to avoid deadlock with ThreadPoolExecutor - for req, gpu_ids in to_launch: - self._executor.submit(self._run_job, req, gpu_ids) - - def _run_job(self, req, gpu_ids): - """Execute a single job in a worker thread. - - Wrapped in try/finally to guarantee GPU resources are always released - and job state is updated even on unexpected exceptions. 
- """ - context = gh.build_status_context(self._status_prefix, req.job_name) - result = None - - try: - # Post pending status - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - STATE_PENDING, - context, - f"Running {req.job_name}...", - ) - - job_cfg = self._config["jobs"][req.job_name] - all_stages = job_cfg.get("stages", []) - repo_url = self._config.get("repo", {}).get("url", "") - commit_short = ( - req.commit_sha[:7] if len(req.commit_sha) > 7 else req.commit_sha - ) - results_dir = run.build_results_dir( - req.results_dir, req.platform, all_stages, commit_short - ) - - gpu_id_str = ",".join(str(g) for g in gpu_ids) if gpu_ids else None - docker_args = run.build_docker_args( - self._config, - req.job_name, - repo_url, - req.branch, - all_stages, - "/workspace", - req.image_tag, - gpu_id_override=gpu_id_str, - results_dir=results_dir, - ) - - start = time.monotonic() - - if self._dry_run: - print(f"[dry-run] {req.job_name}: {shlex.join(docker_args)}") - returncode = 0 - error_tail = [] - log_file = None - else: - results_dir.mkdir(parents=True, exist_ok=True) - log_file = results_dir / "job.log" - proc = subprocess.Popen( - docker_args, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - ) - tail_buf = collections.deque(maxlen=TAIL_LINES) - - with open(log_file, "wb") as lf: - for line in proc.stdout: - sys.stdout.buffer.write(line) - lf.write(line) - tail_buf.append(line) - - proc.stdout.close() - - # Python-level timeout as fallback for the in-container timeout. 
- job_timeout = job_cfg.get("resources", {}).get("timeout") - fallback_timeout = (job_timeout + 120) if job_timeout else 7200 - - try: - returncode = proc.wait(timeout=fallback_timeout) - except subprocess.TimeoutExpired: - proc.kill() - proc.wait() - returncode = -9 - timeout_msg = f"Job killed: exceeded {fallback_timeout}s timeout\n" - tail_buf.append(timeout_msg.encode()) - - with open(log_file, "ab") as lf: - lf.write(timeout_msg.encode()) - - if returncode != 0: - error_tail = [ - raw.decode("utf-8", errors="replace").rstrip("\n") - for raw in tail_buf - ] - else: - error_tail = [] - - duration = time.monotonic() - start - - result = JobResult( - job_id=req.job_id, - job_name=req.job_name, - commit_sha=req.commit_sha, - returncode=returncode, - results_dir=results_dir, - duration=duration, - error_tail=error_tail, - log_file=log_file, - ) - - # Post final status - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - result.state, - context, - f"{req.job_name}: {result.state} in {duration:.0f}s", - ) - except Exception as e: - print( - f"error: job {req.job_name} failed with exception: {e}", file=sys.stderr - ) - - if result is None: - result = JobResult( - job_id=req.job_id, - job_name=req.job_name, - commit_sha=req.commit_sha, - returncode=-1, - results_dir=req.results_dir, - duration=0, - error_tail=[str(e)], - ) - - if not self._no_status: - gh.post_commit_status( - self._owner, - self._repo, - req.commit_sha, - STATE_ERROR, - context, - f"{req.job_name}: internal error", - ) - finally: - # Always release resources and update state - self._resource_pool.release(gpu_ids) - - with self._lock: - self._jobs[req.job_id]["result"] = result - self._jobs[req.job_id]["state"] = ( - result.state if result else STATE_FAILURE - ) - - self._done_event.set() - # Safe outside lock: `_try_schedule` acquires `self._lock` internally. 
- self._try_schedule() - - return result - - -# --------------------------------------------------------------------------- -# Webhook server -# --------------------------------------------------------------------------- - - -def verify_signature(secret, body, signature_header): - """Verify GitHub webhook HMAC-SHA256 signature.""" - if not signature_header: - return False - - expected = ( - "sha256=" + hmac.new(secret.encode("utf-8"), body, hashlib.sha256).hexdigest() - ) - return hmac.compare_digest(expected, signature_header) - - -def _verify_api_token(handler): - """Check Bearer token for /api/run authentication. - - Returns True if authenticated, False (and sends 401) if not. - When no api_token is configured on the server, all requests are allowed. - """ - api_token = getattr(handler.server, "api_token", None) - - if not api_token: - return True - - auth_header = handler.headers.get("Authorization", "") - - if auth_header == f"Bearer {api_token}": - return True - - handler._respond_json(401, {"error": "unauthorized"}) - return False - - -class WebhookHandler(BaseHTTPRequestHandler): - """HTTP handler for GitHub webhooks and API endpoints.""" - - def log_message(self, format, *args): - msg = format % args if args else format - print(f"[agent] {msg}", file=sys.stderr) - - def do_GET(self): - if self.path == "/health": - self._respond_json(200, {"status": "ok", "platform": self.server.platform}) - elif self.path == "/status": - status = self.server.scheduler.get_status() - self._respond_json(200, status) - elif self.path.startswith("/api/job/"): - self._handle_api_job() - else: - self._respond_json(404, {"error": "not found"}) - - def do_POST(self): - content_length = int(self.headers.get("Content-Length", 0)) - - if content_length > MAX_CONTENT_LENGTH: - self._respond_json(413, {"error": "payload too large"}) - return - - body = self.rfile.read(content_length) - - if self.path == "/webhook": - self._handle_webhook(body) - elif self.path == "/api/run": - 
self._handle_api_run(body) - else: - self._respond_json(404, {"error": "not found"}) - - def _handle_webhook(self, body): - # Verify signature if secret is configured - if self.server.webhook_secret: - sig = self.headers.get("X-Hub-Signature-256", "") - - if not verify_signature(self.server.webhook_secret, body, sig): - self._respond_json(401, {"error": "invalid signature"}) - return - - event_type = self.headers.get("X-GitHub-Event", "") - - if event_type == "ping": - self._respond_json(200, {"msg": "pong"}) - return - - try: - payload = json.loads(body) - except json.JSONDecodeError: - self._respond_json(400, {"error": "invalid JSON"}) - return - - if event_type == "push": - branch, sha = self._parse_push(payload) - elif event_type == "pull_request": - action = payload.get("action", "") - - if action not in ("opened", "synchronize"): - self._respond_json(200, {"msg": f"ignored PR action: {action}"}) - return - - branch, sha = self._parse_pull_request(payload) - else: - self._respond_json(200, {"msg": f"ignored event: {event_type}"}) - return - - if not branch or not sha: - self._respond_json(400, {"error": "could not extract branch/sha"}) - return - - job_ids = self._submit_jobs(branch, sha) - self._respond_json(200, {"accepted": True, "job_ids": job_ids}) - - def _handle_api_run(self, body): - """Handle /api/run: remote job trigger (requires Bearer token auth).""" - if not _verify_api_token(self): - return - - try: - payload = json.loads(body) - except json.JSONDecodeError: - self._respond_json(400, {"error": "invalid JSON"}) - return - - branch = payload.get("branch", "") - sha = payload.get("commit_sha", "") - job_name = payload.get("job") - image_tag = payload.get("image_tag") - - if not branch: - self._respond_json(400, {"error": "branch is required"}) - return - - if not sha: - sha = run.get_git_commit() - - job_ids = self._submit_jobs(branch, sha, job_name=job_name, image_tag=image_tag) - self._respond_json(200, {"accepted": True, "job_ids": job_ids}) - - 
    def _handle_api_job(self):
        """Handle `GET /api/job/{id}` and `GET /api/job/{id}/log`."""
        # Path splits as ['', 'api', 'job', '{id}', 'log'?].
        parts = self.path.rstrip("/").split("/")

        if len(parts) < 4:
            self._respond_json(400, {"error": "missing job_id"})
            return

        job_id = parts[3]

        # `GET /api/job/{id}/log` — return full log file.
        if len(parts) >= 5 and parts[4] == "log":
            self._handle_job_log(job_id)
            return

        info = self.server.scheduler.get_job(job_id)

        if info is None:
            self._respond_json(404, {"error": f"job {job_id} not found"})
        else:
            self._respond_json(200, info)

    def _handle_job_log(self, job_id):
        """Return the full log file for a completed job.

        Responds with raw text/plain bytes rather than JSON so the log can
        be streamed or saved directly by the caller.
        """
        log_file = self.server.scheduler.get_job_log_file(job_id)

        if log_file is None or not Path(log_file).is_file():
            self._respond_json(404, {"error": f"log not available for job {job_id}"})
            return

        try:
            data = Path(log_file).read_bytes()
        except OSError as e:
            self._respond_json(500, {"error": f"failed to read log: {e}"})
            return

        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def _parse_push(self, payload):
        """Extract (branch, sha) from a GitHub `push` webhook payload."""
        branch = payload.get("ref", "").removeprefix("refs/heads/")
        sha = payload.get("after", "")
        return branch, sha

    def _parse_pull_request(self, payload):
        """Extract (branch, sha) of the PR head from a `pull_request` payload."""
        pr = payload.get("pull_request", {})
        head = pr.get("head", {})
        branch = head.get("ref", "")
        sha = head.get("sha", "")
        return branch, sha

    def _submit_jobs(self, branch, sha, job_name=None, image_tag=None):
        """Resolve job names for this agent's platform and enqueue them.

        Returns the list of scheduler job IDs submitted so far; on a full
        queue the 503 response is sent and the partial list returned.
        """
        config = self.server.config

        try:
            job_names = run.resolve_job_names(
                config.get("jobs", {}),
                platform=self.server.platform,
                job=job_name,
            )
        except ValueError as e:
            self._respond_json(400, {"error": str(e)})
            return []

        job_ids = []

        for name in job_names:
            req = JobRequest(
                job_name=name,
                branch=branch,
                commit_sha=sha,
                config=config,
                image_tag=image_tag,
                results_dir=self.server.results_dir,
            )

            try:
                jid = self.server.scheduler.submit(req)
            except QueueFullError as e:
                # Queue saturated: report 503 and stop submitting the rest.
                self._respond_json(503, {"error": str(e)})
                return job_ids

            job_ids.append(jid)

        return job_ids

    def _respond_json(self, status_code, data):
        """Serialize `data` as JSON and send it with the given status code."""
        body = json.dumps(data, indent=2).encode("utf-8")
        self.send_response(status_code)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)


class AgentServer(HTTPServer):
    """HTTP server with scheduler and config context.

    Subclasses HTTPServer only to attach per-agent state (config, scheduler,
    platform, auth secrets) that WebhookHandler reads via `self.server`.
    """

    def __init__(
        self,
        host,
        port,
        config,
        scheduler,
        platform,
        webhook_secret=None,
        api_token=None,
        results_dir=None,
    ):
        super().__init__((host, port), WebhookHandler)
        self.config = config
        self.scheduler = scheduler
        self.platform = platform
        self.webhook_secret = webhook_secret
        self.api_token = api_token
        self.results_dir = results_dir or Path("ci-results")


# ---------------------------------------------------------------------------
# Remote job dispatch (for CLI triggering remote agents)
# ---------------------------------------------------------------------------


def dispatch_remote_job(
    agent_url, job_name, branch, commit_sha, image_tag=None, api_token=None
):
    """Send a job to a remote agent via HTTP API. Returns job_id or None."""
    url = f"{agent_url.rstrip('/')}/api/run"
    body = {
        "branch": branch,
        "commit_sha": commit_sha,
        "job": job_name,
    }

    if image_tag:
        body["image_tag"] = image_tag

    data = json.dumps(body).encode("utf-8")
    headers = {"Content-Type": "application/json"}

    if api_token:
        headers["Authorization"] = f"Bearer {api_token}"

    req = urllib_request(url, data=data, headers=headers, method="POST")

    try:
        with urllib_urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read())
            job_ids = result.get("job_ids", [])
            # A dispatch names one job, so the first (only) ID is returned.
            return job_ids[0] if job_ids else None
    except Exception as e:
        # Best-effort: report and let the caller record a dispatch failure.
        print(f"error: failed to dispatch to {agent_url}: {e}", file=sys.stderr)
        return None


def poll_remote_job(agent_url, job_id, interval=5.0, timeout=7200):
    """Poll a remote agent for job completion. Returns final state dict or None.

    Transient network errors are tolerated; a warning is printed on the first
    failure and then every 20th, to avoid log spam during long outages.
    Returns None if the deadline passes without a terminal state.
    """
    url = f"{agent_url.rstrip('/')}/api/job/{job_id}"
    deadline = time.monotonic() + timeout
    consecutive_failures = 0

    while time.monotonic() < deadline:
        try:
            req = urllib_request(url)

            with urllib_urlopen(req, timeout=10) as resp:
                info = json.loads(resp.read())

            consecutive_failures = 0
            state = info.get("state", "")

            if state in (STATE_SUCCESS, STATE_FAILURE):
                return info
        except Exception as e:
            consecutive_failures += 1

            if consecutive_failures == 1 or consecutive_failures % 20 == 0:
                print(
                    f"warning: polling {url} failed ({consecutive_failures}x): {e}",
                    file=sys.stderr,
                )

        time.sleep(interval)

    return None
def fetch_remote_log(agent_url, job_id):
    """Fetch the full log for a completed remote job. Returns text or None."""
    url = f"{agent_url.rstrip('/')}/api/job/{job_id}/log"

    try:
        req = urllib_request(url)

        with urllib_urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")
    except Exception:
        # Logs are advisory; any failure degrades to the `error_tail` fallback.
        return None


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def cmd_run(args):
    """Handle 'run' subcommand: dispatch jobs to platform agents via HTTP.

    Resolves the job list from config, maps each job to its platform's agent
    URL, dispatches all jobs, then polls them concurrently. Exits non-zero if
    resolution fails, any agent URL is missing, or any job does not succeed.
    """
    config = run.load_config(args.config)
    agents = config.get("agents", {})
    branch = args.branch or config.get("repo", {}).get("branch", "master")
    commit_sha = args.commit or run.get_git_commit(short=False)

    # Determine which jobs to run
    try:
        job_names = run.resolve_job_names(
            config.get("jobs", {}), platform=args.platform, job=args.job
        )
    except ValueError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)

    if not job_names:
        print("error: no matching jobs found", file=sys.stderr)
        sys.exit(1)

    # Resolve agent URL for each job
    jobs_to_dispatch = []  # [(name, agent_url)]

    for name in job_names:
        job = config.get("jobs", {}).get(name, {})
        platform = job.get("platform", "")
        agent_url = agents.get(platform, {}).get("url", "")

        if not agent_url:
            print(
                f"error: no agent URL configured for platform {platform!r} (job {name})",
                file=sys.stderr,
            )
            sys.exit(1)

        jobs_to_dispatch.append((name, agent_url))

    api_token = os.environ.get("AGENT_API_TOKEN", "")
    results = []

    if args.dry_run:
        for name, agent_url in jobs_to_dispatch:
            # Job names are flattened as `{platform}_{job}`; split for display.
            platform, _, job = name.partition("_")
            print(f"[dry-run] dispatch {platform} {job} job to {agent_url}")
    else:
        # Dispatch all jobs, then poll concurrently.
        dispatched = []  # [(name, agent_url, job_id)]

        for name, agent_url in jobs_to_dispatch:
            platform, _, job = name.partition("_")
            print(
                f"==> dispatching {platform} {job} job to {agent_url}",
                file=sys.stderr,
            )
            job_id = dispatch_remote_job(
                agent_url,
                name,
                branch,
                commit_sha,
                args.image_tag,
                api_token=api_token or None,
            )

            if job_id:
                print(f" job_id: {job_id}", file=sys.stderr)
                dispatched.append((name, agent_url, job_id))
            else:
                print(f" failed to dispatch {name}", file=sys.stderr)
                results.append({"job_name": name, "state": "error"})

        if dispatched:
            # One polling thread per dispatched job.
            with ThreadPoolExecutor(max_workers=len(dispatched)) as executor:
                futures = {
                    executor.submit(poll_remote_job, url, jid): (name, url, jid)
                    for name, url, jid in dispatched
                }

                # Collect name lengths for column alignment.
                name_width = max(len(n) for n, _, _ in dispatched)

                for future in as_completed(futures):
                    name, agent_url, job_id = futures[future]
                    result = future.result()

                    if result:
                        state = result.get("state", "unknown")
                        duration = result.get("duration_seconds", 0)
                        tag = "PASS" if state == STATE_SUCCESS else "FAIL"
                        print(
                            f"<== {tag} {name:<{name_width}} ({duration:.0f}s)",
                            file=sys.stderr,
                        )

                        if state != STATE_SUCCESS:
                            full_log = fetch_remote_log(agent_url, job_id)

                            if full_log:
                                print(
                                    f"--- full log ({name}) ---",
                                    file=sys.stderr,
                                )
                                print(full_log, file=sys.stderr)
                                print("---", file=sys.stderr)
                            else:
                                # Fall back to `error_tail` if full log unavailable.
                                error_tail = result.get("error_tail", [])

                                if error_tail:
                                    print(
                                        f"--- error output (last {len(error_tail)} lines) ---",
                                        file=sys.stderr,
                                    )

                                    for line in error_tail:
                                        print(f" {line}", file=sys.stderr)

                                    print("---", file=sys.stderr)

                        results.append(result)
                    else:
                        # poll_remote_job returned None: polling deadline hit.
                        print(
                            f"<== TIMEOUT {name:<{name_width}}",
                            file=sys.stderr,
                        )
                        results.append({"job_name": name, "state": "timeout"})

    # Summary: only print when there are failures.
    failed = [r for r in results if r.get("state") != STATE_SUCCESS]

    if failed:
        print("\n========== Failed ==========", file=sys.stderr)
        name_width = max(len(r.get("job_name", "?")) for r in failed)

        for r in failed:
            name = r.get("job_name", "?")
            state = r.get("state", "unknown")
            duration = r.get("duration_seconds", 0)
            print(
                f" FAIL {name:<{name_width}} {state} ({duration:.0f}s)",
                file=sys.stderr,
            )

        sys.exit(1)


def cmd_serve(args):
    """Handle 'serve' subcommand: start webhook server.

    Detects the local platform, validates that config defines jobs for it,
    builds the resource pool + scheduler, and serves until interrupted.
    Missing webhook/API secrets only warn (useful for local development).
    """
    config = run.load_config(args.config)

    platform = res.detect_platform()

    if not platform:
        print(
            "error: could not detect platform (no nvidia-smi or ixsmi found)",
            file=sys.stderr,
        )
        sys.exit(1)

    # Fail fast if the config has no jobs for this platform.
    try:
        run.resolve_job_names(config.get("jobs", {}), platform=platform)
    except ValueError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)

    pool = res.ResourcePool(
        platform,
        utilization_threshold=args.utilization_threshold,
    )
    scheduler = Scheduler(
        config,
        platform,
        pool,
        results_dir=args.results_dir,
    )

    # CLI flags take precedence over environment variables.
    webhook_secret = args.webhook_secret or os.environ.get("WEBHOOK_SECRET", "")
    api_token = args.api_token or os.environ.get("AGENT_API_TOKEN", "")

    if not webhook_secret:
        print(
            "WARNING: No webhook secret configured. Webhook endpoint accepts "
            "unsigned requests. Set --webhook-secret or WEBHOOK_SECRET for production.",
            file=sys.stderr,
        )

    if not api_token:
        print(
            "WARNING: No API token configured. /api/run endpoint is unauthenticated. "
            "Set --api-token or AGENT_API_TOKEN for production.",
            file=sys.stderr,
        )

    server = AgentServer(
        args.host,
        args.port,
        config,
        scheduler,
        platform,
        webhook_secret=webhook_secret or None,
        api_token=api_token or None,
        results_dir=args.results_dir,
    )

    print(
        f"Agent serving on {args.host}:{args.port} (platform={platform})",
        file=sys.stderr,
    )
    print(" POST /webhook — GitHub webhook", file=sys.stderr)
    print(" POST /api/run — remote job trigger", file=sys.stderr)
    print(" GET /health — health check", file=sys.stderr)
    print(" GET /status — queue & resource status", file=sys.stderr)
    print(" GET /api/job/{id} — job status", file=sys.stderr)
    print(" GET /api/job/{id}/log — full job log", file=sys.stderr)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nShutting down...", file=sys.stderr)
        server.shutdown()
" - "Set --api-token or AGENT_API_TOKEN for production.", - file=sys.stderr, - ) - - server = AgentServer( - args.host, - args.port, - config, - scheduler, - platform, - webhook_secret=webhook_secret or None, - api_token=api_token or None, - results_dir=args.results_dir, - ) - - print( - f"Agent serving on {args.host}:{args.port} (platform={platform})", - file=sys.stderr, - ) - print(" POST /webhook — GitHub webhook", file=sys.stderr) - print(" POST /api/run — remote job trigger", file=sys.stderr) - print(" GET /health — health check", file=sys.stderr) - print(" GET /status — queue & resource status", file=sys.stderr) - print(" GET /api/job/{id} — job status", file=sys.stderr) - print(" GET /api/job/{id}/log — full job log", file=sys.stderr) - - try: - server.serve_forever() - except KeyboardInterrupt: - print("\nShutting down...", file=sys.stderr) - server.shutdown() - - -def main(): - parser = argparse.ArgumentParser( - description="CI Runner Agent: run jobs locally, dispatch remotely, or serve webhooks", - ) - subparsers = parser.add_subparsers(dest="command") - - # --- run subcommand --- - run_parser = subparsers.add_parser("run", help="Run CI jobs") - run_parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - ) - run_parser.add_argument( - "--branch", type=str, help="Branch to test (default: config repo.branch)" - ) - run_parser.add_argument("--job", type=str, help="Specific job name") - run_parser.add_argument("--platform", type=str, help="Filter jobs by platform") - run_parser.add_argument("--image-tag", type=str, help="Override image tag") - run_parser.add_argument("--commit", type=str, help="Override commit SHA") - run_parser.add_argument("--dry-run", action="store_true") - - # --- serve subcommand --- - serve_parser = subparsers.add_parser("serve", help="Start webhook server") - serve_parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - ) - 
serve_parser.add_argument("--port", type=int, default=8080) - serve_parser.add_argument("--host", type=str, default="0.0.0.0") - serve_parser.add_argument("--webhook-secret", type=str) - serve_parser.add_argument( - "--api-token", - type=str, - help="Bearer token for /api/run authentication (or AGENT_API_TOKEN env var)", - ) - serve_parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - ) - serve_parser.add_argument( - "--utilization-threshold", - type=int, - default=10, - ) - - args = parser.parse_args() - - if args.command == "run": - cmd_run(args) - elif args.command == "serve": - cmd_serve(args) - else: - parser.print_help() - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/.ci/build.py b/.ci/build.py deleted file mode 100644 index b373cb7d..00000000 --- a/.ci/build.py +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env python3 -"""CI image builder: detect changes, build, tag, and optionally push Docker images.""" - -import argparse -import json -import os -import shlex -import subprocess -import sys -from pathlib import Path - -from utils import get_git_commit, load_config - - -def has_dockerfile_changed(dockerfile_dir, base_ref="HEAD~1"): - """Check if any file under `dockerfile_dir` changed since `base_ref`.""" - result = subprocess.run( - ["git", "diff", "--name-only", base_ref, "--", dockerfile_dir], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print( - "warning: git diff failed (shallow clone or initial commit?);" - " assuming Dockerfile changed", - file=sys.stderr, - ) - return True - - return bool(result.stdout.strip()) - - -def docker_login(registry_cfg, dry_run): - """Log in to the registry using `credentials_env` token. - - Returns True on success. - - NOTE: Registry support is currently unused (`config.yaml` has no registry - section). Retained for future integration with an external image management - system. 
- """ - credentials_env = registry_cfg.get("credentials_env") - registry_url = registry_cfg.get("url", "") - - if not credentials_env or not registry_url: - return True - - token = os.environ.get(credentials_env) - - if not token: - print( - f"error: {credentials_env} not set, cannot login", - file=sys.stderr, - ) - return False - - if dry_run: - print( - f"[dry-run] echo | docker login {registry_url}" - " --username token --password-stdin" - ) - return True - - result = subprocess.run( - ["docker", "login", registry_url, "--username", "token", "--password-stdin"], - input=token, - text=True, - ) - - if result.returncode != 0: - print("error: docker login failed", file=sys.stderr) - return False - - return True - - -def build_image_tag(registry_url, project, platform, tag): - if registry_url: - return f"{registry_url}/{project}/{platform}:{tag}" - - return f"{project}-ci/{platform}:{tag}" - - -def build_image(platform, platform_cfg, registry_cfg, commit, push, dry_run, logged_in): - """Build a single platform image. 
Returns True on success.""" - registry_url = registry_cfg.get("url", "") - project = registry_cfg.get("project", "infiniops") - dockerfile_dir = platform_cfg["dockerfile"] - commit_tag = build_image_tag(registry_url, project, platform, commit) - latest_tag = build_image_tag(registry_url, project, platform, "latest") - - build_args_cfg = platform_cfg.get("build_args", {}) - build_cmd = ["docker", "build", "--network", "host"] - - for key, value in build_args_cfg.items(): - build_cmd.extend(["--build-arg", f"{key}={value}"]) - - for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) - - if proxy_val: - build_cmd.extend(["--build-arg", f"{proxy_var}={proxy_val}"]) - build_cmd.extend(["--build-arg", f"{proxy_var.lower()}={proxy_val}"]) - - private_sdk = platform_cfg.get("private_sdk", {}) - - if private_sdk: - source_env = private_sdk.get("source_env", "") - sdk_url = os.environ.get(source_env, "") if source_env else "" - - if sdk_url: - build_cmd.extend(["--build-arg", f"PRIVATE_SDK_URL={sdk_url}"]) - - build_cmd.extend(["-t", commit_tag, "-t", latest_tag, dockerfile_dir]) - - if dry_run: - print(f"[dry-run] {shlex.join(build_cmd)}") - - if push: - if not logged_in: - print("[dry-run] (skipping push: docker login failed)") - else: - print(f"[dry-run] docker push {commit_tag}") - print(f"[dry-run] docker push {latest_tag}") - - return True - - print(f"==> building {platform}: {commit_tag}", file=sys.stderr) - result = subprocess.run(build_cmd) - - if result.returncode != 0: - error = { - "stage": "build", - "platform": platform, - "tag": commit_tag, - "exit_code": result.returncode, - } - print(json.dumps(error), file=sys.stderr) - - return False - - if push: - if not logged_in: - print("error: docker login failed, cannot push", file=sys.stderr) - return False - - for tag in (commit_tag, latest_tag): - print(f"==> pushing {tag}", file=sys.stderr) - push_result = subprocess.run(["docker", 
"push", tag]) - - if push_result.returncode != 0: - error = { - "stage": "push", - "platform": platform, - "tag": tag, - "exit_code": push_result.returncode, - } - print(json.dumps(error), file=sys.stderr) - - return False - - return True - - -def main(): - parser = argparse.ArgumentParser(description="Build CI Docker images") - parser.add_argument( - "--platform", - type=str, - default="all", - help="Platform to build (nvidia, iluvatar, metax, moore, cambricon, ascend, or all). Default: all", - ) - parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - help="Path to config.yaml", - ) - parser.add_argument( - "--commit", - type=str, - default="HEAD", - help="Git ref for tagging the image (default: HEAD)", - ) - parser.add_argument( - "--push", - action="store_true", - help="Push images to registry after building (requires registry in config)", - ) - parser.add_argument( - "--force", - action="store_true", - help="Skip change detection and force build", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print commands without executing", - ) - args = parser.parse_args() - - config = load_config(args.config) - registry_cfg = config.get("registry", {}) - images_cfg = config.get("images", {}) - - if not images_cfg: - print("error: no `images` section in config", file=sys.stderr) - sys.exit(1) - - if args.platform == "all": - platforms = list(images_cfg.keys()) - else: - if args.platform not in images_cfg: - print( - f"error: platform `{args.platform}` not found in config", - file=sys.stderr, - ) - sys.exit(1) - platforms = [args.platform] - - commit = get_git_commit(args.commit) - logged_in = docker_login(registry_cfg, args.dry_run) if args.push else True - failed = False - - for platform in platforms: - platform_cfg = images_cfg[platform] - dockerfile_dir = platform_cfg["dockerfile"] - - if not Path(dockerfile_dir).is_dir(): - print( - f"warning: dockerfile directory `{dockerfile_dir}` does 
not exist," - f" skipping {platform}", - file=sys.stderr, - ) - continue - - if not args.force and not has_dockerfile_changed(dockerfile_dir): - print(f"==> {platform}: no changes detected, skipping", file=sys.stderr) - continue - - ok = build_image( - platform, - platform_cfg, - registry_cfg, - commit, - args.push, - args.dry_run, - logged_in=logged_in, - ) - - if not ok: - failed = True - - if failed: - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/.ci/ci_resource.py b/.ci/ci_resource.py deleted file mode 100644 index b23fee73..00000000 --- a/.ci/ci_resource.py +++ /dev/null @@ -1,575 +0,0 @@ -#!/usr/bin/env python3 -"""Resource detection and allocation for CI Runner Agent.""" - -import json -import operator -import os -import re -import shutil -import subprocess -import sys -import threading -from dataclasses import dataclass - -# Platform-to-device-env mapping for non-NVIDIA platforms. -# NVIDIA uses Docker's --gpus flag instead of an environment variable. -PLATFORM_DEVICE_ENV = { - "iluvatar": "CUDA_VISIBLE_DEVICES", - "metax": "CUDA_VISIBLE_DEVICES", - "moore": "MTHREADS_VISIBLE_DEVICES", - "cambricon": "MLU_VISIBLE_DEVICES", - "ascend": "ASCEND_VISIBLE_DEVICES", -} - - -@dataclass -class GpuInfo: - index: int - memory_used_mb: float - memory_total_mb: float - utilization_pct: float - - -@dataclass -class SystemResources: - total_memory_mb: float - available_memory_mb: float - cpu_count: int - - -class ResourcePool: - """Thread-safe GPU and system resource manager. - - Detects available GPUs via platform-specific tools (nvidia-smi, ixsmi, mx-smi, mthreads-gmi) - and tracks allocations to enable dynamic parallel scheduling. 
- """ - - GPU_QUERY_TOOLS = { - "nvidia": "nvidia-smi", - "iluvatar": "ixsmi", - "metax": "mx-smi", - "moore": "mthreads-gmi", - "cambricon": "cnmon", - "ascend": "npu-smi", - } - - def __init__(self, platform, utilization_threshold=10): - self._platform = platform - self._utilization_threshold = utilization_threshold - self._allocated: set[int] = set() - self._lock = threading.Lock() - - @property - def platform(self): - return self._platform - - @property - def allocated(self): - with self._lock: - return set(self._allocated) - - def detect_gpus(self) -> list[GpuInfo]: - """Query GPU status via platform-specific CLI tool.""" - if self._platform == "metax": - return self._detect_gpus_metax() - - if self._platform == "moore": - return self._detect_gpus_moore() - - if self._platform == "cambricon": - return self._detect_gpus_cambricon() - - if self._platform == "ascend": - return self._detect_gpus_ascend() - - tool = self.GPU_QUERY_TOOLS.get(self._platform) - - if not tool: - return [] - - try: - result = subprocess.run( - [ - tool, - "--query-gpu=index,memory.used,memory.total,utilization.gpu", - "--format=csv,noheader,nounits", - ], - capture_output=True, - text=True, - timeout=10, - ) - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - if result.returncode != 0: - return [] - - gpus = [] - - for line in result.stdout.strip().splitlines(): - parts = [p.strip() for p in line.split(",")] - - if len(parts) < 4: - continue - - try: - gpus.append( - GpuInfo( - index=int(parts[0]), - memory_used_mb=float(parts[1]), - memory_total_mb=float(parts[2]), - utilization_pct=float(parts[3]), - ) - ) - except (ValueError, IndexError): - continue - - return gpus - - def _detect_gpus_metax(self) -> list[GpuInfo]: - """Parse mx-smi output for MetaX GPUs. - - Runs --show-memory and --show-usage separately and merges results. 
- Output format example: - GPU#0 MXC550 0000:1a:00.0 - Memory - vis_vram total : 67108864 KB - vis_vram used : 879032 KB - Utilization - GPU : 0 % - """ - - def run_mxsmi(flag): - try: - r = subprocess.run( - ["mx-smi", flag], - capture_output=True, - text=True, - timeout=10, - ) - return r.stdout if r.returncode == 0 else "" - except (FileNotFoundError, subprocess.TimeoutExpired): - return "" - - mem_out = run_mxsmi("--show-memory") - util_out = run_mxsmi("--show-usage") - - # Parse memory: collect {index: (used_kb, total_kb)} - mem = {} - current = None - for line in mem_out.splitlines(): - m = re.match(r"GPU#(\d+)", line.strip()) - if m: - current = int(m.group(1)) - mem[current] = [0.0, 0.0] - continue - if current is None: - continue - m = re.search(r"vis_vram total\s*:\s*([\d.]+)\s*KB", line) - if m: - mem[current][1] = float(m.group(1)) / 1024 # KB -> MB - m = re.search(r"vis_vram used\s*:\s*([\d.]+)\s*KB", line) - if m: - mem[current][0] = float(m.group(1)) / 1024 # KB -> MB - - # Parse utilization: collect {index: utilization_pct} - util = {} - current = None - in_util = False - for line in util_out.splitlines(): - m = re.match(r"GPU#(\d+)", line.strip()) - if m: - current = int(m.group(1)) - in_util = False - continue - if current is None: - continue - if "Utilization" in line: - in_util = True - continue - if in_util: - m = re.match(r"\s*GPU\s*:\s*([\d.]+)\s*%", line) - if m: - util[current] = float(m.group(1)) - in_util = False - - gpus = [] - for idx in sorted(mem): - used_mb, total_mb = mem[idx] - gpus.append( - GpuInfo( - index=idx, - memory_used_mb=used_mb, - memory_total_mb=total_mb, - utilization_pct=util.get(idx, 0.0), - ) - ) - return gpus - - def _detect_gpus_moore(self) -> list[GpuInfo]: - """Parse mthreads-gmi JSON output for Moore Threads GPUs. 
    def _detect_gpus_moore(self) -> list[GpuInfo]:
        """Parse mthreads-gmi JSON output for Moore Threads GPUs.

        Uses: mthreads-gmi -q --json
        Expected JSON structure:
            {
              "Attached GPUs": {
                "GPU 00000000:3B:00.0": {
                  "Minor Number": "0",
                  "Memory Usage": {"Total": "24576 MiB", "Used": "512 MiB"},
                  "Utilization": {"Gpu": "5 %"}
                }
              }
            }
        """

        def extract_number(s):
            # Pull the leading numeric value out of strings like "512 MiB".
            m = re.search(r"([\d.]+)", str(s))
            return float(m.group(1)) if m else 0.0

        try:
            result = subprocess.run(
                ["mthreads-gmi", "-q", "--json"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return []

        if result.returncode != 0:
            return []

        try:
            data = json.loads(result.stdout)
        except json.JSONDecodeError:
            return []

        gpus = []
        attached = data.get("Attached GPUs", {})

        for gpu_data in attached.values():
            try:
                # Fall back to the running count if "Minor Number" is missing.
                index = int(gpu_data.get("Minor Number", len(gpus)))

                mem = gpu_data.get("Memory Usage", {})
                total_mb = extract_number(mem.get("Total", "0 MiB"))
                used_mb = extract_number(mem.get("Used", "0 MiB"))
                util_pct = extract_number(
                    gpu_data.get("Utilization", {}).get("Gpu", "0 %")
                )

                gpus.append(
                    GpuInfo(
                        index=index,
                        memory_used_mb=used_mb,
                        memory_total_mb=total_mb,
                        utilization_pct=util_pct,
                    )
                )
            except (ValueError, AttributeError):
                continue

        return sorted(gpus, key=operator.attrgetter("index"))

    def _detect_gpus_cambricon(self) -> list[GpuInfo]:
        """Parse cnmon output for Cambricon MLU cards.

        Each card appears as two consecutive data rows:
            Row 1: | {card} {vf} {name} {fw} | {bus_id} | {util}% {ecc} |
            Row 2: | {fan}% {temp} {pwr} | {mem_used} MiB/ {mem_total} MiB | ... |
        """
        try:
            result = subprocess.run(
                ["cnmon"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return []

        if result.returncode != 0:
            return []

        gpus = []
        lines = result.stdout.splitlines()
        i = 0

        while i < len(lines):
            line = lines[i]
            # Row 1: "| {index} ... | {bus_id} | {util}% {ecc} |"
            m1 = re.match(r"^\|\s+(\d+)\s+.*\|\s*([\d.]+)%", line)

            if m1 and i + 1 < len(lines):
                try:
                    card_index = int(m1.group(1))
                    util_pct = float(m1.group(2))
                    row2 = lines[i + 1]
                    mem_m = re.search(r"([\d.]+)\s+MiB/\s*([\d.]+)\s+MiB", row2)

                    if mem_m:
                        used_mb = float(mem_m.group(1))
                        total_mb = float(mem_m.group(2))
                    else:
                        used_mb, total_mb = 0.0, 0.0

                    gpus.append(
                        GpuInfo(
                            index=card_index,
                            memory_used_mb=used_mb,
                            memory_total_mb=total_mb,
                            utilization_pct=util_pct,
                        )
                    )
                except (ValueError, AttributeError):
                    pass
                # Consume both rows of the card entry.
                i += 2
                continue

            i += 1

        return sorted(gpus, key=operator.attrgetter("index"))

    def _detect_gpus_ascend(self) -> list[GpuInfo]:
        """Parse npu-smi info output for Huawei Ascend NPUs.

        Output format (pipe-delimited table, two rows per NPU):
            | 0 910B4 | OK | 86.5 41 ...
            | 0 | 0000:C1:00.0 | 0 0 / 0 2789 / 32768 |
        Row 1: index, name, health, power, temp, hugepages.
        Row 2: chip_id, bus_id, aicore_util, memory_usage, hbm_usage.
        """
        try:
            result = subprocess.run(
                ["npu-smi", "info"],
                capture_output=True,
                text=True,
                timeout=10,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired):
            return []

        if result.returncode != 0:
            return []

        gpus = []
        lines = result.stdout.splitlines()
        i = 0

        while i < len(lines):
            line = lines[i]
            # Match row 1: `| {index} {name} ...`.
            m1 = re.match(r"^\|\s+(\d+)\s+", line)

            if m1 and i + 1 < len(lines):
                try:
                    npu_index = int(m1.group(1))
                    aicore_m = re.match(
                        r"^\|\s+\d+\s+\|\s+[\da-f:.]+\s+\|\s*([\d.]+)\s", lines[i + 1]
                    )

                    util_pct = float(aicore_m.group(1)) if aicore_m else 0.0

                    # Parse HBM usage from row 2. Row contains both DDR
                    # ("0 / 0") and HBM ("2789 / 32768"); HBM is always last.
                    hbm_matches = re.findall(r"([\d.]+)\s*/\s*([\d.]+)", lines[i + 1])

                    if hbm_matches:
                        used_mb = float(hbm_matches[-1][0])
                        total_mb = float(hbm_matches[-1][1])
                    else:
                        used_mb, total_mb = 0.0, 0.0

                    gpus.append(
                        GpuInfo(
                            index=npu_index,
                            memory_used_mb=used_mb,
                            memory_total_mb=total_mb,
                            utilization_pct=util_pct,
                        )
                    )
                except (ValueError, AttributeError):
                    pass

                i += 2
                continue

            i += 1

        return sorted(gpus, key=operator.attrgetter("index"))
    def detect_system_resources(self) -> SystemResources:
        """Read system memory from /proc/meminfo and CPU count.

        Missing /proc/meminfo (non-Linux) yields zeros, which callers treat
        as "no memory information" rather than an error.
        """
        total_mb = 0.0
        available_mb = 0.0

        try:
            with open("/proc/meminfo", encoding="utf-8") as f:
                for line in f:
                    # Values in /proc/meminfo are reported in kB.
                    if line.startswith("MemTotal:"):
                        total_mb = float(line.split()[1]) / 1024
                    elif line.startswith("MemAvailable:"):
                        available_mb = float(line.split()[1]) / 1024
        except OSError:
            pass

        return SystemResources(
            total_memory_mb=total_mb,
            available_memory_mb=available_mb,
            cpu_count=os.cpu_count() or 1,
        )

    def allocate(self, gpu_count, memory_mb=0) -> tuple[list[int], bool]:
        """Try to allocate GPUs and check memory.

        Returns (allocated_gpu_ids, success). On failure returns ([], False).
        GPUs are selected by ascending utilization (least loaded first).
        Detection runs outside the lock to avoid blocking other threads.
        """
        if gpu_count <= 0:
            # GPU-less request: only the memory check applies.
            if memory_mb > 0:
                sys_res = self.detect_system_resources()

                if sys_res.available_memory_mb < memory_mb:
                    return ([], False)

            return ([], True)

        # Detect GPUs and memory outside the lock (subprocess.run can block).
        gpus = self.detect_gpus()
        sys_res = self.detect_system_resources() if memory_mb > 0 else None

        with self._lock:
            # A GPU is free if not reserved by us and below the busy threshold.
            available = [
                g
                for g in gpus
                if g.index not in self._allocated
                and g.utilization_pct < self._utilization_threshold
            ]

            if len(available) < gpu_count:
                return ([], False)

            if sys_res is not None and sys_res.available_memory_mb < memory_mb:
                return ([], False)

            # Pick least loaded GPUs.
            available.sort(key=lambda g: g.utilization_pct)
            selected = [g.index for g in available[:gpu_count]]
            self._allocated.update(selected)
            return (selected, True)

    def release(self, gpu_ids):
        """Return GPUs to the free pool."""
        with self._lock:
            self._allocated -= set(gpu_ids)

    def get_status(self) -> dict:
        """Return current resource status for API endpoints."""
        gpus = self.detect_gpus()
        sys_res = self.detect_system_resources()

        with self._lock:
            allocated = sorted(self._allocated)

        return {
            "platform": self._platform,
            "gpus": [
                {
                    "index": g.index,
                    "memory_used_mb": g.memory_used_mb,
                    "memory_total_mb": g.memory_total_mb,
                    "utilization_pct": g.utilization_pct,
                    "allocated_by_agent": g.index in allocated,
                }
                for g in gpus
            ],
            "allocated_gpu_ids": allocated,
            "system": {
                "total_memory_mb": round(sys_res.total_memory_mb, 1),
                "available_memory_mb": round(sys_res.available_memory_mb, 1),
                "cpu_count": sys_res.cpu_count,
            },
            "utilization_threshold": self._utilization_threshold,
        }
def parse_gpu_requirement(job_config) -> int:
    """Extract GPU count required by a job.

    Resolution rules:

    - ``gpu_ids: "auto"`` (or omitted) — dynamic allocation; returns ``ngpus``
      (default 1).
    - ``gpu_ids: "all"`` — use every available GPU; returns 0 (no reservation).
    - ``gpu_ids: "0,2"`` — static pinning; returns the count of listed IDs.
      When ``ngpus`` is also present the two must agree.

    The platform name determines how GPUs are exposed to Docker (see
    ``PLATFORM_DEVICE_ENV``) but does **not** affect GPU counting here.
    """
    resources = job_config.get("resources", {})
    gpu_ids = str(resources.get("gpu_ids", "auto")).strip()
    ngpus = resources.get("ngpus")

    if gpu_ids == "all":
        return 0

    if gpu_ids == "auto" or not gpu_ids:
        return int(ngpus) if ngpus is not None else 1

    # Static pinning — count explicit IDs.
    count = len(gpu_ids.split(","))

    if ngpus is not None and int(ngpus) != count:
        print(
            f"warning: gpu_ids has {count} device(s) but ngpus={ngpus}; "
            f"using gpu_ids count ({count})",
            file=sys.stderr,
        )

    return count


# Recognized memory unit suffixes -> multiplier to megabytes. Ordered so
# two-letter suffixes are tried before their one-letter prefixes.
_MEMORY_UNIT_FACTORS = {
    "tb": 1024 * 1024,
    "t": 1024 * 1024,
    "gb": 1024,
    "g": 1024,
    "mb": 1,
    "m": 1,
    "kb": 1 / 1024,
    "k": 1 / 1024,
}


def parse_memory_requirement(job_config) -> float:
    """Extract memory requirement in MB from a job config.

    Accepts values like "32GB", "16g", "512MB", "512m", "1TB", "2048KB",
    or a bare number (interpreted as GB). Returns 0 when the value is
    missing or unparseable (with a warning on stderr).

    Fixes over the previous version: a malformed number before a known
    suffix (e.g. "xGB") no longer raises an uncaught ValueError, and the
    kb/k and tb/t suffixes are now recognized instead of parsing as 0.
    """
    resources = job_config.get("resources", {})
    memory = str(resources.get("memory", ""))

    if not memory:
        return 0

    memory = memory.lower().strip()

    for suffix, factor in _MEMORY_UNIT_FACTORS.items():
        if memory.endswith(suffix):
            try:
                return float(memory[: -len(suffix)]) * factor
            except ValueError:
                # Recognized suffix but bad number: fall through to warning.
                break

    try:
        return float(memory) * 1024  # Default: GB
    except ValueError:
        print(
            f"warning: unrecognized memory format {memory!r}, treating as 0",
            file=sys.stderr,
        )

    return 0


def detect_platform():
    """Auto-detect the current platform by probing GPU query tools on PATH.

    Returns the first platform whose query tool exists, or None when no
    known tool is installed.
    """
    for platform, tool in ResourcePool.GPU_QUERY_TOOLS.items():
        if shutil.which(tool):
            return platform

    return None
parse_repo_url(url): - """Extract (owner, repo) from a GitHub URL. - - Handles: - - https://github.com/Owner/Repo.git - - git@github.com:Owner/Repo.git - """ - # HTTPS format - m = re.match(r"https?://[^/]+/([^/]+)/([^/]+?)(?:\.git)?$", url) - - if m: - return m.group(1), m.group(2) - - # SSH format - m = re.match(r"git@[^:]+:([^/]+)/([^/]+?)(?:\.git)?$", url) - - if m: - return m.group(1), m.group(2) - - return "", "" - - -def build_status_context(prefix, job_name): - """Build status context string, e.g. 'ci/infiniops/nvidia_gpu'.""" - return f"{prefix}/{job_name}" - - -def post_commit_status( - owner, - repo, - sha, - state, - context, - description, - target_url=None, - token=None, -): - """Post a commit status to GitHub. - - Args: - state: One of 'pending', 'success', 'failure', 'error'. - Returns True on success, False on failure. - """ - token = token or os.environ.get("GITHUB_TOKEN", "") - - if not token: - print("warning: GITHUB_TOKEN not set, skipping status update", file=sys.stderr) - return False - - if not owner or not repo or not sha: - print( - "warning: missing owner/repo/sha, skipping status update", file=sys.stderr - ) - return False - - url = f"https://api.github.com/repos/{owner}/{repo}/statuses/{sha}" - body = { - "state": state, - "context": context, - "description": description[:140], - } - - if target_url: - body["target_url"] = target_url - - data = json.dumps(body).encode("utf-8") - req = urllib.request.Request( - url, - data=data, - headers={ - "Authorization": f"token {token}", - "Accept": "application/vnd.github.v3+json", - "Content-Type": "application/json", - }, - method="POST", - ) - - try: - with urllib.request.urlopen(req, timeout=30) as resp: - return 200 <= resp.status < 300 - except urllib.error.HTTPError as e: - print( - f"warning: GitHub status API returned {e.code}: {e.reason}", - file=sys.stderr, - ) - return False - except urllib.error.URLError as e: - print(f"warning: GitHub status API error: {e.reason}", file=sys.stderr) - 
return False diff --git a/.ci/images/ascend/Dockerfile b/.ci/images/ascend/Dockerfile deleted file mode 100644 index a542b99e..00000000 --- a/.ci/images/ascend/Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG PIP_INDEX_URL=https://pypi.org/simple - -RUN pip install --no-cache-dir --progress off \ - scikit-build-core \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - ruff - -ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest -ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH} -ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH} -ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH} -ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME} -ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp -ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit - -WORKDIR /workspace diff --git a/.ci/images/cambricon/Dockerfile b/.ci/images/cambricon/Dockerfile deleted file mode 100644 index 138f3cb4..00000000 --- a/.ci/images/cambricon/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -# Python 3.10 executables (`pip`-installed tools) live under `/usr/local/python3.10/bin`. -ENV PATH=/usr/local/python3.10/bin:${PATH} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -# `git` and `cmake` are pre-installed; `coreutils-single` covers coreutils needs. 
-RUN dnf install -y ninja-build && dnf clean all - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - ruff==0.15.7 - -# Pin pre-installed Cambricon `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/iluvatar/Dockerfile b/.ci/images/iluvatar/Dockerfile deleted file mode 100644 index 79afc858..00000000 --- a/.ci/images/iluvatar/Dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# CoreX runtime environment (base image sets these in `/etc/bash.bashrc`, -# but `docker build` `RUN` uses `/bin/sh` which doesn't source it). 
-ENV PATH=/usr/local/corex/bin:/usr/local/corex-4.3.0/corex-toolbox-1.0.0/bin:/usr/local/corex/lib64/python3/dist-packages/bin:/usr/local/openmpi/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV PYTHONPATH=/usr/local/corex/lib64/python3/dist-packages -ENV LD_LIBRARY_PATH=/usr/local/corex/lib64:/usr/local/lib:/usr/local/openmpi/lib - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - ninja-build \ - coreutils \ - && rm -rf /var/lib/apt/lists/* - -RUN ln -sf $(which python3) /usr/local/bin/python 2>/dev/null || true - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -RUN pip config set global.index-url https://pypi.org/simple - -# Pin pre-installed CoreX `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/metax/Dockerfile b/.ci/images/metax/Dockerfile deleted file mode 100644 index 540bc9d5..00000000 --- a/.ci/images/metax/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# `conda` Python is used in this image. 
-ENV PATH=/opt/conda/bin:${PATH} - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - cmake \ - ninja-build \ - coreutils \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -# Pin pre-installed MetaX `torch` to prevent `pip` from replacing it with upstream version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/moore/Dockerfile b/.ci/images/moore/Dockerfile deleted file mode 100644 index a95d9bd1..00000000 --- a/.ci/images/moore/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -# `MUSA_HOME`, `PATH`, `LD_LIBRARY_PATH` already set by base image. - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - ninja-build \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - libclang \ - pytest-cov \ - pytest-xdist \ - ruff==0.15.7 - -# Pin pre-installed `torch` to prevent `pip` from replacing it with upstream version. 
-RUN echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/images/nvidia/Dockerfile b/.ci/images/nvidia/Dockerfile deleted file mode 100644 index b4984dac..00000000 --- a/.ci/images/nvidia/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -ENV DEBIAN_FRONTEND=noninteractive - -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY -ARG http_proxy -ARG https_proxy -ARG no_proxy - -ARG APT_MIRROR -RUN if [ -n "$APT_MIRROR" ]; then \ - sed -i "s|http://[^/]*/ubuntu|${APT_MIRROR}|g" /etc/apt/sources.list; \ - fi && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - cmake \ - ninja-build \ - coreutils \ - libclang-dev \ - && rm -rf /var/lib/apt/lists/* - - -ARG PIP_INDEX_URL -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir \ - ${PIP_INDEX_URL:+--index-url "$PIP_INDEX_URL"} \ - scikit-build-core \ - pybind11 \ - libclang \ - pytest \ - pytest-cov \ - pytest-xdist \ - pyyaml \ - ruff==0.15.7 - -# Pin pre-installed `torch` to prevent `pip` from replacing it with a different version. -RUN pip show torch >/dev/null 2>&1 && \ - echo "torch==$(pip show torch | grep '^Version:' | awk '{print $2}')" > /etc/pip-constraints.txt || \ - touch /etc/pip-constraints.txt -ENV PIP_CONSTRAINT=/etc/pip-constraints.txt - -WORKDIR /workspace diff --git a/.ci/restart-agent.sh b/.ci/restart-agent.sh deleted file mode 100755 index efe0a900..00000000 --- a/.ci/restart-agent.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# Usage: bash .ci/restart-agent.sh [port] [webhook-secret] -# -# Restart the CI agent with proxy configured. 
-# Edit the HTTPS_PROXY line below for your environment, then: -# bash .ci/restart-agent.sh -# bash .ci/restart-agent.sh 8080 my-webhook-secret - -set -euo pipefail - -PORT="${1:-8080}" -WEBHOOK_SECRET="${2:-}" - -# --- Proxy config (edit this) --- -export HTTPS_PROXY="http://your-proxy:port" -export HTTP_PROXY="$HTTPS_PROXY" -export NO_PROXY="localhost,127.0.0.1" -export https_proxy="$HTTPS_PROXY" -export http_proxy="$HTTP_PROXY" -export no_proxy="$NO_PROXY" - -# --- Kill existing agent --- -if pgrep -f "agent.py serve" > /dev/null 2>&1; then - echo "Stopping existing agent..." - pkill -f "agent.py serve" || true - sleep 2 -fi - -# --- Start agent --- -CI_DIR="$(cd "$(dirname "$0")" && pwd)" - -if [ ! -f "$CI_DIR/agent.py" ]; then - echo "error: $CI_DIR/agent.py not found" - exit 1 -fi - -ARGS="serve --port $PORT" -if [ -n "$WEBHOOK_SECRET" ]; then - ARGS="$ARGS --webhook-secret $WEBHOOK_SECRET" -fi - -echo "Starting CI agent on port $PORT..." -nohup python "$CI_DIR/agent.py" $ARGS > /tmp/ci-agent.log 2>&1 & - -HOST_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || hostname) - -echo "PID: $!" -echo "Listen: http://${HOST_IP}:${PORT}" -echo "Log: /tmp/ci-agent.log" -echo "Proxy: $HTTPS_PROXY" diff --git a/.ci/run.py b/.ci/run.py deleted file mode 100644 index e293b4a2..00000000 --- a/.ci/run.py +++ /dev/null @@ -1,499 +0,0 @@ -#!/usr/bin/env python3 -"""Standalone Docker CI runner: clone repo, setup, run stages. Output to stdout.""" - -import argparse -import os -import re -import shlex -import subprocess -import sys -import uuid -import xml.etree.ElementTree as ET -from datetime import datetime -from pathlib import Path - -from ci_resource import ( - PLATFORM_DEVICE_ENV, - ResourcePool, - detect_platform, - parse_gpu_requirement, - parse_memory_requirement, -) -from utils import get_git_commit, load_config - -# Flags that consume the next token as their value (e.g. -n 4, -k expr). 
-_PYTEST_VALUE_FLAGS = {"-n", "-k", "-m", "-p", "--tb", "--junitxml", "--rootdir"} - - -def _junit_xml_indicates_pass(results_dir): - """Return True if `pytest` junit XML under `results_dir` reports no failures/errors. - - Used to distinguish a real CI failure from the docker 18.09 - container-teardown `SIGKILL` (exit code 137) that occurs on this host - after a child process exits successfully — bash returns 0 from inside - the container, but the docker daemon reports 137 due to a race in its - `--rm` cleanup path. The junit XML is written by pytest before that - teardown and reliably captures the real outcome of the test stage. - """ - for junit in Path(results_dir).rglob("test-results.xml"): - try: - root = ET.parse(junit).getroot() - except ET.ParseError: - continue - - suites = root.findall("testsuite") if root.tag == "testsuites" else [root] - - if not suites: - continue - - for suite in suites: - try: - if int(suite.get("failures", 0)) > 0: - return False - - if int(suite.get("errors", 0)) > 0: - return False - except ValueError: - return False - - return True - - return False - - -def apply_test_override(run_cmd, test_path): - """Replace positional test path(s) in a pytest stage command. - - For example: ``pytest tests/ -n 4 ...`` becomes - ``pytest tests/test_gemm.py -n 4 ...`` when ``test_path`` is - ``tests/test_gemm.py``. - """ - parts = shlex.split(run_cmd) - - if not parts or parts[0] != "pytest": - return run_cmd - - result = ["pytest", test_path] - skip_next = False - - for p in parts[1:]: - if skip_next: - result.append(p) - skip_next = False - continue - - if p.startswith("-"): - result.append(p) - if p in _PYTEST_VALUE_FLAGS: - skip_next = True - continue - - # Skip existing test paths; the override is already in result[1]. 
- if not ("/" in p or p.endswith(".py") or "::" in p): - result.append(p) - - return shlex.join(result) - - -def build_results_dir(base, platform, stages, commit): - """Build a results directory path: `{base}/{platform}_{stages}_{commit}_{timestamp}_{id}`.""" - stage_names = "+".join(s["name"] for s in stages) - safe_commit = re.sub(r"[^a-zA-Z0-9._-]", "", commit) or "unknown" - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - short_id = uuid.uuid4().hex[:6] - dirname = f"{platform}_{stage_names}_{safe_commit}_{timestamp}_{short_id}" - - return Path(base) / dirname - - -def resolve_image(config, platform, image_tag): - """Resolve an image reference to a full image name. - - Accepts `stable`, `latest`, or a commit hash as `image_tag`. When config - contains a registry section, returns a registry-prefixed URL. Otherwise - returns a local tag (current default). - """ - registry = config.get("registry", {}) - registry_url = registry.get("url", "") - project = registry.get("project", "infiniops") - - if not registry_url: - return f"{project}-ci/{platform}:{image_tag}" - - return f"{registry_url}/{project}/{platform}:{image_tag}" - - -def build_runner_script(): - return r""" -set -e -cd /workspace -mkdir -p /workspace/results -if [ -n "$LOCAL_SRC" ]; then - cp -r "$LOCAL_SRC" /tmp/src - cd /tmp/src -else - git clone "$REPO_URL" repo - cd repo - git checkout "$BRANCH" -fi -echo "========== Setup ==========" -eval "$SETUP_CMD" -set +e -rc=0 -for i in $(seq 1 "$NUM_STAGES"); do - name_var="STAGE_${i}_NAME" - cmd_var="STAGE_${i}_CMD" - name="${!name_var}" - cmd="${!cmd_var}" - echo "========== Stage: $name ==========" - if [ -n "$cmd" ]; then - eval "$cmd" - rc=$? 
- if [ $rc -ne 0 ]; then - echo "Stage '$name' failed with exit code $rc" - break - fi - fi -done -echo "========== Summary ==========" -if [ -n "$HOST_UID" ] && [ -n "$HOST_GID" ]; then - chown -R "$HOST_UID:$HOST_GID" /workspace/results 2>/dev/null || true -fi -exit $rc -""" - - -def build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - workdir, - image_tag_override, - gpu_id_override=None, - results_dir=None, - local_path=None, -): - job = config["jobs"][job_name] - platform = job.get("platform", "nvidia") - image_tag = image_tag_override or job.get("image", "latest") - image = resolve_image(config, platform, image_tag) - resources = job.get("resources", {}) - setup_raw = job.get("setup", "pip install .[dev]") - - if isinstance(setup_raw, list): - setup_cmd = "\n".join(setup_raw) - else: - setup_cmd = setup_raw - - args = [ - "docker", - "run", - "--rm", - "--network", - "host", - "-i", - "-w", - workdir, - "-e", - f"REPO_URL={repo_url}", - "-e", - f"BRANCH={branch}", - "-e", - f"SETUP_CMD={setup_cmd}", - "-e", - f"NUM_STAGES={len(stages)}", - "-e", - f"HOST_UID={os.getuid()}", - "-e", - f"HOST_GID={os.getgid()}", - ] - - for proxy_var in ("HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY"): - proxy_val = os.environ.get(proxy_var) or os.environ.get(proxy_var.lower()) - - if proxy_val: - args.extend(["-e", f"{proxy_var}={proxy_val}"]) - args.extend(["-e", f"{proxy_var.lower()}={proxy_val}"]) - - for key, value in job.get("env", {}).items(): - args.extend(["-e", f"{key}={value}"]) - - if results_dir: - args.extend(["-v", f"{results_dir.resolve()}:/workspace/results"]) - - if local_path: - args.extend(["-v", f"{local_path}:/workspace/repo:ro"]) - args.extend(["-e", "LOCAL_SRC=/workspace/repo"]) - - for i, s in enumerate(stages): - args.append("-e") - args.append(f"STAGE_{i + 1}_NAME={s['name']}") - args.append("-e") - args.append(f"STAGE_{i + 1}_CMD={s.get('run', '')}") - - # Platform-specific device access - for flag in job.get("docker_args", []): - 
args.append(flag) - - for vol in job.get("volumes", []): - args.extend(["-v", vol]) - - raw_gpu_ids = str(resources.get("gpu_ids", "auto")).strip() - gpu_id = gpu_id_override or ("" if raw_gpu_ids == "auto" else raw_gpu_ids) - - if gpu_id: - if platform == "nvidia": - args.extend(["--gpus", "all" if gpu_id == "all" else f"device={gpu_id}"]) - elif gpu_id != "all": - device_env = PLATFORM_DEVICE_ENV.get(platform) - - if device_env: - args.extend(["-e", f"{device_env}={gpu_id}"]) - - memory = resources.get("memory") - - if memory: - mem = str(memory).lower().replace("gb", "g").replace("mb", "m") - - if not mem.endswith("g") and not mem.endswith("m"): - mem = f"{mem}g" - - args.extend(["--memory", mem]) - - shm_size = resources.get("shm_size") - - if shm_size: - args.extend(["--shm-size", str(shm_size)]) - - timeout_sec = resources.get("timeout") - args.append(image) - - if timeout_sec: - # Requires coreutils `timeout` inside the container image. - args.extend(["timeout", str(timeout_sec)]) - - args.extend(["bash", "-c", build_runner_script().strip()]) - - return args - - -def resolve_job_names(jobs, platform=None, job=None): - """Resolve job names for a platform. - - - ``job=None`` — all jobs for the platform. - - ``job="nvidia_gpu"`` — direct lookup by full name. - - Raises ``ValueError`` if no matching jobs are found. 
- """ - if job: - if job not in jobs: - raise ValueError(f"job {job!r} not found in config") - - return [job] - - if not platform: - return list(jobs.keys()) - - matches = [name for name, cfg in jobs.items() if cfg.get("platform") == platform] - - if not matches: - raise ValueError(f"no jobs for platform {platform!r}") - - return matches - - -def main(): - parser = argparse.ArgumentParser(description="Run Docker CI pipeline") - parser.add_argument( - "--config", - type=Path, - default=Path(__file__).resolve().parent / "config.yaml", - help="Path to config.yaml", - ) - parser.add_argument( - "--branch", type=str, help="Override repo branch (default: config repo.branch)" - ) - parser.add_argument( - "--job", - type=str, - help="Job name (e.g. nvidia_gpu, ascend_npu). Default: all jobs for detected platform", - ) - parser.add_argument( - "--stage", - type=str, - help="Run only this stage name (still runs setup first)", - ) - parser.add_argument( - "--image-tag", - type=str, - help="Override image tag (stable, latest, or commit hash)", - ) - parser.add_argument( - "--gpu-id", - type=str, - help='GPU device IDs to use, e.g. "0", "0,2", "all"', - ) - parser.add_argument( - "--results-dir", - type=Path, - default=Path("ci-results"), - help="Base directory for test results (default: ./ci-results)", - ) - parser.add_argument( - "--test", - type=str, - help='Override pytest test path, e.g. 
"tests/test_gemm.py" or "tests/test_gemm.py::test_gemm"', - ) - parser.add_argument( - "--local", - action="store_true", - help="Mount current directory (read-only) into the container instead of cloning from git", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Print docker command and exit", - ) - args = parser.parse_args() - - config = load_config(args.config) - repo = config.get("repo", {}) - repo_url = repo.get("url", "https://github.com/InfiniTensor/InfiniOps.git") - branch = args.branch or repo.get("branch", "master") - - platform = detect_platform() - - if not platform: - tools = ", ".join(ResourcePool.GPU_QUERY_TOOLS.values()) - print(f"error: could not detect platform (no {tools} found)", file=sys.stderr) - sys.exit(1) - - print(f"platform: {platform}", file=sys.stderr) - - jobs = config.get("jobs", {}) - - if not jobs: - print("error: no jobs in config", file=sys.stderr) - sys.exit(1) - - try: - job_names = resolve_job_names(jobs, platform, job=args.job) - except ValueError as e: - print(f"error: {e}", file=sys.stderr) - sys.exit(1) - - pool = ResourcePool(platform) - failed = 0 - - for job_name in job_names: - job = jobs[job_name] - all_stages = job.get("stages", []) - - if args.stage: - stages = [s for s in all_stages if s["name"] == args.stage] - - if not stages: - print( - f"error: stage {args.stage!r} not found in {job_name}", - file=sys.stderr, - ) - sys.exit(1) - else: - stages = all_stages - - if args.test: - stages = [ - {**s, "run": apply_test_override(s.get("run", ""), args.test)} - for s in stages - ] - - # Resolve GPU assignment: CLI override > auto-allocate > static config. 
- gpu_id_override = args.gpu_id - allocated_ids = [] - raw_gpu_ids = str(job.get("resources", {}).get("gpu_ids", "auto")).strip() - - if not gpu_id_override and raw_gpu_ids == "auto": - gpu_count = parse_gpu_requirement(job) - memory_mb = parse_memory_requirement(job) - allocated_ids, ok = pool.allocate(gpu_count, memory_mb) - - if not ok: - detected = pool.detect_gpus() - if not detected: - hint = ( - f"error: cannot allocate {gpu_count} GPU(s) for {job_name}" - f" — GPU detection returned no devices" - f" (is {ResourcePool.GPU_QUERY_TOOLS.get(platform, '?')} working?)" - f"\nhint: use --gpu-id 0 to bypass auto-allocation" - ) - else: - hint = ( - f"error: cannot allocate {gpu_count} GPU(s) for {job_name}" - f" — {len(detected)} GPU(s) detected but none available" - f" (utilization threshold: {pool._utilization_threshold}%)" - f"\nhint: use --gpu-id 0 to bypass auto-allocation" - ) - print(hint, file=sys.stderr) - failed += 1 - continue - - if allocated_ids: - gpu_id_override = ",".join(str(g) for g in allocated_ids) - - job_platform = job.get("platform", platform) - commit = get_git_commit() - results_dir = build_results_dir(args.results_dir, job_platform, stages, commit) - - local_path = Path.cwd().resolve() if args.local else None - docker_args = build_docker_args( - config, - job_name, - repo_url, - branch, - stages, - "/workspace", - args.image_tag, - gpu_id_override=gpu_id_override, - results_dir=results_dir, - local_path=local_path, - ) - - if args.dry_run: - print(shlex.join(docker_args)) - pool.release(allocated_ids) - continue - - print(f"==> running job: {job_name}", file=sys.stderr) - results_dir.mkdir(parents=True, exist_ok=True) - - try: - returncode = subprocess.run(docker_args).returncode - finally: - pool.release(allocated_ids) - - if returncode != 0: - # Docker 18.09 on this host occasionally SIGKILLs containers - # during `--rm` cleanup after the inner process already exited - # cleanly, producing exit code 137. 
Fall back to the pytest - # junit XML to recover the real outcome in that case. - if returncode == 137 and _junit_xml_indicates_pass(results_dir): - print( - f"[warn] job {job_name}: container exited with 137 " - f"(likely docker teardown SIGKILL after clean pytest); " - f"junit XML reports no failures — treating as success", - file=sys.stderr, - ) - else: - print( - f"job {job_name} failed (exit code {returncode})", - file=sys.stderr, - ) - failed += 1 - - sys.exit(1 if failed else 0) - - -if __name__ == "__main__": - main() diff --git a/.ci/tests/__init__.py b/.ci/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/.ci/tests/conftest.py b/.ci/tests/conftest.py deleted file mode 100644 index 7b028764..00000000 --- a/.ci/tests/conftest.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys -from pathlib import Path - -# Allow `import run` and `import build` directly. -sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) - -import pytest - -from utils import normalize_config - - -@pytest.fixture -def minimal_config(): - """Minimal platform-centric config, normalized to flat format.""" - raw = { - "repo": { - "url": "https://github.com/InfiniTensor/InfiniOps.git", - "branch": "master", - }, - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [ - { - "name": "test", - "run": "pytest tests/ -v", - } - ], - } - }, - } - }, - } - return normalize_config(raw) diff --git a/.ci/tests/test_agent.py b/.ci/tests/test_agent.py deleted file mode 100644 index a0c8cccc..00000000 --- a/.ci/tests/test_agent.py +++ /dev/null @@ -1,724 +0,0 @@ -import hashlib -import hmac -import json -import threading -from pathlib import Path -from unittest.mock import MagicMock - -import pytest - -import 
agent -import ci_resource as res -import run -from utils import normalize_config - - -# --------------------------------------------------------------------------- -# Test fixtures. -# --------------------------------------------------------------------------- - - -@pytest.fixture -def agent_config(): - raw = { - "repo": { - "url": "https://github.com/InfiniTensor/InfiniOps.git", - "branch": "master", - }, - "github": { - "status_context_prefix": "ci/infiniops", - }, - "agents": { - "nvidia": {"url": "http://nvidia-host:8080"}, - "iluvatar": {"url": "http://iluvatar-host:8080"}, - }, - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - }, - }, - }, - "iluvatar": { - "image": { - "dockerfile": ".ci/images/iluvatar/", - "build_args": {"BASE_IMAGE": "corex:qs_pj20250825"}, - }, - "setup": "pip install .[dev]", - "jobs": { - "gpu": { - "resources": { - "ngpus": 1, - "memory": "32GB", - "shm_size": "16g", - "timeout": 3600, - }, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - }, - }, - }, - }, - } - return normalize_config(raw) - - -@pytest.fixture -def mock_resource_pool(): - pool = MagicMock(spec=res.ResourcePool) - pool.platform = "nvidia" - pool.allocate.return_value = ([0], True) - pool.release.return_value = None - pool.get_status.return_value = { - "platform": "nvidia", - "gpus": [], - "allocated_gpu_ids": [], - "system": {}, - } - return pool - - -# --------------------------------------------------------------------------- -# Tests for `resolve_job_names`. 
-# --------------------------------------------------------------------------- - - -def test_resolve_job_names_by_name(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], job="nvidia_gpu") - assert jobs == ["nvidia_gpu"] - - -def test_resolve_job_names_by_platform(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], platform="nvidia") - assert jobs == ["nvidia_gpu"] - - -def test_resolve_job_names_by_platform_iluvatar(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"], platform="iluvatar") - assert jobs == ["iluvatar_gpu"] - - -def test_resolve_job_names_all(agent_config): - jobs = run.resolve_job_names(agent_config["jobs"]) - assert set(jobs) == {"nvidia_gpu", "iluvatar_gpu"} - - -def test_resolve_job_names_invalid(agent_config): - with pytest.raises(ValueError, match="not_exist"): - run.resolve_job_names(agent_config["jobs"], job="not_exist") - - -# --------------------------------------------------------------------------- -# Tests for `verify_signature`. -# --------------------------------------------------------------------------- - - -def test_verify_signature_valid(): - secret = "my-secret" - body = b'{"action": "push"}' - sig = "sha256=" + hmac.new(secret.encode(), body, hashlib.sha256).hexdigest() - assert agent.verify_signature(secret, body, sig) is True - - -def test_verify_signature_invalid(): - assert agent.verify_signature("secret", b"body", "sha256=wrong") is False - - -def test_verify_signature_empty(): - assert agent.verify_signature("secret", b"body", "") is False - - -# --------------------------------------------------------------------------- -# Tests for `JobRequest` and `JobResult`. 
-# --------------------------------------------------------------------------- - - -def test_job_request_fields(agent_config): - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - assert req.job_name == "nvidia_gpu" - assert req.platform == "nvidia" - assert req.commit_sha == "abc123" - assert len(req.job_id) == 8 - d = req.to_dict() - assert d["job_name"] == "nvidia_gpu" - - -def test_job_result_success(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 42.5) - assert r.state == "success" - - -def test_job_result_failure(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 1, Path("/tmp/res"), 10.0) - assert r.state == "failure" - - -# --------------------------------------------------------------------------- -# Tests for the `Scheduler` class. -# --------------------------------------------------------------------------- - - -def test_scheduler_submit_and_run(agent_config, mock_resource_pool, monkeypatch): - monkeypatch.setattr("subprocess.run", lambda cmd, **kw: MagicMock(returncode=0)) - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - results_dir=Path("/tmp/test-results"), - no_status=True, - dry_run=True, - ) - req = agent.JobRequest( - "nvidia_gpu", - "master", - "abc123", - agent_config, - results_dir=Path("/tmp/test-results"), - ) - scheduler.submit(req) - results = scheduler.wait_all() - assert len(results) == 1 - assert results[0].state == "success" - - -def test_scheduler_queues_when_no_resources(agent_config, monkeypatch): - pool = MagicMock(spec=res.ResourcePool) - pool.allocate.return_value = ([], False) - pool.get_status.return_value = { - "platform": "nvidia", - "gpus": [], - "allocated_gpu_ids": [], - "system": {}, - } - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - pool, - no_status=True, - dry_run=False, - ) - - req = agent.JobRequest("nvidia_gpu", "master", "abc123", 
agent_config) - scheduler.submit(req) - - info = scheduler.get_job(req.job_id) - assert info["state"] == "queued" - - -def test_scheduler_get_status(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - - status = scheduler.get_status() - assert "queued" in status - assert "running" in status - assert "completed" in status - assert "resources" in status - - -# --------------------------------------------------------------------------- -# Tests for `WebhookHandler` push event parsing. -# --------------------------------------------------------------------------- - - -def test_webhook_parse_push(): - handler = agent.WebhookHandler.__new__(agent.WebhookHandler) - payload = {"ref": "refs/heads/feat/test", "after": "abc123def456"} - branch, sha = handler._parse_push(payload) - assert branch == "feat/test" - assert sha == "abc123def456" - - -def test_webhook_parse_pr(): - handler = agent.WebhookHandler.__new__(agent.WebhookHandler) - payload = { - "pull_request": { - "head": { - "ref": "feat/pr-branch", - "sha": "def789", - } - } - } - branch, sha = handler._parse_pull_request(payload) - assert branch == "feat/pr-branch" - assert sha == "def789" - - -# --------------------------------------------------------------------------- -# Integration-style webhook HTTP tests. 
-# --------------------------------------------------------------------------- - - -def _urlopen_no_proxy(url_or_req, **kwargs): - """`urlopen` mock that bypasses any `HTTP_PROXY`.""" - import urllib.request - - opener = urllib.request.build_opener(urllib.request.ProxyHandler({})) - return opener.open(url_or_req, **kwargs) - - -def test_health_endpoint(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - try: - resp = _urlopen_no_proxy(f"http://127.0.0.1:{port}/health", timeout=5) - data = json.loads(resp.read()) - assert data["status"] == "ok" - assert data["platform"] == "nvidia" - finally: - server.server_close() - - -def test_api_run_endpoint(agent_config, mock_resource_pool, monkeypatch): - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={"Content-Type": "application/json"}, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - assert len(data["job_ids"]) >= 1 - finally: - server.server_close() - - -def test_webhook_with_signature(agent_config, mock_resource_pool, monkeypatch): - 
monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - secret = "test-secret" - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - webhook_secret=secret, - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - payload = json.dumps( - { - "ref": "refs/heads/master", - "after": "abc123def456", - } - ).encode() - sig = "sha256=" + hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest() - - req = urllib.request.Request( - f"http://127.0.0.1:{port}/webhook", - data=payload, - headers={ - "Content-Type": "application/json", - "X-GitHub-Event": "push", - "X-Hub-Signature-256": sig, - }, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - finally: - server.server_close() - - -def test_webhook_invalid_signature(agent_config, mock_resource_pool): - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - webhook_secret="real-secret", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - payload = b'{"ref": "refs/heads/master", "after": "abc"}' - req = urllib.request.Request( - f"http://127.0.0.1:{port}/webhook", - data=payload, - headers={ - "Content-Type": "application/json", - "X-GitHub-Event": "push", - "X-Hub-Signature-256": "sha256=invalid", - }, - ) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req, timeout=5) - - assert exc_info.value.code == 401 - finally: - server.server_close() - - -# 
--------------------------------------------------------------------------- -# Tests for API token authentication. -# --------------------------------------------------------------------------- - - -def test_api_run_requires_token(agent_config, mock_resource_pool, monkeypatch): - """When `api_token` is set, `/api/run` rejects requests without a valid token.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - api_token="my-secret-token", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - body = json.dumps({"branch": "master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={"Content-Type": "application/json"}, - ) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req, timeout=5) - - assert exc_info.value.code == 401 - finally: - server.server_close() - - -def test_api_run_accepts_valid_token(agent_config, mock_resource_pool, monkeypatch): - """When `api_token` is set, `/api/run` accepts requests with a correct Bearer token.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - api_token="my-secret-token", - results_dir=Path("/tmp/test-results"), - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - body = json.dumps({"branch": 
"master", "commit_sha": "abc123"}).encode() - req = urllib.request.Request( - f"http://127.0.0.1:{port}/api/run", - data=body, - headers={ - "Content-Type": "application/json", - "Authorization": "Bearer my-secret-token", - }, - ) - - try: - resp = _urlopen_no_proxy(req, timeout=5) - data = json.loads(resp.read()) - assert data["accepted"] is True - finally: - server.server_close() - - -# --------------------------------------------------------------------------- -# Tests for queue backpressure. -# --------------------------------------------------------------------------- - - -def test_scheduler_rejects_when_queue_full(agent_config, monkeypatch): - """Scheduler raises QueueFullError when queue is at capacity.""" - pool = MagicMock(spec=res.ResourcePool) - pool.allocate.return_value = ([], False) # Never allocate → jobs stay queued. - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - pool, - no_status=True, - dry_run=False, - ) - - # Fill queue to capacity. - monkeypatch.setattr(agent, "MAX_QUEUE_SIZE", 3) - - for _ in range(3): - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - scheduler.submit(req) - - # Next submit should fail. - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - - with pytest.raises(agent.QueueFullError): - scheduler.submit(req) - - -# --------------------------------------------------------------------------- -# Tests for `poll_remote_job` error logging. 
-# --------------------------------------------------------------------------- - - -def test_poll_remote_job_logs_errors(monkeypatch, capsys): - """`poll_remote_job` warns on first failure instead of silently swallowing.""" - call_count = 0 - - def fake_urlopen(req, **kwargs): - nonlocal call_count - call_count += 1 - raise ConnectionError("connection refused") - - monkeypatch.setattr(agent, "urllib_urlopen", fake_urlopen) - monkeypatch.setattr(agent, "urllib_request", lambda url: url) - - result = agent.poll_remote_job( - "http://fake:8080", "job1", interval=0.01, timeout=0.05 - ) - assert result is None - - captured = capsys.readouterr() - assert "connection refused" in captured.err - assert "warning:" in captured.err - - -# --------------------------------------------------------------------------- -# Tests for `JobResult` `log_file` field. -# --------------------------------------------------------------------------- - - -def test_job_result_includes_log_file(): - r = agent.JobResult( - "id1", - "nvidia_gpu", - "abc", - 1, - Path("/tmp/res"), - 10.0, - error_tail=["error"], - log_file=Path("/tmp/res/job.log"), - ) - d = r.to_dict() - assert d["log_file"] == "/tmp/res/job.log" - - -def test_job_result_omits_log_file_when_none(): - r = agent.JobResult("id1", "nvidia_gpu", "abc", 0, Path("/tmp/res"), 5.0) - d = r.to_dict() - assert "log_file" not in d - - -# --------------------------------------------------------------------------- -# Tests for `/api/job/{id}/log` endpoint. -# --------------------------------------------------------------------------- - - -def test_job_log_endpoint(agent_config, mock_resource_pool, monkeypatch, tmp_path): - """`GET /api/job/{id}/log` returns the full log file content.""" - monkeypatch.setattr("agent.gh.post_commit_status", lambda *a, **kw: True) - - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - dry_run=True, - ) - - # Manually inject a completed job with a log file. 
- log_file = tmp_path / "job.log" - log_file.write_text("line 1\nline 2\nline 3\n") - - req = agent.JobRequest("nvidia_gpu", "master", "abc123", agent_config) - result = agent.JobResult( - req.job_id, - "nvidia_gpu", - "abc123", - 0, - tmp_path, - 1.0, - log_file=log_file, - ) - - with scheduler._lock: - scheduler._jobs[req.job_id] = { - "request": req, - "result": result, - "state": "success", - "gpu_ids": [], - } - - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.request - - url = f"http://127.0.0.1:{port}/api/job/{req.job_id}/log" - req_http = urllib.request.Request(url) - - try: - resp = _urlopen_no_proxy(req_http, timeout=5) - body = resp.read().decode("utf-8") - assert "line 1" in body - assert "line 2" in body - assert "line 3" in body - assert resp.headers["Content-Type"] == "text/plain; charset=utf-8" - finally: - server.server_close() - - -def test_job_log_endpoint_not_found(agent_config, mock_resource_pool): - """`GET /api/job/{id}/log` returns 404 for unknown job.""" - scheduler = agent.Scheduler( - agent_config, - "nvidia", - mock_resource_pool, - no_status=True, - ) - - server = agent.AgentServer( - "127.0.0.1", - 0, - agent_config, - scheduler, - "nvidia", - ) - port = server.server_address[1] - - t = threading.Thread(target=server.handle_request, daemon=True) - t.start() - - import urllib.error - import urllib.request - - url = f"http://127.0.0.1:{port}/api/job/nonexist/log" - req_http = urllib.request.Request(url) - - try: - with pytest.raises(urllib.error.HTTPError) as exc_info: - _urlopen_no_proxy(req_http, timeout=5) - - assert exc_info.value.code == 404 - finally: - server.server_close() diff --git a/.ci/tests/test_build.py b/.ci/tests/test_build.py deleted file mode 100644 index df25d606..00000000 --- a/.ci/tests/test_build.py +++ /dev/null @@ -1,207 +0,0 @@ -from 
unittest.mock import MagicMock - -import build - - -# --------------------------------------------------------------------------- -# Tests for `build_image_tag`. -# --------------------------------------------------------------------------- - - -def test_build_image_tag_with_registry(): - tag = build.build_image_tag("localhost:5000", "infiniops", "nvidia", "latest") - assert tag == "localhost:5000/infiniops/nvidia:latest" - - -def test_build_image_tag_without_registry(): - tag = build.build_image_tag("", "infiniops", "nvidia", "abc1234") - assert tag == "infiniops-ci/nvidia:abc1234" - - -def test_build_image_tag_commit_hash(): - tag = build.build_image_tag( - "registry.example.com:5000", "proj", "ascend", "deadbeef" - ) - assert tag == "registry.example.com:5000/proj/ascend:deadbeef" - - -# --------------------------------------------------------------------------- -# Tests for `has_dockerfile_changed`. -# --------------------------------------------------------------------------- - - -def test_has_dockerfile_changed_true_when_stdout_nonempty(monkeypatch): - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=0, stdout="Dockerfile\n"), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is True - - -def test_has_dockerfile_changed_false_when_stdout_empty(monkeypatch): - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=0, stdout=""), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is False - - -def test_has_dockerfile_changed_true_on_git_error(monkeypatch): - # Shallow clone or initial commit: `git diff` returns non-zero. - monkeypatch.setattr( - "subprocess.run", - lambda *a, **kw: MagicMock(returncode=128, stdout=""), - ) - assert build.has_dockerfile_changed(".ci/images/nvidia/") is True - - -# --------------------------------------------------------------------------- -# Tests for `docker_login`. 
-# --------------------------------------------------------------------------- - - -def test_docker_login_no_credentials_env(monkeypatch): - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - result = build.docker_login({"url": "localhost:5000"}, dry_run=False) - assert result is True - assert not called - - -def test_docker_login_token_not_set(monkeypatch): - monkeypatch.delenv("REGISTRY_TOKEN", raising=False) - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=False) - assert result is False - assert not called - - -def test_docker_login_dry_run_does_not_call_subprocess(monkeypatch): - monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=True) - assert result is True - assert not called - - -def test_docker_login_success(monkeypatch): - monkeypatch.setenv("REGISTRY_TOKEN", "mytoken") - captured = {} - - def mock_run(cmd, **kwargs): - captured["cmd"] = cmd - return MagicMock(returncode=0) - - monkeypatch.setattr("subprocess.run", mock_run) - cfg = {"url": "localhost:5000", "credentials_env": "REGISTRY_TOKEN"} - result = build.docker_login(cfg, dry_run=False) - assert result is True - assert "docker" in captured["cmd"] - assert "login" in captured["cmd"] - - -# --------------------------------------------------------------------------- -# Tests for `build_image` dry-run mode and proxy forwarding. 
-# --------------------------------------------------------------------------- - - -def _platform_cfg(): - return { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "nvcr.io/nvidia/pytorch:24.10-py3"}, - } - - -def _registry_cfg(): - return {"url": "localhost:5000", "project": "infiniops"} - - -def test_build_image_dry_run_no_subprocess(monkeypatch, capsys): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - called = [] - monkeypatch.setattr("subprocess.run", lambda *a, **kw: called.append(1)) - build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=True, - logged_in=True, - ) - assert not called - captured = capsys.readouterr() - assert "[dry-run]" in captured.out - - -def test_build_image_dry_run_output_contains_image_tag(monkeypatch, capsys): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - monkeypatch.setattr("subprocess.run", lambda *a, **kw: MagicMock(returncode=0)) - build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=True, - logged_in=True, - ) - captured = capsys.readouterr() - assert "abc1234" in captured.out - - -def test_build_image_proxy_in_build_args(monkeypatch): - monkeypatch.setenv("HTTP_PROXY", "http://proxy.test:3128") - captured = {} - - def mock_run(cmd, **kwargs): - captured["cmd"] = cmd - return MagicMock(returncode=0) - - monkeypatch.setattr("subprocess.run", mock_run) - build.build_image( - "nvidia", - _platform_cfg(), - 
_registry_cfg(), - "abc1234", - push=False, - dry_run=False, - logged_in=True, - ) - joined = " ".join(captured["cmd"]) - assert "HTTP_PROXY=http://proxy.test:3128" in joined - assert "http_proxy=http://proxy.test:3128" in joined - - -def test_build_image_returns_false_on_docker_error(monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - monkeypatch.setattr("subprocess.run", lambda *a, **kw: MagicMock(returncode=1)) - result = build.build_image( - "nvidia", - _platform_cfg(), - _registry_cfg(), - "abc1234", - push=False, - dry_run=False, - logged_in=True, - ) - assert result is False diff --git a/.ci/tests/test_github_status.py b/.ci/tests/test_github_status.py deleted file mode 100644 index 9e29c792..00000000 --- a/.ci/tests/test_github_status.py +++ /dev/null @@ -1,145 +0,0 @@ -import json -from unittest.mock import MagicMock - - -import github_status as gh - - -# --------------------------------------------------------------------------- -# Tests for `parse_repo_url`. 
-# --------------------------------------------------------------------------- - - -def test_parse_repo_url_https(): - owner, repo = gh.parse_repo_url("https://github.com/InfiniTensor/InfiniOps.git") - assert owner == "InfiniTensor" - assert repo == "InfiniOps" - - -def test_parse_repo_url_https_no_git(): - owner, repo = gh.parse_repo_url("https://github.com/Owner/Repo") - assert owner == "Owner" - assert repo == "Repo" - - -def test_parse_repo_url_ssh(): - owner, repo = gh.parse_repo_url("git@github.com:Owner/Repo.git") - assert owner == "Owner" - assert repo == "Repo" - - -def test_parse_repo_url_invalid(): - owner, repo = gh.parse_repo_url("not-a-url") - assert owner == "" - assert repo == "" - - -# --------------------------------------------------------------------------- -# Tests for `build_status_context`. -# --------------------------------------------------------------------------- - - -def test_build_status_context(): - ctx = gh.build_status_context("ci/infiniops", "nvidia_gpu") - assert ctx == "ci/infiniops/nvidia_gpu" - - -# --------------------------------------------------------------------------- -# Tests for `post_commit_status`. 
-# --------------------------------------------------------------------------- - - -def test_post_status_no_token(monkeypatch): - monkeypatch.delenv("GITHUB_TOKEN", raising=False) - result = gh.post_commit_status("owner", "repo", "abc123", "success", "ctx", "desc") - assert result is False - - -def test_post_status_missing_owner(): - result = gh.post_commit_status( - "", "repo", "abc123", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_success(monkeypatch): - mock_response = MagicMock() - mock_response.status = 201 - mock_response.__enter__ = MagicMock(return_value=mock_response) - mock_response.__exit__ = MagicMock(return_value=False) - - captured_req = {} - - def mock_urlopen(req, **kwargs): - captured_req["url"] = req.full_url - captured_req["data"] = json.loads(req.data) - captured_req["headers"] = dict(req.headers) - return mock_response - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "InfiniTensor", - "InfiniOps", - "abc123def", - "success", - "ci/infiniops/nvidia_gpu", - "Tests passed", - token="ghp_test_token", - ) - - assert result is True - assert "abc123def" in captured_req["url"] - assert captured_req["data"]["state"] == "success" - assert captured_req["data"]["context"] == "ci/infiniops/nvidia_gpu" - assert "ghp_test_token" in captured_req["headers"]["Authorization"] - - -def test_post_status_http_error(monkeypatch): - import urllib.error - - def mock_urlopen(req, **kwargs): - raise urllib.error.HTTPError( - url="", code=422, msg="Unprocessable", hdrs=None, fp=None - ) - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "owner", "repo", "sha", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_url_error(monkeypatch): - import urllib.error - - def mock_urlopen(req, **kwargs): - raise urllib.error.URLError("connection refused") - - 
monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - result = gh.post_commit_status( - "owner", "repo", "sha", "success", "ctx", "desc", token="tok" - ) - assert result is False - - -def test_post_status_truncates_description(monkeypatch): - mock_response = MagicMock() - mock_response.status = 201 - mock_response.__enter__ = MagicMock(return_value=mock_response) - mock_response.__exit__ = MagicMock(return_value=False) - - captured = {} - - def mock_urlopen(req, **kwargs): - captured["data"] = json.loads(req.data) - return mock_response - - monkeypatch.setattr("urllib.request.urlopen", mock_urlopen) - - long_desc = "x" * 200 - gh.post_commit_status("o", "r", "sha", "success", "ctx", long_desc, token="tok") - - assert len(captured["data"]["description"]) == 140 diff --git a/.ci/tests/test_resource.py b/.ci/tests/test_resource.py deleted file mode 100644 index a7ba8f87..00000000 --- a/.ci/tests/test_resource.py +++ /dev/null @@ -1,434 +0,0 @@ -import threading - - -import ci_resource as res - - -# --------------------------------------------------------------------------- -# Tests for `GpuInfo` and `SystemResources`. -# --------------------------------------------------------------------------- - - -def test_gpu_info_fields(): - g = res.GpuInfo( - index=0, memory_used_mb=1000, memory_total_mb=8000, utilization_pct=50 - ) - assert g.index == 0 - assert g.memory_total_mb == 8000 - - -def test_system_resources_fields(): - s = res.SystemResources( - total_memory_mb=32000, available_memory_mb=16000, cpu_count=8 - ) - assert s.cpu_count == 8 - - -# --------------------------------------------------------------------------- -# Tests for `detect_gpus`. 
-# --------------------------------------------------------------------------- - - -def test_detect_gpus_nvidia_parses_csv(monkeypatch): - csv_output = "0, 512, 8192, 5\n1, 1024, 8192, 80\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - gpus = pool.detect_gpus() - assert len(gpus) == 2 - assert gpus[0].index == 0 - assert gpus[0].memory_used_mb == 512 - assert gpus[0].utilization_pct == 5 - assert gpus[1].index == 1 - assert gpus[1].utilization_pct == 80 - - -def test_detect_gpus_empty_on_failure(monkeypatch): - def mock_run(cmd, **kwargs): - class R: - returncode = 1 - stdout = "" - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - assert pool.detect_gpus() == [] - - -def test_detect_gpus_unknown_platform(): - pool = res.ResourcePool("unknown_platform") - assert pool.detect_gpus() == [] - - -def test_detect_gpus_file_not_found(monkeypatch): - def mock_run(cmd, **kwargs): - raise FileNotFoundError("nvidia-smi not found") - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - assert pool.detect_gpus() == [] - - -# --------------------------------------------------------------------------- -# Tests for `detect_system_resources`. 
-# --------------------------------------------------------------------------- - - -def test_detect_system_resources(monkeypatch, tmp_path): - meminfo = tmp_path / "meminfo" - meminfo.write_text( - "MemTotal: 32000000 kB\n" - "MemFree: 10000000 kB\n" - "MemAvailable: 20000000 kB\n" - ) - - _real_open = open - - def fake_open(path, **kw): - if str(path) == "/proc/meminfo": - return _real_open(str(meminfo), **kw) - return _real_open(path, **kw) - - monkeypatch.setattr("builtins.open", fake_open) - - pool = res.ResourcePool("nvidia") - sys_res = pool.detect_system_resources() - assert abs(sys_res.total_memory_mb - 32000000 / 1024) < 1 - assert abs(sys_res.available_memory_mb - 20000000 / 1024) < 1 - assert sys_res.cpu_count > 0 - - -# --------------------------------------------------------------------------- -# Tests for `allocate` picking least-loaded GPUs. -# --------------------------------------------------------------------------- - - -def test_allocate_picks_least_loaded(monkeypatch): - csv_output = "0, 100, 8192, 8\n1, 200, 8192, 2\n2, 300, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(1) - assert ok is True - assert gpu_ids == [1] # GPU 1 has lowest utilization (2%). - - -def test_allocate_picks_two_least_loaded(monkeypatch): - csv_output = "0, 100, 8192, 8\n1, 200, 8192, 2\n2, 300, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is True - assert gpu_ids == [1, 2] # Sorted by utilization: 2% then 5%. 
- - -def test_allocate_skips_busy_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 4000, 8192, 95\n2, 200, 8192, 8\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is True - assert set(gpu_ids) == {0, 2} - assert 1 not in gpu_ids # GPU 1 at 95% is above threshold - - -# --------------------------------------------------------------------------- -# Tests for `allocate` and `release`. -# --------------------------------------------------------------------------- - - -def test_allocate_success(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(1) - assert ok is True - assert len(gpu_ids) == 1 - assert gpu_ids[0] in (0, 1) - - -def test_allocate_insufficient_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(3) - assert ok is False - assert gpu_ids == [] - - -def test_allocate_zero_gpus(): - pool = res.ResourcePool("unknown") - gpu_ids, ok = pool.allocate(0) - assert ok is True - assert gpu_ids == [] - - -def test_release_frees_gpus(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids, ok = pool.allocate(2) - assert ok is 
True - assert len(gpu_ids) == 2 - - # All GPUs allocated; next allocation should fail. - _, ok2 = pool.allocate(1) - assert ok2 is False - - # Release one GPU. - pool.release([gpu_ids[0]]) - gpu_ids2, ok3 = pool.allocate(1) - assert ok3 is True - assert gpu_ids2 == [gpu_ids[0]] - - -def test_allocate_excludes_allocated(monkeypatch): - csv_output = "0, 100, 8192, 5\n1, 200, 8192, 3\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=10) - gpu_ids1, _ = pool.allocate(1) - gpu_ids2, _ = pool.allocate(1) - - assert gpu_ids1 != gpu_ids2 - assert set(gpu_ids1 + gpu_ids2) == {0, 1} - - -def test_thread_safety(monkeypatch): - csv_output = "0, 0, 8192, 0\n1, 0, 8192, 0\n2, 0, 8192, 0\n3, 0, 8192, 0\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia", utilization_threshold=50) - allocated_all = [] - lock = threading.Lock() - - def allocate_one(): - ids, ok = pool.allocate(1) - - if ok: - with lock: - allocated_all.extend(ids) - - threads = [threading.Thread(target=allocate_one) for _ in range(4)] - - for t in threads: - t.start() - - for t in threads: - t.join() - - assert len(allocated_all) == 4 - assert len(set(allocated_all)) == 4 - - -# --------------------------------------------------------------------------- -# Tests for `get_status`. 
-# --------------------------------------------------------------------------- - - -def test_get_status(monkeypatch): - csv_output = "0, 512, 8192, 5\n" - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = csv_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("nvidia") - status = pool.get_status() - assert status["platform"] == "nvidia" - assert len(status["gpus"]) == 1 - assert "system" in status - - -# --------------------------------------------------------------------------- -# Tests for `parse_gpu_requirement` and `parse_memory_requirement`. -# --------------------------------------------------------------------------- - - -def test_parse_gpu_requirement_auto_default(): - """`gpu_ids` omitted (defaults to `auto`) with `ngpus=1`.""" - job = {"resources": {"ngpus": 1}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_auto_explicit(): - """`gpu_ids=auto` with `ngpus=2`.""" - job = {"resources": {"gpu_ids": "auto", "ngpus": 2}} - assert res.parse_gpu_requirement(job) == 2 - - -def test_parse_gpu_requirement_auto_no_ngpus(): - """`gpu_ids=auto` without `ngpus` defaults to 1.""" - job = {"resources": {"gpu_ids": "auto"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_auto_implicit_no_ngpus(): - """No `gpu_ids` and no `ngpus` defaults to 1.""" - job = {"resources": {}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_static_pinning(): - """Static `gpu_ids` counts explicit device IDs.""" - job = {"resources": {"gpu_ids": "0,1"}} - assert res.parse_gpu_requirement(job) == 2 - - -def test_parse_gpu_requirement_static_single(): - job = {"resources": {"gpu_ids": "0"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_parse_gpu_requirement_all(): - job = {"resources": {"gpu_ids": "all"}} - assert res.parse_gpu_requirement(job) == 0 - - -def test_parse_gpu_requirement_ngpus_mismatch_warns(capsys): - 
"""Warn when static `gpu_ids` count differs from `ngpus`.""" - job = {"resources": {"gpu_ids": "0,1", "ngpus": 3}} - assert res.parse_gpu_requirement(job) == 2 - - captured = capsys.readouterr() - assert "warning:" in captured.err - assert "ngpus=3" in captured.err - - -def test_parse_gpu_requirement_ignores_unknown_keys(): - """Unknown keys in resources do not affect GPU counting.""" - job = {"resources": {"gpu_ids": "0", "extra_key": "value"}} - assert res.parse_gpu_requirement(job) == 1 - - -def test_detect_gpus_ascend_hbm_parsing(monkeypatch): - """`npu-smi` row 2 has DDR (0/0) and HBM (2789/32768); we want HBM.""" - npu_output = ( - "+---------------------------+---------------+-------------------------------+\n" - "| 0 910B4 | OK | 86.5 41 |\n" - "| 0 | 0000:c1:00.0 | 5 0 / 0 2789 / 32768 |\n" - "+---------------------------+---------------+-------------------------------+\n" - ) - - def mock_run(cmd, **kwargs): - class R: - returncode = 0 - stdout = npu_output - - return R() - - monkeypatch.setattr("subprocess.run", mock_run) - - pool = res.ResourcePool("ascend") - gpus = pool.detect_gpus() - assert len(gpus) == 1 - assert gpus[0].index == 0 - assert gpus[0].utilization_pct == 5.0 - assert gpus[0].memory_used_mb == 2789.0 - assert gpus[0].memory_total_mb == 32768.0 - - -def test_parse_memory_requirement_gb(): - assert res.parse_memory_requirement({"resources": {"memory": "32GB"}}) == 32 * 1024 - - -def test_parse_memory_requirement_mb(): - assert res.parse_memory_requirement({"resources": {"memory": "512MB"}}) == 512 - - -def test_parse_memory_requirement_empty(): - assert res.parse_memory_requirement({"resources": {}}) == 0 - - -def test_parse_memory_requirement_invalid_warns(capsys): - result = res.parse_memory_requirement({"resources": {"memory": "abc xyz"}}) - assert result == 0 - - captured = capsys.readouterr() - assert "warning:" in captured.err - assert "abc xyz" in captured.err diff --git a/.ci/tests/test_run.py b/.ci/tests/test_run.py deleted file 
mode 100644 index 844d941d..00000000 --- a/.ci/tests/test_run.py +++ /dev/null @@ -1,450 +0,0 @@ -from pathlib import Path - -import pytest - -import run - - -# --------------------------------------------------------------------------- -# Tests for `resolve_image`. -# --------------------------------------------------------------------------- - - -def test_resolve_image_with_registry(): - cfg = {"registry": {"url": "localhost:5000", "project": "infiniops"}} - img = run.resolve_image(cfg, "nvidia", "latest") - assert img == "localhost:5000/infiniops/nvidia:latest" - - -def test_resolve_image_without_registry(minimal_config): - img = run.resolve_image(minimal_config, "nvidia", "abc1234") - assert img == "infiniops-ci/nvidia:abc1234" - - -# --------------------------------------------------------------------------- -# Tests for `build_runner_script`. -# --------------------------------------------------------------------------- - - -def test_runner_script_contains_git_clone(): - script = run.build_runner_script() - assert "git clone" in script - - -def test_runner_script_contains_setup_cmd(): - script = run.build_runner_script() - assert "SETUP_CMD" in script - - -def test_runner_script_exits_on_failure(): - script = run.build_runner_script() - assert "exit $rc" in script - - -def test_runner_script_creates_results_dir(): - script = run.build_runner_script() - assert "mkdir -p /workspace/results" in script - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` basic structure. 
-# --------------------------------------------------------------------------- - - -def test_docker_args_basic_structure(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert args[0] == "docker" - assert "run" in args - assert "--rm" in args - - -def test_docker_args_correct_image(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "infiniops-ci/nvidia:latest" in args - - -def test_docker_args_image_tag_override(minimal_config): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - "abc1234", - ) - assert "infiniops-ci/nvidia:abc1234" in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` proxy passthrough. 
-# --------------------------------------------------------------------------- - - -def test_docker_args_proxy_present_when_set(minimal_config, monkeypatch): - monkeypatch.setenv("HTTP_PROXY", "http://proxy.example.com:8080") - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "-e" in args - assert "HTTP_PROXY=http://proxy.example.com:8080" in args - assert "http_proxy=http://proxy.example.com:8080" in args - - -def test_docker_args_proxy_absent_when_not_set(minimal_config, monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.delenv("http_proxy", raising=False) - monkeypatch.delenv("HTTPS_PROXY", raising=False) - monkeypatch.delenv("https_proxy", raising=False) - monkeypatch.delenv("NO_PROXY", raising=False) - monkeypatch.delenv("no_proxy", raising=False) - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - - for arg in args: - assert not arg.startswith("HTTP_PROXY=") - assert not arg.startswith("http_proxy=") - assert not arg.startswith("HTTPS_PROXY=") - assert not arg.startswith("https_proxy=") - assert not arg.startswith("NO_PROXY=") - assert not arg.startswith("no_proxy=") - - -def test_docker_args_proxy_lowercase_fallback(minimal_config, monkeypatch): - monkeypatch.delenv("HTTP_PROXY", raising=False) - monkeypatch.setenv("http_proxy", "http://lowercase.proxy:3128") - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - ) - assert "HTTP_PROXY=http://lowercase.proxy:3128" in args - assert "http_proxy=http://lowercase.proxy:3128" in args - - -# 
--------------------------------------------------------------------------- -# Tests for `build_docker_args` GPU flags. -# --------------------------------------------------------------------------- - - -def _make_args(config, gpu_id_override=None): - return run.build_docker_args( - config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - gpu_id_override=gpu_id_override, - ) - - -def test_docker_args_gpu_auto_no_override(minimal_config): - """`gpu_ids=auto` (default) without override produces no `--gpus` flag.""" - args = _make_args(minimal_config) - assert "--gpus" not in args - - -def test_docker_args_gpu_auto_with_override(minimal_config): - """`gpu_ids=auto` with allocator override sets `--gpus device=...`.""" - args = _make_args(minimal_config, gpu_id_override="2") - idx = args.index("--gpus") - assert args[idx + 1] == "device=2" - - -def test_docker_args_gpu_static(minimal_config): - """Static `gpu_ids` pins to specific devices.""" - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "0" - args = _make_args(minimal_config) - idx = args.index("--gpus") - assert args[idx + 1] == "device=0" - - -def test_docker_args_gpu_all(minimal_config): - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "all" - args = _make_args(minimal_config) - idx = args.index("--gpus") - assert args[idx + 1] == "all" - - -def test_docker_args_gpu_override_trumps_static(minimal_config): - """CLI `gpu_id_override` takes precedence over static `gpu_ids`.""" - minimal_config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] = "0" - args = _make_args(minimal_config, gpu_id_override="2,3") - idx = args.index("--gpus") - assert args[idx + 1] == "device=2,3" - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` platform-specific device env vars. 
-# --------------------------------------------------------------------------- - - -def _make_platform_config(platform, job_suffix="gpu"): - """Build a minimal normalized config for a given platform.""" - from utils import normalize_config - - raw = { - "platforms": { - platform: { - "image": {"dockerfile": f".ci/images/{platform}/"}, - "setup": "pip install .[dev]", - "jobs": { - job_suffix: { - "resources": {"ngpus": 1, "memory": "32GB"}, - "stages": [{"name": "test", "run": "pytest tests/ -v"}], - } - }, - } - } - } - - return normalize_config(raw) - - -def _make_platform_args(platform, job_suffix="gpu", gpu_id_override=None): - config = _make_platform_config(platform, job_suffix) - job_name = f"{platform}_{job_suffix}" - - return run.build_docker_args( - config, - job_name, - "https://github.com/example/repo.git", - "master", - config["jobs"][job_name]["stages"], - "/workspace", - None, - gpu_id_override=gpu_id_override, - ) - - -def test_docker_args_moore_mthreads_visible_devices(): - """Moore uses `MTHREADS_VISIBLE_DEVICES`, not `CUDA_VISIBLE_DEVICES`.""" - args = _make_platform_args("moore", gpu_id_override="0") - assert "MTHREADS_VISIBLE_DEVICES=0" in args - assert all("CUDA_VISIBLE_DEVICES" not in a for a in args) - - -def test_docker_args_iluvatar_cuda_visible_devices(): - args = _make_platform_args("iluvatar", gpu_id_override="1,2") - assert "CUDA_VISIBLE_DEVICES=1,2" in args - - -def test_docker_args_cambricon_mlu_visible_devices(): - args = _make_platform_args("cambricon", gpu_id_override="0") - assert "MLU_VISIBLE_DEVICES=0" in args - - -def test_docker_args_ascend_visible_devices(): - args = _make_platform_args("ascend", job_suffix="npu", gpu_id_override="0") - assert "ASCEND_VISIBLE_DEVICES=0" in args - - -def test_docker_args_metax_cuda_visible_devices(): - args = _make_platform_args("metax", gpu_id_override="0,1") - assert "CUDA_VISIBLE_DEVICES=0,1" in args - - -def test_docker_args_non_nvidia_no_gpus_flag(): - """Non-NVIDIA platforms should never 
use `--gpus` Docker flag.""" - for platform in ("iluvatar", "metax", "moore", "cambricon"): - args = _make_platform_args(platform, gpu_id_override="0") - assert "--gpus" not in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` memory format. -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "raw,expected", - [ - ("32GB", "32g"), - ("512MB", "512m"), - ("8", "8g"), - ("16gb", "16g"), - ("256mb", "256m"), - ], -) -def test_docker_args_memory_format(minimal_config, raw, expected): - minimal_config["jobs"]["nvidia_gpu"]["resources"]["memory"] = raw - args = _make_args(minimal_config) - idx = args.index("--memory") - assert args[idx + 1] == expected - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` stages encoding. -# --------------------------------------------------------------------------- - - -def test_docker_args_num_stages(minimal_config): - args = _make_args(minimal_config) - assert "NUM_STAGES=1" in args - - -def test_docker_args_stage_name_cmd(minimal_config): - args = _make_args(minimal_config) - assert "STAGE_1_NAME=test" in args - assert any(a.startswith("STAGE_1_CMD=") for a in args) - - -def test_docker_args_multiple_stages(minimal_config): - minimal_config["jobs"]["nvidia_gpu"]["stages"] = [ - {"name": "lint", "run": "ruff check ."}, - {"name": "test", "run": "pytest tests/"}, - ] - args = _make_args(minimal_config) - assert "NUM_STAGES=2" in args - assert "STAGE_1_NAME=lint" in args - assert "STAGE_2_NAME=test" in args - - -# --------------------------------------------------------------------------- -# Tests for `build_docker_args` `results_dir` mount. 
-# --------------------------------------------------------------------------- - - -def test_docker_args_results_dir(minimal_config, tmp_path): - args = run.build_docker_args( - minimal_config, - "nvidia_gpu", - "https://github.com/example/repo.git", - "master", - minimal_config["jobs"]["nvidia_gpu"]["stages"], - "/workspace", - None, - results_dir=tmp_path, - ) - joined = " ".join(str(a) for a in args) - assert "-v" in args - assert "/workspace/results" in joined - - -# --------------------------------------------------------------------------- -# Tests for `build_results_dir`. -# --------------------------------------------------------------------------- - - -def test_build_results_dir_contains_platform(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "nvidia" in d.name - - -def test_build_results_dir_contains_commit(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "abc1234" in d.name - - -def test_build_results_dir_contains_stage_names(): - stages = [{"name": "lint", "run": "ruff"}, {"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert "lint+test" in d.name - - -def test_build_results_dir_under_base(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("/tmp/my-results", "ascend", stages, "def5678") - assert d.parent == Path("/tmp/my-results") - - -# --------------------------------------------------------------------------- -# Tests for `apply_test_override`. 
-# --------------------------------------------------------------------------- - - -def test_apply_test_override_replaces_test_path(): - result = run.apply_test_override("pytest tests/ -v", "tests/test_add.py") - assert result == "pytest tests/test_add.py -v" - - -def test_apply_test_override_preserves_flags(): - result = run.apply_test_override( - "pytest tests/ -n 4 -v --tb=short", "tests/test_gemm.py" - ) - assert "tests/test_gemm.py" in result - assert "-n 4" in result - assert "-v" in result - assert "--tb=short" in result - assert "tests/" not in result.split("tests/test_gemm.py")[0] - - -def test_apply_test_override_non_pytest_passthrough(): - """Non-pytest commands are returned unchanged.""" - assert run.apply_test_override("ruff check .", "tests/foo.py") == "ruff check ." - - -def test_apply_test_override_empty_passthrough(): - assert run.apply_test_override("", "tests/foo.py") == "" - - -# --------------------------------------------------------------------------- -# Tests for runner script fail-fast behavior. -# --------------------------------------------------------------------------- - - -def test_runner_script_breaks_on_failure(): - script = run.build_runner_script() - assert "break" in script - - -def test_runner_script_preserves_exit_code(): - script = run.build_runner_script() - assert "rc=$?" in script - - -# --------------------------------------------------------------------------- -# Tests for `build_results_dir` uniqueness and sanitization. 
-# --------------------------------------------------------------------------- - - -def test_build_results_dir_unique(): - stages = [{"name": "test", "run": "pytest"}] - d1 = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - d2 = run.build_results_dir("ci-results", "nvidia", stages, "abc1234") - assert d1 != d2 - - -def test_build_results_dir_sanitizes_commit(): - stages = [{"name": "test", "run": "pytest"}] - d = run.build_results_dir("ci-results", "nvidia", stages, "../../etc/passwd") - # Path separators are stripped; the result stays under the base directory. - assert "/" not in d.name - assert d.parent == Path("ci-results") diff --git a/.ci/tests/test_utils.py b/.ci/tests/test_utils.py deleted file mode 100644 index b8fa6d60..00000000 --- a/.ci/tests/test_utils.py +++ /dev/null @@ -1,108 +0,0 @@ -from utils import get_git_commit, normalize_config - - -def test_normalize_creates_flat_jobs(): - raw = { - "repo": {"url": "https://github.com/org/repo.git"}, - "platforms": { - "nvidia": { - "image": {"dockerfile": ".ci/images/nvidia/"}, - "setup": "pip install .", - "docker_args": ["--gpus", "all"], - "jobs": { - "gpu": { - "resources": {"gpu_ids": "0"}, - "stages": [{"name": "test", "run": "pytest"}], - }, - "multi_gpu": { - "resources": {"gpu_ids": "0,1"}, - "stages": [{"name": "test", "run": "pytest"}], - }, - }, - }, - }, - } - config = normalize_config(raw) - - assert "nvidia_gpu" in config["jobs"] - assert "nvidia_multi_gpu" in config["jobs"] - assert config["jobs"]["nvidia_gpu"]["platform"] == "nvidia" - assert config["jobs"]["nvidia_gpu"]["setup"] == "pip install ." 
- assert config["jobs"]["nvidia_gpu"]["docker_args"] == ["--gpus", "all"] - assert config["jobs"]["nvidia_gpu"]["resources"]["gpu_ids"] == "0" - assert config["jobs"]["nvidia_multi_gpu"]["resources"]["gpu_ids"] == "0,1" - - -def test_normalize_extracts_images(): - raw = { - "platforms": { - "nvidia": { - "image": { - "dockerfile": ".ci/images/nvidia/", - "build_args": {"BASE_IMAGE": "pytorch:latest"}, - }, - "jobs": {}, - }, - }, - } - config = normalize_config(raw) - assert config["images"]["nvidia"]["dockerfile"] == ".ci/images/nvidia/" - assert config["images"]["nvidia"]["build_args"]["BASE_IMAGE"] == "pytorch:latest" - - -def test_normalize_job_overrides_platform_defaults(): - raw = { - "platforms": { - "nvidia": { - "setup": "default setup", - "jobs": { - "special": { - "setup": "custom setup", - "stages": [], - }, - }, - }, - }, - } - config = normalize_config(raw) - assert config["jobs"]["nvidia_special"]["setup"] == "custom setup" - - -def test_normalize_preserves_top_level_keys(): - raw = { - "repo": {"url": "https://github.com/org/repo.git"}, - "github": {"status_context_prefix": "ci/test"}, - "agents": {"nvidia": {"url": "http://host:8080"}}, - "platforms": {}, - } - config = normalize_config(raw) - assert config["repo"]["url"] == "https://github.com/org/repo.git" - assert config["github"]["status_context_prefix"] == "ci/test" - assert config["agents"]["nvidia"]["url"] == "http://host:8080" - - -def test_normalize_passthrough_flat_config(): - """Old flat format without `platforms` key is returned as-is.""" - flat = { - "images": {"nvidia": {}}, - "jobs": {"nvidia_gpu": {"platform": "nvidia"}}, - } - assert normalize_config(flat) is flat - - -# --------------------------------------------------------------------------- -# Tests for `get_git_commit`. 
-# --------------------------------------------------------------------------- - - -def test_get_git_commit_warns_on_failure(monkeypatch, capsys): - from unittest.mock import MagicMock - - monkeypatch.setattr( - "subprocess.run", lambda *a, **kw: MagicMock(returncode=128, stdout="") - ) - result = get_git_commit() - assert result == "unknown" - - captured = capsys.readouterr() - assert "warning:" in captured.err diff --git a/.ci/utils.py b/.ci/utils.py deleted file mode 100644 index 2a3d36fb..00000000 --- a/.ci/utils.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 -"""Shared utilities for the CI toolchain.""" - -import subprocess -import sys - -try: - import yaml -except ImportError: - print( - "error: pyyaml is required. Install with: pip install pyyaml", file=sys.stderr - ) - sys.exit(1) - - -def normalize_config(raw): - """Convert platform-centric config to flat images/jobs format. - - Input (new format): - platforms: - nvidia: - image: {dockerfile: ..., build_args: ...} - setup: pip install .[dev] - jobs: - gpu: {resources: ..., stages: ...} - - Output (flat format consumed by run.py / build.py / agent.py): - images: - nvidia: {dockerfile: ..., build_args: ...} - jobs: - nvidia_gpu: {platform: nvidia, setup: ..., resources: ..., stages: ...} - - If the config already uses the flat format (no 'platforms' key), returns as-is. 
- """ - if "platforms" not in raw: - return raw - - config = {} - - for key in ("repo", "github", "agents"): - if key in raw: - config[key] = raw[key] - - config["images"] = {} - config["jobs"] = {} - - for platform, pcfg in raw.get("platforms", {}).items(): - # Image config - if "image" in pcfg: - config["images"][platform] = pcfg["image"] - - # Platform-level defaults inherited by jobs - defaults = {} - - for key in ("image_tag", "docker_args", "volumes", "setup", "env"): - if key in pcfg: - defaults[key] = pcfg[key] - - # Flatten jobs: {platform}_{job_name} - for job_name, job_cfg in pcfg.get("jobs", {}).items(): - full_name = f"{platform}_{job_name}" - flat = { - "platform": platform, - "image": defaults.get("image_tag", "latest"), - } - - # Apply platform defaults - for key in ("docker_args", "volumes", "setup", "env"): - if key in defaults: - flat[key] = defaults[key] - - # Job-level overrides - flat.update(job_cfg) - - config["jobs"][full_name] = flat - - # Warn on mismatched agent/platform keys (catches typos like 'nvdia'). - agent_keys = set(config.get("agents", {}).keys()) - platform_keys = set(raw.get("platforms", {}).keys()) - - for key in agent_keys - platform_keys: - print( - f"warning: agents.{key} has no matching platform in platforms.*", - file=sys.stderr, - ) - - return config - - -def load_config(path): - """Load a YAML config file and normalize to flat format.""" - with open(path, encoding="utf-8") as f: - raw = yaml.safe_load(f) - - return normalize_config(raw) - - -def get_git_commit(ref="HEAD", short=True): - """Get git commit SHA. 
Returns 'unknown' on failure.""" - cmd = ["git", "rev-parse"] - - if short: - cmd.append("--short") - - cmd.append(ref) - result = subprocess.run(cmd, capture_output=True, text=True) - - if result.returncode != 0: - print( - f"warning: git rev-parse failed for {ref!r}, using 'unknown'", - file=sys.stderr, - ) - - return "unknown" - - return result.stdout.strip() diff --git a/.ci/config.yaml b/.github/ci_config.yml similarity index 69% rename from .ci/config.yaml rename to .github/ci_config.yml index ea6a0d48..3f76c16b 100644 --- a/.ci/config.yaml +++ b/.github/ci_config.yml @@ -5,45 +5,29 @@ repo: github: status_context_prefix: "ci/infiniops" -# Uncomment and replace the URLs below with actual host IPs to dispatch jobs to remote -# machines via `agent.py run`. Required on the trigger machine when each platform's -# agent runs on a separate host. See the README for multi-machine deployment details. -# agents: -# nvidia: -# url: http://nvidia-host:8080 -# iluvatar: -# url: http://iluvatar-host:8080 -# metax: -# url: http://metax-host:8080 -# moore: -# url: http://moore-host:8080 -# cambricon: -# url: http://cambricon-host:8080 - platforms: nvidia: image: - dockerfile: .ci/images/nvidia/ + dockerfile: images/nvidia/ build_args: BASE_IMAGE: nvcr.io/nvidia/pytorch:25.12-py3 setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: - ngpus: 1 # Scheduler auto-picks this many free GPUs. - gpu_ids: auto # `auto`: dynamic allocation; or pin with `"0"`, `"0,2"`, `"all"`. + ngpus: 1 # Auto allocator picks this many free GPUs memory: 32GB - shm_size: 16g # Prevent PyTorch default 64MB shared memory limit. + shm_size: 16g # Prevent PyTorch default 64MB shared memory limit timeout: 3600 # env: # Uncomment to inject extra env vars into the container. 
# MY_VAR: value stages: - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml - + run: pytest tests/ --devices nvidia -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml iluvatar: image: - dockerfile: .ci/images/iluvatar/ + dockerfile: images/iluvatar/ build_args: BASE_IMAGE: corex:qs_pj20250825 APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -61,19 +45,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none memory: 32GB shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/ -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices iluvatar -n 8 -v --tb=short --junitxml=/workspace/results/test-results.xml metax: image: - dockerfile: .ci/images/metax/ + dockerfile: images/metax/ build_args: BASE_IMAGE: cr.metax-tech.com/public-library/maca-pytorch:3.2.1.4-torch2.4-py310-ubuntu22.04-amd64 APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -85,19 +70,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none # MetaX: passthrough via --privileged, CUDA_VISIBLE_DEVICES controls visibility memory: 32GB shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/ -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices metax -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml moore: image: - dockerfile: .ci/images/moore/ + dockerfile: images/moore/ build_args: BASE_IMAGE: sh-harbor.mthreads.com/mcctest/vllm_musa:20251112_hygon APT_MIRROR: http://archive.ubuntu.com/ubuntu @@ -107,19 +93,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none # Moore: passthrough via --privileged, MTHREADS_VISIBLE_DEVICES controls visibility memory: 32GB 
shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/test_add.py tests/test_gemm.py tests/test_swiglu.py --devices moore -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml cambricon: image: - dockerfile: .ci/images/cambricon/ + dockerfile: images/cambricon/ build_args: BASE_IMAGE: cambricon/pytorch:v1.25.3-torch2.1-anolisos8.8-py310 PIP_INDEX_URL: https://pypi.org/simple @@ -128,19 +115,20 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: gpu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: mlu # Cambricon: passthrough via --privileged, MLU_VISIBLE_DEVICES for device control memory: 32GB shm_size: 16g timeout: 3600 stages: - name: test - run: pytest tests/test_gemm.py -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/test_gemm.py --devices cambricon -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml - ascend: # TODO: Ascend image is not ready yet. 
+ ascend: image: - dockerfile: .ci/images/ascend/ + dockerfile: images/ascend/ build_args: BASE_IMAGE: quay.io/ascend/vllm-ascend:v0.18.0rc1-openeuler PIP_INDEX_URL: https://pypi.org/simple @@ -160,9 +148,10 @@ platforms: setup: pip install .[dev] --no-build-isolation jobs: npu: + type: unittest resources: ngpus: 1 - gpu_ids: auto + gpu_style: none memory: 32GB shm_size: 16g timeout: 3600 diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml new file mode 100644 index 00000000..651cc689 --- /dev/null +++ b/.github/workflows/ci_test.yml @@ -0,0 +1,15 @@ +name: CI + +on: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +jobs: + ci: + uses: InfiniTensor/ci/.github/workflows/infiniops-ci.yml@codex/prune-unused-ci-artifacts + with: + config_path: .github/ci_config.yml + ci_ref: codex/prune-unused-ci-artifacts + secrets: inherit diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..be99e8a8 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule ".ci"] + path = .ci + url = https://github.com/InfiniTensor/ci.git diff --git a/tests/conftest.py b/tests/conftest.py index d995459f..564047a1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -92,7 +92,8 @@ def skip_unsupported_dtypes(request): # PyTorch device type → InfiniOps platform names. A single torch device type # can map to several platforms (e.g., `cuda` is shared by `nvidia`, `metax`, -# and `iluvatar`); at most one is actually available at runtime. +# and `iluvatar`). CI passes concrete platforms through `--devices`; without +# that explicit platform, tests keep the historical broad mapping. 
_TORCH_DEVICE_TO_PLATFORMS = { "cuda": ("nvidia", "metax", "iluvatar"), "mlu": ("cambricon",), @@ -119,7 +120,7 @@ def skip_op_without_platform_impl(request): if "implementation_index" in params: return - platforms = _TORCH_DEVICE_TO_PLATFORMS.get(params.get("device")) + platforms = _active_platforms_for_torch_device(request.config, params.get("device")) if not platforms: return @@ -151,6 +152,24 @@ def _set_random_seed(seed): } +def _active_platforms_for_torch_device(config, torch_device): + """Return platform names selected for a torch device type.""" + if not torch_device: + return () + + cli_devices = config.getoption("--devices") or () + requested_platforms = tuple( + name + for name in cli_devices + if _PLATFORM_TO_TORCH_DEVICE.get(name) == torch_device + ) + + if requested_platforms: + return requested_platforms + + return _TORCH_DEVICE_TO_PLATFORMS.get(torch_device, ()) + + def _resolve_device(name): """Map a platform name (e.g., `ascend`) to a PyTorch device type (e.g., `npu`).""" return _PLATFORM_TO_TORCH_DEVICE.get(name, name)