From 87573b150ab04ea063c6f2ea8ab5b88fdc3d67c3 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 11 May 2026 07:28:16 -0700 Subject: [PATCH 1/6] gw: add admin API token authentication The admin server has historically relied on network isolation alone: `construct()` does no caller checks, so anyone reachable on the admin port (default `0.0.0.0:8001` inside the deployment container) can call any admin RPC or load the dashboard. Combined with the dashboard's visibility into cluster state, this is an obvious hardening target. Add an opt-in shared-secret check, modelled on KMS `ensure_admin` but using the plain token in config (no hash) so the value can be injected via dstack encrypted env without round-tripping a SHA-256 step. - `AdminConfig.admin_token` (empty = no auth, with a startup WARN so existing deployments keep their current behaviour). - New `admin_auth::AdminAuthFairing` attached to the admin Rocket instance. Accepts the token via `X-Admin-Token`, `Authorization: Bearer <token>`, or `?token=...` (for the browser dashboard). Constant-time compare via `subtle::ConstantTimeEq`. Rejected requests are rewritten to a sentinel URI that returns HTTP 401, so all currently-mounted routes (prpc + dashboard) are covered without per-route guards. - Thread `ADMIN_TOKEN` through `dstack-app` entrypoint + compose so encrypted-env values land in `gateway.toml`. - Update `bootstrap-cluster.sh` to send `X-Admin-Token` when `ADMIN_TOKEN` is set. 
--- Cargo.lock | 1 + Cargo.toml | 1 + gateway/Cargo.toml | 1 + gateway/dstack-app/bootstrap-cluster.sh | 20 +- gateway/dstack-app/builder/entrypoint.sh | 2 + gateway/dstack-app/docker-compose.yaml | 1 + gateway/gateway.toml | 4 + gateway/src/admin_auth.rs | 326 +++++++++++++++++++++++ gateway/src/config.rs | 5 + gateway/src/main.rs | 11 + 10 files changed, 366 insertions(+), 6 deletions(-) create mode 100644 gateway/src/admin_auth.rs diff --git a/Cargo.lock b/Cargo.lock index 45f54db2..409112aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2321,6 +2321,7 @@ dependencies = [ "sha2 0.10.9", "shared_child", "smallvec", + "subtle", "tdx-attest", "tempfile", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 01c20ecd..c712629e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -192,6 +192,7 @@ rustls-webpki = "0.103.10" schnorrkel = "0.11.4" sha2 = { version = "0.10.8", default-features = false } sha3 = "0.10.8" +subtle = "2" blake2 = "0.10.6" tokio-rustls = { version = "0.26.2", features = ["ring"] } x25519-dalek = { version = "2.0.1", features = ["static_secrets"] } diff --git a/gateway/Cargo.toml b/gateway/Cargo.toml index a1aefe17..af522ec4 100644 --- a/gateway/Cargo.toml +++ b/gateway/Cargo.toml @@ -62,6 +62,7 @@ uuid = { workspace = true, features = ["v4"] } rmp-serde.workspace = true or-panic.workspace = true base64.workspace = true +subtle.workspace = true [target.'cfg(unix)'.dependencies] nix = { workspace = true, features = ["resource"] } diff --git a/gateway/dstack-app/bootstrap-cluster.sh b/gateway/dstack-app/bootstrap-cluster.sh index 06bae6da..a6ec2387 100755 --- a/gateway/dstack-app/bootstrap-cluster.sh +++ b/gateway/dstack-app/bootstrap-cluster.sh @@ -14,17 +14,25 @@ # Load .env if present if [ -f ".env" ]; then set -a + # shellcheck source=/dev/null source .env set +a fi ADMIN_ADDR="${1:-${GATEWAY_ADMIN_RPC_ADDR:-127.0.0.1:9203}}" +# When admin auth is enabled, ADMIN_TOKEN must be set so curl can present +# X-Admin-Token. 
Empty token = legacy unauthenticated mode. +AUTH_HEADER=() +if [ -n "$ADMIN_TOKEN" ]; then + AUTH_HEADER=(-H "X-Admin-Token: $ADMIN_TOKEN") +fi + echo "Waiting for gateway admin API at $ADMIN_ADDR..." max_retries=60 retry=0 while [ $retry -lt $max_retries ]; do - if curl -sf "http://$ADMIN_ADDR/prpc/Status" >/dev/null 2>&1; then + if curl -sf "${AUTH_HEADER[@]}" "http://$ADMIN_ADDR/prpc/Status" >/dev/null 2>&1; then break fi retry=$((retry + 1)) @@ -47,19 +55,19 @@ else fi echo "Setting certbot config (ACME URL: $ACME_URL)..." -curl -sf -X POST "http://$ADMIN_ADDR/prpc/SetCertbotConfig" \ +curl -sf -X POST "${AUTH_HEADER[@]}" "http://$ADMIN_ADDR/prpc/SetCertbotConfig" \ -H "Content-Type: application/json" \ -d '{"acme_url":"'"$ACME_URL"'","renew_interval_secs":3600,"renew_before_expiration_secs":864000,"renew_timeout_secs":300}' >/dev/null \ && echo " Certbot config set" || echo " WARN: failed to set certbot config" # Create DNS credential if CF_API_TOKEN is provided and no credentials exist yet if [ -n "$CF_API_TOKEN" ]; then - existing=$(curl -sf "http://$ADMIN_ADDR/prpc/ListDnsCredentials" 2>/dev/null) + existing=$(curl -sf "${AUTH_HEADER[@]}" "http://$ADMIN_ADDR/prpc/ListDnsCredentials" 2>/dev/null) cred_count=$(echo "$existing" | jq -r '.credentials | length' 2>/dev/null || echo "0") if [ "$cred_count" = "0" ]; then echo "Creating default DNS credential..." 
- curl -sf -X POST "http://$ADMIN_ADDR/prpc/CreateDnsCredential" \ + curl -sf -X POST "${AUTH_HEADER[@]}" "http://$ADMIN_ADDR/prpc/CreateDnsCredential" \ -H "Content-Type: application/json" \ -d '{"name":"cloudflare","provider_type":"cloudflare","cf_api_token":"'"$CF_API_TOKEN"'","set_as_default":true}' >/dev/null \ && echo " DNS credential created" || echo " WARN: failed to create DNS credential" @@ -72,12 +80,12 @@ fi # Add ZT-Domain if SRV_DOMAIN is provided and domain doesn't exist yet if [ -n "$SRV_DOMAIN" ]; then - existing=$(curl -sf "http://$ADMIN_ADDR/prpc/ListZtDomains" 2>/dev/null) + existing=$(curl -sf "${AUTH_HEADER[@]}" "http://$ADMIN_ADDR/prpc/ListZtDomains" 2>/dev/null) has_domain=$(echo "$existing" | jq -r '.domains[]? | select(.domain=="'"$SRV_DOMAIN"'") | .domain' 2>/dev/null) if [ -z "$has_domain" ]; then echo "Adding ZT-Domain: $SRV_DOMAIN..." - curl -sf -X POST "http://$ADMIN_ADDR/prpc/AddZtDomain" \ + curl -sf -X POST "${AUTH_HEADER[@]}" "http://$ADMIN_ADDR/prpc/AddZtDomain" \ -H "Content-Type: application/json" \ -d '{"domain":"'"$SRV_DOMAIN"'","port":443,"priority":100}' >/dev/null \ && echo " ZT-Domain added" || echo " WARN: failed to add ZT-Domain" diff --git a/gateway/dstack-app/builder/entrypoint.sh b/gateway/dstack-app/builder/entrypoint.sh index 39182d10..5bf71a38 100755 --- a/gateway/dstack-app/builder/entrypoint.sh +++ b/gateway/dstack-app/builder/entrypoint.sh @@ -36,6 +36,7 @@ validate_env "$NODE_ID" validate_env "$WG_IP" validate_env "$WG_RESERVED_NET" validate_env "$WG_CLIENT_RANGE" +validate_env "$ADMIN_TOKEN" # Validate $NODE_ID, must be a number if [[ ! 
"$NODE_ID" =~ ^[0-9]+$ ]]; then @@ -89,6 +90,7 @@ sync_connections_interval = "${SYNC_CONNECTIONS_INTERVAL:-30s}" enabled = true address = "${ADMIN_LISTEN_ADDR:-0.0.0.0}" port = ${ADMIN_LISTEN_PORT:-8001} +admin_token = "${ADMIN_TOKEN}" [core.wg] public_key = "$PUBLIC_KEY" diff --git a/gateway/dstack-app/docker-compose.yaml b/gateway/dstack-app/docker-compose.yaml index 7ec32af3..869e0080 100644 --- a/gateway/dstack-app/docker-compose.yaml +++ b/gateway/dstack-app/docker-compose.yaml @@ -41,6 +41,7 @@ services: - TIMEOUT_TOTAL=${TIMEOUT_TOTAL:-5h} - ADMIN_LISTEN_ADDR=${ADMIN_LISTEN_ADDR:-0.0.0.0} - ADMIN_LISTEN_PORT=${ADMIN_LISTEN_PORT:-8001} + - ADMIN_TOKEN=${ADMIN_TOKEN:-} - INBOUND_PP_ENABLED=${INBOUND_PP_ENABLED:-false} - TIMEOUT_PP_HEADER=${TIMEOUT_PP_HEADER:-5s} - PORT_POLICY_FETCH_TIMEOUT=${PORT_POLICY_FETCH_TIMEOUT:-10s} diff --git a/gateway/gateway.toml b/gateway/gateway.toml index 6d4151e0..0e569650 100644 --- a/gateway/gateway.toml +++ b/gateway/gateway.toml @@ -24,6 +24,10 @@ timeout = "5s" [core.admin] enabled = false address = "127.0.0.1:8011" +# Shared secret required by every admin endpoint (RPC + dashboard) when +# non-empty. Clients send it via the `X-Admin-Token` header (or `?token=...` +# for the dashboard / browser links). Empty disables auth. +admin_token = "" [core.debug] insecure_enable_debug_rpc = false diff --git a/gateway/src/admin_auth.rs b/gateway/src/admin_auth.rs new file mode 100644 index 00000000..1c1444ac --- /dev/null +++ b/gateway/src/admin_auth.rs @@ -0,0 +1,326 @@ +// SPDX-FileCopyrightText: © 2025 Phala Network +// +// SPDX-License-Identifier: Apache-2.0 + +//! Admin server authentication. +//! +//! Attaches to the admin Rocket instance and rejects requests that do not +//! present the configured shared secret. The token is accepted via, in order: +//! 1. `X-Admin-Token` header (any method) +//! 2. `Authorization: Bearer ` header (any method) +//! 3. `?token=` query parameter (GET only, for dashboard links) +//! +//! 
For (3), the `token` query parameter is stripped from the request URI after +//! successful validation so it doesn't propagate to access logs, downstream +//! handlers, or the Referer header. +//! +//! Rejected requests are forwarded to a sentinel route that returns HTTP 401, +//! so all admin routes (prpc-generated and dashboard) are protected by a single +//! attachment without modifying the route declarations. + +use rocket::{ + fairing::{Fairing, Info, Kind}, + http::{uri::Origin, Method, Status}, + Data, Request, Route, +}; +use subtle::ConstantTimeEq; + +const UNAUTH_URI: &str = "/__admin_unauthorized"; +const HEADER_NAME: &str = "X-Admin-Token"; +const QUERY_PARAM: &str = "token"; + +pub struct AdminAuthFairing { + /// `None` means auth is disabled (empty config); any request is allowed. + token: Option, +} + +impl AdminAuthFairing { + pub fn new(token: String) -> Self { + Self { + token: (!token.is_empty()).then_some(token), + } + } + + fn extract_token(req: &Request<'_>) -> Option { + if let Some(t) = req.headers().get_one(HEADER_NAME) { + return Some(t.to_string()); + } + if let Some(auth) = req.headers().get_one("Authorization") { + if let Some(t) = auth.strip_prefix("Bearer ") { + return Some(t.to_string()); + } + } + // Query token is intended for browser links to the dashboard, so only + // accept it on GET to avoid leaking via mutating request URIs. + if req.method() == Method::Get { + for field in req.query_fields() { + if field.name.key_lossy().as_str() == QUERY_PARAM { + return Some(field.value.to_string()); + } + } + } + None + } +} + +/// Rebuild the request URI without the `token` query parameter, if present. +/// Returns `None` when there is nothing to strip. 
+fn strip_token_query(uri: &Origin<'_>) -> Option> { + let query = uri.query()?.as_str(); + let mut kept = Vec::new(); + let mut found = false; + for pair in query.split('&') { + let key = pair.split('=').next().unwrap_or(""); + if key == QUERY_PARAM { + found = true; + } else if !pair.is_empty() { + kept.push(pair); + } + } + if !found { + return None; + } + let path = uri.path().as_str(); + let new_uri = if kept.is_empty() { + path.to_string() + } else { + format!("{}?{}", path, kept.join("&")) + }; + Origin::parse_owned(new_uri).ok() +} + +#[rocket::async_trait] +impl Fairing for AdminAuthFairing { + fn info(&self) -> Info { + Info { + name: "admin auth", + kind: Kind::Request, + } + } + + async fn on_request(&self, req: &mut Request<'_>, _: &mut Data<'_>) { + let Some(expected) = self.token.as_deref() else { + return; + }; + // Avoid infinite re-routing if the fairing fires on the sentinel itself. + if req.uri().path() == UNAUTH_URI { + return; + } + let provided = Self::extract_token(req).unwrap_or_default(); + let matches: bool = provided.as_bytes().ct_eq(expected.as_bytes()).into(); + if !matches { + if let Ok(origin) = Origin::parse_owned(UNAUTH_URI.to_string()) { + req.set_uri(origin); + } + return; + } + // Authorized — strip ?token=... so it doesn't propagate to logs/handlers. + if let Some(stripped) = strip_token_query(req.uri()) { + req.set_uri(stripped); + } + } +} + +// Sentinel 401 handlers for every HTTP method Rocket can dispatch. We have to +// enumerate them because Rocket doesn't support a method-agnostic route. 
+ +#[rocket::get("/__admin_unauthorized")] +fn unauth_get() -> Status { + Status::Unauthorized +} + +#[rocket::post("/__admin_unauthorized", data = "<_data>")] +fn unauth_post(_data: Data<'_>) -> Status { + Status::Unauthorized +} + +#[rocket::put("/__admin_unauthorized", data = "<_data>")] +fn unauth_put(_data: Data<'_>) -> Status { + Status::Unauthorized +} + +#[rocket::patch("/__admin_unauthorized", data = "<_data>")] +fn unauth_patch(_data: Data<'_>) -> Status { + Status::Unauthorized +} + +#[rocket::delete("/__admin_unauthorized")] +fn unauth_delete() -> Status { + Status::Unauthorized +} + +#[rocket::options("/__admin_unauthorized")] +fn unauth_options() -> Status { + Status::Unauthorized +} + +#[rocket::head("/__admin_unauthorized")] +fn unauth_head() -> Status { + Status::Unauthorized +} + +pub fn routes() -> Vec { + rocket::routes![ + unauth_get, + unauth_post, + unauth_put, + unauth_patch, + unauth_delete, + unauth_options, + unauth_head, + ] +} + +#[cfg(test)] +mod tests { + use super::*; + use rocket::http::{ContentType, Header, Status}; + use rocket::local::asynchronous::Client; + + #[rocket::get("/protected")] + fn protected_get() -> &'static str { + "ok" + } + + #[rocket::post("/protected", data = "<_data>")] + fn protected_post(_data: Data<'_>) -> &'static str { + "ok" + } + + #[rocket::get("/echo?&")] + fn echo(token: Option<&str>, other: Option<&str>) -> String { + format!( + "token={} other={}", + token.unwrap_or(""), + other.unwrap_or("") + ) + } + + async fn make_client(token: &str) -> Client { + let r = rocket::build() + .attach(AdminAuthFairing::new(token.to_string())) + .mount("/", routes()) + .mount("/", rocket::routes![protected_get, protected_post, echo]); + Client::tracked(r).await.unwrap() + } + + #[rocket::async_test] + async fn empty_token_disables_auth() { + let client = make_client("").await; + let resp = client.get("/protected").dispatch().await; + assert_eq!(resp.status(), Status::Ok); + let resp = 
client.post("/protected").dispatch().await; + assert_eq!(resp.status(), Status::Ok); + } + + #[rocket::async_test] + async fn missing_token_returns_401() { + let client = make_client("s3cret").await; + let resp = client.get("/protected").dispatch().await; + assert_eq!(resp.status(), Status::Unauthorized); + let resp = client.post("/protected").dispatch().await; + assert_eq!(resp.status(), Status::Unauthorized); + } + + #[rocket::async_test] + async fn header_token_accepted() { + let client = make_client("s3cret").await; + let resp = client + .get("/protected") + .header(Header::new(HEADER_NAME, "s3cret")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Ok); + let resp = client + .post("/protected") + .header(ContentType::JSON) + .header(Header::new(HEADER_NAME, "s3cret")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Ok); + } + + #[rocket::async_test] + async fn bearer_token_accepted() { + let client = make_client("s3cret").await; + let resp = client + .get("/protected") + .header(Header::new("Authorization", "Bearer s3cret")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Ok); + } + + #[rocket::async_test] + async fn wrong_token_rejected() { + let client = make_client("s3cret").await; + let resp = client + .get("/protected") + .header(Header::new(HEADER_NAME, "wrong")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Unauthorized); + } + + #[rocket::async_test] + async fn header_takes_precedence_over_query() { + let client = make_client("s3cret").await; + // Wrong query token but correct header → authorized. 
+ let resp = client + .get("/protected?token=wrong") + .header(Header::new(HEADER_NAME, "s3cret")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Ok); + } + + #[rocket::async_test] + async fn query_token_only_accepted_on_get() { + let client = make_client("s3cret").await; + // GET with ?token= → allowed + let resp = client.get("/protected?token=s3cret").dispatch().await; + assert_eq!(resp.status(), Status::Ok); + // POST with ?token= → rejected (query auth not honored on mutating methods) + let resp = client.post("/protected?token=s3cret").dispatch().await; + assert_eq!(resp.status(), Status::Unauthorized); + } + + #[rocket::async_test] + async fn query_token_stripped_after_auth() { + let client = make_client("s3cret").await; + // Token is stripped → handler sees no `token` param, only `other`. + let resp = client.get("/echo?token=s3cret&other=keep").dispatch().await; + assert_eq!(resp.status(), Status::Ok); + let body = resp.into_string().await.unwrap(); + assert_eq!(body, "token= other=keep"); + } + + #[rocket::async_test] + async fn query_token_stripped_when_authed_via_header() { + let client = make_client("s3cret").await; + let resp = client + .get("/echo?token=anything&other=keep") + .header(Header::new(HEADER_NAME, "s3cret")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Ok); + let body = resp.into_string().await.unwrap(); + assert_eq!(body, "token= other=keep"); + } + + #[rocket::async_test] + async fn unauth_returns_401_on_all_methods() { + let client = make_client("s3cret").await; + // PUT / DELETE / PATCH / OPTIONS to a protected URI with no token + // should be rewritten to the sentinel and return 401, not 404. 
+ for m in [Method::Put, Method::Delete, Method::Patch, Method::Options] { + let resp = client.req(m, "/protected").dispatch().await; + assert_eq!( + resp.status(), + Status::Unauthorized, + "method {m:?} expected 401, got {}", + resp.status() + ); + } + } +} diff --git a/gateway/src/config.rs b/gateway/src/config.rs index 9f57f984..e7b7b5dd 100644 --- a/gateway/src/config.rs +++ b/gateway/src/config.rs @@ -286,6 +286,11 @@ impl Config { #[derive(Debug, Clone, Deserialize)] pub struct AdminConfig { pub enabled: bool, + /// Shared secret required to call any admin endpoint (RPC + dashboard). + /// Empty disables authentication; operators are warned at startup so the + /// historical "network-isolation only" deployments keep working. + #[serde(default)] + pub admin_token: String, } #[derive(Debug, Clone, Deserialize)] diff --git a/gateway/src/main.rs b/gateway/src/main.rs index 1349d966..ae138d20 100644 --- a/gateway/src/main.rs +++ b/gateway/src/main.rs @@ -24,6 +24,7 @@ use main_service::{Proxy, ProxyOptions, RpcHandler}; use crate::debug_service::DebugRpcHandler; +mod admin_auth; mod admin_service; mod cert_store; mod config; @@ -276,6 +277,7 @@ async fn main() -> Result<()> { let proxy_config = config.proxy.clone(); let pccs_url = config.pccs_url.clone(); let admin_enabled = config.admin.enabled; + let admin_token = config.admin.admin_token.clone(); let debug_config = config.debug.clone(); let state = Proxy::new(ProxyOptions { config, @@ -321,7 +323,16 @@ async fn main() -> Result<()> { let debug_state = state; let admin_srv = async move { if admin_enabled { + if admin_token.is_empty() { + tracing::warn!( + "admin server enabled without admin_token; admin API is exposed without authentication" + ); + } else { + tracing::info!("admin server authentication enabled"); + } rocket::custom(admin_figment) + .attach(admin_auth::AdminAuthFairing::new(admin_token)) + .mount("/", admin_auth::routes()) .mount("/", web_routes::routes()) .mount("/", prpc!(Proxy, 
AdminRpcHandler, trim: "Admin.")) .mount("/prpc", prpc!(Proxy, AdminRpcHandler, trim: "Admin.")) From 84babfa729f8ce1352ac6955bfa0a384af7eba09 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 11 May 2026 08:20:49 -0700 Subject: [PATCH 2/6] gw: harden admin auth defaults and unify ADMIN_API_TOKEN naming Address review-style improvements lifted from PR #675: - Fail-by-default. When `admin.enabled = true`, gateway now refuses to start unless either an `admin_token` is configured (via `core.admin.admin_token`, `DSTACK_GATEWAY_ADMIN_TOKEN`, or `ADMIN_API_TOKEN`) or `insecure_no_auth = true` is set explicitly. Replaces the previous "empty token = open access + WARN" policy. e2e configs opt into `insecure_no_auth = true`. - Env-var fallback in Rust. `AdminAuthFairing::from_config` resolves the token from config first, then `DSTACK_GATEWAY_ADMIN_TOKEN`, then `ADMIN_API_TOKEN`. Operators running the binary directly no longer have to template the TOML. - SHA-256 in-memory storage. The plaintext token is hashed at startup; only the 32-byte digest is retained. Verification SHA-256s the request token and constant-time compares the digests via `subtle::ConstantTimeEq`. - dstack-app deployment chain renamed `ADMIN_TOKEN` -> `ADMIN_API_TOKEN` end-to-end (deploy-to-vmm.sh auto-generates it into .env, docker-compose forwards it, entrypoint.sh injects it into `gateway.toml`, bootstrap-cluster.sh requires it and sends `Authorization: Bearer`). - Docs in `gateway/docs/cluster-deployment.md` now show the bearer-auth pattern in every admin-curl example. Tests: 4 new cases in `admin_auth::tests` cover the from_config policy (insecure flag, config path, env fallbacks, error message). All 13 `admin_auth` tests pass; clippy with `-D clippy::expect_used -D clippy::unwrap_used` is clean. 
--- gateway/docs/cluster-deployment.md | 32 +++-- gateway/dstack-app/bootstrap-cluster.sh | 11 +- gateway/dstack-app/builder/entrypoint.sh | 9 +- gateway/dstack-app/deploy-to-vmm.sh | 7 + gateway/dstack-app/docker-compose.yaml | 2 +- gateway/gateway.toml | 11 +- gateway/src/admin_auth.rs | 135 +++++++++++++++++++- gateway/src/config.rs | 8 +- gateway/src/main.rs | 16 ++- gateway/test-run/e2e/configs/gateway-1.toml | 1 + gateway/test-run/e2e/configs/gateway-2.toml | 1 + gateway/test-run/e2e/configs/gateway-3.toml | 1 + 12 files changed, 199 insertions(+), 35 deletions(-) diff --git a/gateway/docs/cluster-deployment.md b/gateway/docs/cluster-deployment.md index 9441e625..191d4c0e 100644 --- a/gateway/docs/cluster-deployment.md +++ b/gateway/docs/cluster-deployment.md @@ -289,12 +289,18 @@ Important: ### 2.7 Verify Cluster Sync +The admin API requires a bearer token (see `core.admin.admin_token` in `gateway.toml`, +or the `ADMIN_API_TOKEN` env injected by `deploy-to-vmm.sh`). Export it once: + ```bash +export ADMIN_API_TOKEN=... # value from .env or gateway.toml +ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_API_TOKEN") + # Check sync status on any node (replace port with your admin port) -curl -s http://localhost:9016/prpc/WaveKvStatus | jq . +curl -s "${ADMIN_AUTH[@]}" http://localhost:9016/prpc/WaveKvStatus | jq . # List known cluster nodes -curl -s http://localhost:9016/prpc/Status | jq '.nodes' +curl -s "${ADMIN_AUTH[@]}" http://localhost:9016/prpc/Status | jq '.nodes' ``` A healthy cluster sync shows: @@ -567,7 +573,8 @@ $CLI info Check that the gateway sees the new app: ```bash -curl -s http://localhost:/prpc/Status | jq '.hosts' +curl -s -H "Authorization: Bearer $ADMIN_API_TOKEN" \ + http://localhost:/prpc/Status | jq '.hosts' ``` Expected output should include an entry with the app's `instance_id` and an assigned WireGuard IP: @@ -619,8 +626,11 @@ Gateway supports automatic TLS certificate management via the ACME protocol. 
Con ### 6.1 Configure ACME Service ```bash +ADMIN_AUTH=(-H "Authorization: Bearer $ADMIN_API_TOKEN") + # Set ACME URL (Let's Encrypt production) -curl -X POST "http://localhost:9016/prpc/SetCertbotConfig" \ +curl -X POST "${ADMIN_AUTH[@]}" \ + "http://localhost:9016/prpc/SetCertbotConfig" \ -H "Content-Type: application/json" \ -d '{"acme_url": "https://acme-v02.api.letsencrypt.org/directory"}' @@ -637,7 +647,8 @@ The Cloudflare API token needs the **DNS:Edit** permission on the target zone. C Cloudflare example: ```bash -curl -X POST "http://localhost:9016/prpc/CreateDnsCredential" \ +curl -X POST "${ADMIN_AUTH[@]}" \ + "http://localhost:9016/prpc/CreateDnsCredential" \ -H "Content-Type: application/json" \ -d '{ "name": "cloudflare-prod", @@ -669,7 +680,8 @@ Parameter description: Basic usage (using default DNS credential): ```bash -curl -X POST "http://localhost:9016/prpc/AddZtDomain" \ +curl -X POST "${ADMIN_AUTH[@]}" \ + "http://localhost:9016/prpc/AddZtDomain" \ -H "Content-Type: application/json" \ -d '{"domain": "example.com", "port": 443}' ``` @@ -677,7 +689,8 @@ curl -X POST "http://localhost:9016/prpc/AddZtDomain" \ Specifying DNS credential and node binding: ```bash -curl -X POST "http://localhost:9016/prpc/AddZtDomain" \ +curl -X POST "${ADMIN_AUTH[@]}" \ + "http://localhost:9016/prpc/AddZtDomain" \ -H "Content-Type: application/json" \ -d '{ "domain": "internal.example.com", @@ -711,7 +724,8 @@ Note: After adding a domain, the certificate is not issued immediately. Gateway ### 6.4 Manually Trigger Certificate Renewal ```bash -curl -X POST "http://localhost:9016/prpc/RenewZtDomainCert" \ +curl -X POST "${ADMIN_AUTH[@]}" \ + "http://localhost:9016/prpc/RenewZtDomainCert" \ -H "Content-Type: application/json" \ -d '{"domain": "example.com", "force": true}' ``` @@ -719,7 +733,7 @@ curl -X POST "http://localhost:9016/prpc/RenewZtDomainCert" \ ### 6.5 Check Certificate Status ```bash -curl -s http://localhost:9016/prpc/ListZtDomains | jq . 
+curl -s "${ADMIN_AUTH[@]}" http://localhost:9016/prpc/ListZtDomains | jq . ``` A healthy certificate shows `has_cert: true` and `loaded_in_memory: true`: diff --git a/gateway/dstack-app/bootstrap-cluster.sh b/gateway/dstack-app/bootstrap-cluster.sh index a6ec2387..f1ad6b48 100755 --- a/gateway/dstack-app/bootstrap-cluster.sh +++ b/gateway/dstack-app/bootstrap-cluster.sh @@ -21,12 +21,13 @@ fi ADMIN_ADDR="${1:-${GATEWAY_ADMIN_RPC_ADDR:-127.0.0.1:9203}}" -# When admin auth is enabled, ADMIN_TOKEN must be set so curl can present -# X-Admin-Token. Empty token = legacy unauthenticated mode. -AUTH_HEADER=() -if [ -n "$ADMIN_TOKEN" ]; then - AUTH_HEADER=(-H "X-Admin-Token: $ADMIN_TOKEN") +# bootstrap-cluster.sh authenticates to the admin API as an operator. The token +# is generated by deploy-to-vmm.sh and persisted in .env. +if [ -z "${ADMIN_API_TOKEN:-}" ]; then + echo "ERROR: ADMIN_API_TOKEN must be set (check .env)" >&2 + exit 1 fi +AUTH_HEADER=(-H "Authorization: Bearer $ADMIN_API_TOKEN") echo "Waiting for gateway admin API at $ADMIN_ADDR..." max_retries=60 diff --git a/gateway/dstack-app/builder/entrypoint.sh b/gateway/dstack-app/builder/entrypoint.sh index 5bf71a38..1a3811ad 100755 --- a/gateway/dstack-app/builder/entrypoint.sh +++ b/gateway/dstack-app/builder/entrypoint.sh @@ -36,7 +36,12 @@ validate_env "$NODE_ID" validate_env "$WG_IP" validate_env "$WG_RESERVED_NET" validate_env "$WG_CLIENT_RANGE" -validate_env "$ADMIN_TOKEN" +validate_env "$ADMIN_API_TOKEN" + +if [ -z "$ADMIN_API_TOKEN" ]; then + echo "ADMIN_API_TOKEN must be set when admin API is enabled" + exit 1 +fi # Validate $NODE_ID, must be a number if [[ ! 
"$NODE_ID" =~ ^[0-9]+$ ]]; then @@ -90,7 +95,7 @@ sync_connections_interval = "${SYNC_CONNECTIONS_INTERVAL:-30s}" enabled = true address = "${ADMIN_LISTEN_ADDR:-0.0.0.0}" port = ${ADMIN_LISTEN_PORT:-8001} -admin_token = "${ADMIN_TOKEN}" +admin_token = "${ADMIN_API_TOKEN}" [core.wg] public_key = "$PUBLIC_KEY" diff --git a/gateway/dstack-app/deploy-to-vmm.sh b/gateway/dstack-app/deploy-to-vmm.sh index 51ee1c16..5ae76a87 100755 --- a/gateway/dstack-app/deploy-to-vmm.sh +++ b/gateway/dstack-app/deploy-to-vmm.sh @@ -95,6 +95,11 @@ WG_ADDR=0.0.0.0:9202 # The token used to launch the App APP_LAUNCH_TOKEN=$(tr -dc 'a-zA-Z0-9' < /dev/urandom | fold -w 32 | head -n 1) +# Bearer token required by the gateway admin API. Used by bootstrap-cluster.sh +# and any operator who calls the admin API. Persisted into .env so cluster +# bootstrap can reach the API after deploy. +ADMIN_API_TOKEN=$(tr -dc 'a-zA-Z0-9' < /dev/urandom | fold -w 48 | head -n 1) + # PROXY protocol: read v1/v2 header from inbound connections (e.g. when this # gateway sits behind a PP-aware L4 LB such as Cloudflare Spectrum or haproxy # with send-proxy). 
Set to "true" only if the upstream LB is configured to @@ -117,6 +122,7 @@ required_env_vars=( "GATEWAY_APP_ID" "MY_URL" "APP_LAUNCH_TOKEN" + "ADMIN_API_TOKEN" "NODE_ID" "KMS_URL" # "BOOTNODE_URL" @@ -180,6 +186,7 @@ WG_IP=$WG_IP WG_RESERVED_NET=$WG_RESERVED_NET WG_CLIENT_RANGE=$WG_CLIENT_RANGE APP_LAUNCH_TOKEN=$APP_LAUNCH_TOKEN +ADMIN_API_TOKEN=$ADMIN_API_TOKEN RPC_DOMAIN=$RPC_DOMAIN NODE_ID=$NODE_ID PROXY_LISTEN_PORT=$PROXY_LISTEN_PORT diff --git a/gateway/dstack-app/docker-compose.yaml b/gateway/dstack-app/docker-compose.yaml index 869e0080..e48c231b 100644 --- a/gateway/dstack-app/docker-compose.yaml +++ b/gateway/dstack-app/docker-compose.yaml @@ -41,7 +41,7 @@ services: - TIMEOUT_TOTAL=${TIMEOUT_TOTAL:-5h} - ADMIN_LISTEN_ADDR=${ADMIN_LISTEN_ADDR:-0.0.0.0} - ADMIN_LISTEN_PORT=${ADMIN_LISTEN_PORT:-8001} - - ADMIN_TOKEN=${ADMIN_TOKEN:-} + - ADMIN_API_TOKEN=${ADMIN_API_TOKEN:-} - INBOUND_PP_ENABLED=${INBOUND_PP_ENABLED:-false} - TIMEOUT_PP_HEADER=${TIMEOUT_PP_HEADER:-5s} - PORT_POLICY_FETCH_TIMEOUT=${PORT_POLICY_FETCH_TIMEOUT:-10s} diff --git a/gateway/gateway.toml b/gateway/gateway.toml index 0e569650..a7383abb 100644 --- a/gateway/gateway.toml +++ b/gateway/gateway.toml @@ -24,10 +24,15 @@ timeout = "5s" [core.admin] enabled = false address = "127.0.0.1:8011" -# Shared secret required by every admin endpoint (RPC + dashboard) when -# non-empty. Clients send it via the `X-Admin-Token` header (or `?token=...` -# for the dashboard / browser links). Empty disables auth. +# Shared secret required by every admin endpoint (RPC + dashboard). Can also +# be supplied via the `DSTACK_GATEWAY_ADMIN_TOKEN` or `ADMIN_API_TOKEN` env +# vars. Clients send it as `Authorization: Bearer <token>`, `X-Admin-Token`, +# or (GET only, for dashboard links) `?token=...`. Required unless +# `insecure_no_auth = true`. admin_token = "" +# Development/testing escape hatch only. Never enable this on an admin +# interface that is reachable from the network. 
+insecure_no_auth = false [core.debug] insecure_enable_debug_rpc = false diff --git a/gateway/src/admin_auth.rs b/gateway/src/admin_auth.rs index 1c1444ac..5b4000e4 100644 --- a/gateway/src/admin_auth.rs +++ b/gateway/src/admin_auth.rs @@ -17,30 +17,66 @@ //! Rejected requests are forwarded to a sentinel route that returns HTTP 401, //! so all admin routes (prpc-generated and dashboard) are protected by a single //! attachment without modifying the route declarations. +//! +//! The token is only ever held in memory as its SHA-256 hash; the configured +//! plaintext is dropped right after the fairing is constructed. +use anyhow::{bail, Result}; use rocket::{ fairing::{Fairing, Info, Kind}, http::{uri::Origin, Method, Status}, Data, Request, Route, }; +use sha2::{Digest, Sha256}; use subtle::ConstantTimeEq; +use crate::config::AdminConfig; + const UNAUTH_URI: &str = "/__admin_unauthorized"; const HEADER_NAME: &str = "X-Admin-Token"; const QUERY_PARAM: &str = "token"; +const ENV_ADMIN_TOKEN: &str = "DSTACK_GATEWAY_ADMIN_TOKEN"; +const ENV_ADMIN_TOKEN_COMPAT: &str = "ADMIN_API_TOKEN"; pub struct AdminAuthFairing { - /// `None` means auth is disabled (empty config); any request is allowed. - token: Option, + /// SHA-256 of the configured token. `None` = auth disabled (insecure mode). + token_hash: Option<[u8; 32]>, } impl AdminAuthFairing { - pub fn new(token: String) -> Self { + /// Build a fairing from a resolved plaintext token. Empty disables auth. + pub fn new(token: &str) -> Self { Self { - token: (!token.is_empty()).then_some(token), + token_hash: (!token.is_empty()).then(|| sha256(token.as_bytes())), } } + /// Resolve a token from config + env, applying the auth policy: + /// - `insecure_no_auth = true` → disabled (caller is expected to warn) + /// - else require a non-empty token from `admin_token`, + /// `DSTACK_GATEWAY_ADMIN_TOKEN`, or `ADMIN_API_TOKEN`. 
+ pub fn from_config(config: &AdminConfig) -> Result { + if config.insecure_no_auth { + return Ok(Self { token_hash: None }); + } + let token = if !config.admin_token.is_empty() { + config.admin_token.clone() + } else { + std::env::var(ENV_ADMIN_TOKEN) + .or_else(|_| std::env::var(ENV_ADMIN_TOKEN_COMPAT)) + .unwrap_or_default() + }; + let token = token.trim(); + if token.is_empty() { + bail!( + "admin API is enabled but no admin_token is configured; \ + set core.admin.admin_token, {ENV_ADMIN_TOKEN}, or {ENV_ADMIN_TOKEN_COMPAT}, \ + or set core.admin.insecure_no_auth = true (testing only)" + ); + } + Ok(Self::new(token)) + } + fn extract_token(req: &Request<'_>) -> Option { if let Some(t) = req.headers().get_one(HEADER_NAME) { return Some(t.to_string()); @@ -63,6 +99,10 @@ impl AdminAuthFairing { } } +fn sha256(bytes: &[u8]) -> [u8; 32] { + Sha256::digest(bytes).into() +} + /// Rebuild the request URI without the `token` query parameter, if present. /// Returns `None` when there is nothing to strip. fn strip_token_query(uri: &Origin<'_>) -> Option> { @@ -99,7 +139,7 @@ impl Fairing for AdminAuthFairing { } async fn on_request(&self, req: &mut Request<'_>, _: &mut Data<'_>) { - let Some(expected) = self.token.as_deref() else { + let Some(expected_hash) = self.token_hash.as_ref() else { return; }; // Avoid infinite re-routing if the fairing fires on the sentinel itself. 
@@ -107,7 +147,8 @@ impl Fairing for AdminAuthFairing { return; } let provided = Self::extract_token(req).unwrap_or_default(); - let matches: bool = provided.as_bytes().ct_eq(expected.as_bytes()).into(); + let provided_hash = sha256(provided.as_bytes()); + let matches: bool = provided_hash.ct_eq(expected_hash).into(); if !matches { if let Ok(origin) = Origin::parse_owned(UNAUTH_URI.to_string()) { req.set_uri(origin); @@ -198,7 +239,7 @@ mod tests { async fn make_client(token: &str) -> Client { let r = rocket::build() - .attach(AdminAuthFairing::new(token.to_string())) + .attach(AdminAuthFairing::new(token)) .mount("/", routes()) .mount("/", rocket::routes![protected_get, protected_post, echo]); Client::tracked(r).await.unwrap() @@ -308,6 +349,86 @@ mod tests { assert_eq!(body, "token= other=keep"); } + fn hash_of(fairing: &AdminAuthFairing) -> Option<[u8; 32]> { + fairing.token_hash + } + + #[test] + fn from_config_disabled_when_insecure_flag_set() { + let cfg = AdminConfig { + enabled: true, + admin_token: String::new(), + insecure_no_auth: true, + }; + let fairing = match AdminAuthFairing::from_config(&cfg) { + Ok(f) => f, + Err(e) => panic!("expected Ok, got err: {e}"), + }; + assert!(hash_of(&fairing).is_none()); + } + + #[test] + fn from_config_uses_config_token() { + let cfg = AdminConfig { + enabled: true, + admin_token: "from-config".into(), + insecure_no_auth: false, + }; + let fairing = match AdminAuthFairing::from_config(&cfg) { + Ok(f) => f, + Err(e) => panic!("expected Ok, got err: {e}"), + }; + assert_eq!(hash_of(&fairing), Some(sha256(b"from-config"))); + } + + // Env-touching cases are combined into a single test so cargo's parallel + // runner doesn't race on `DSTACK_GATEWAY_ADMIN_TOKEN` / `ADMIN_API_TOKEN`. + #[test] + fn from_config_env_paths() { + let empty_cfg = AdminConfig { + enabled: true, + admin_token: String::new(), + insecure_no_auth: false, + }; + + // Baseline: no env, no config token → error. 
+ unsafe { + std::env::remove_var(ENV_ADMIN_TOKEN); + std::env::remove_var(ENV_ADMIN_TOKEN_COMPAT); + } + let err = match AdminAuthFairing::from_config(&empty_cfg) { + Err(e) => e, + Ok(_) => panic!("expected error, got Ok"), + }; + assert!(err.to_string().contains("no admin_token is configured")); + + // Primary env var picked up. + unsafe { + std::env::set_var(ENV_ADMIN_TOKEN, "from-env"); + } + let fairing = match AdminAuthFairing::from_config(&empty_cfg) { + Ok(f) => f, + Err(e) => panic!("expected Ok, got err: {e}"), + }; + assert_eq!(hash_of(&fairing), Some(sha256(b"from-env"))); + unsafe { + std::env::remove_var(ENV_ADMIN_TOKEN); + } + + // Compat env var picked up when primary is absent. + unsafe { + std::env::set_var(ENV_ADMIN_TOKEN_COMPAT, "from-compat"); + } + let fairing = match AdminAuthFairing::from_config(&empty_cfg) { + Ok(f) => f, + Err(e) => panic!("expected Ok, got err: {e}"), + }; + assert_eq!(hash_of(&fairing), Some(sha256(b"from-compat"))); + unsafe { + std::env::remove_var(ENV_ADMIN_TOKEN_COMPAT); + } + } + #[rocket::async_test] async fn unauth_returns_401_on_all_methods() { let client = make_client("s3cret").await; diff --git a/gateway/src/config.rs b/gateway/src/config.rs index e7b7b5dd..68db41c8 100644 --- a/gateway/src/config.rs +++ b/gateway/src/config.rs @@ -287,10 +287,14 @@ impl Config { pub struct AdminConfig { pub enabled: bool, /// Shared secret required to call any admin endpoint (RPC + dashboard). - /// Empty disables authentication; operators are warned at startup so the - /// historical "network-isolation only" deployments keep working. + /// Can also be supplied via `DSTACK_GATEWAY_ADMIN_TOKEN` / `ADMIN_API_TOKEN` + /// env vars. Required unless `insecure_no_auth = true`. #[serde(default)] pub admin_token: String, + /// Disable authentication entirely. Development/testing only; never enable + /// on an admin interface that is reachable from the network. 
+ #[serde(default)] + pub insecure_no_auth: bool, } #[derive(Debug, Clone, Deserialize)] diff --git a/gateway/src/main.rs b/gateway/src/main.rs index ae138d20..032303be 100644 --- a/gateway/src/main.rs +++ b/gateway/src/main.rs @@ -276,8 +276,12 @@ async fn main() -> Result<()> { }; let proxy_config = config.proxy.clone(); let pccs_url = config.pccs_url.clone(); - let admin_enabled = config.admin.enabled; - let admin_token = config.admin.admin_token.clone(); + let admin_auth = if config.admin.enabled { + Some(admin_auth::AdminAuthFairing::from_config(&config.admin)?) + } else { + None + }; + let admin_insecure = config.admin.insecure_no_auth; let debug_config = config.debug.clone(); let state = Proxy::new(ProxyOptions { config, @@ -322,16 +326,16 @@ async fn main() -> Result<()> { let admin_state = state.clone(); let debug_state = state; let admin_srv = async move { - if admin_enabled { - if admin_token.is_empty() { + if let Some(auth_fairing) = admin_auth { + if admin_insecure { tracing::warn!( - "admin server enabled without admin_token; admin API is exposed without authentication" + "admin server running with insecure_no_auth = true; admin API is exposed without authentication" ); } else { tracing::info!("admin server authentication enabled"); } rocket::custom(admin_figment) - .attach(admin_auth::AdminAuthFairing::new(admin_token)) + .attach(auth_fairing) .mount("/", admin_auth::routes()) .mount("/", web_routes::routes()) .mount("/", prpc!(Proxy, AdminRpcHandler, trim: "Admin.")) diff --git a/gateway/test-run/e2e/configs/gateway-1.toml b/gateway/test-run/e2e/configs/gateway-1.toml index dfbe1609..f56fd04a 100644 --- a/gateway/test-run/e2e/configs/gateway-1.toml +++ b/gateway/test-run/e2e/configs/gateway-1.toml @@ -19,6 +19,7 @@ rpc_domain = "gateway-1" enabled = true port = 9016 address = "0.0.0.0" +insecure_no_auth = true [core.debug] insecure_enable_debug_rpc = true diff --git a/gateway/test-run/e2e/configs/gateway-2.toml 
b/gateway/test-run/e2e/configs/gateway-2.toml index b825fda5..3e205c38 100644 --- a/gateway/test-run/e2e/configs/gateway-2.toml +++ b/gateway/test-run/e2e/configs/gateway-2.toml @@ -19,6 +19,7 @@ rpc_domain = "gateway-2" enabled = true port = 9016 address = "0.0.0.0" +insecure_no_auth = true [core.debug] insecure_enable_debug_rpc = true diff --git a/gateway/test-run/e2e/configs/gateway-3.toml b/gateway/test-run/e2e/configs/gateway-3.toml index f30cb6a1..6bb97ae3 100644 --- a/gateway/test-run/e2e/configs/gateway-3.toml +++ b/gateway/test-run/e2e/configs/gateway-3.toml @@ -19,6 +19,7 @@ rpc_domain = "gateway-3" enabled = true port = 9016 address = "0.0.0.0" +insecure_no_auth = true [core.debug] insecure_enable_debug_rpc = true From a42b01573276f691f790cfac3736faca5f590f2c Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 11 May 2026 08:36:18 -0700 Subject: [PATCH 3/6] gw: enable admin token auth in e2e test cluster Replace `insecure_no_auth = true` with a real `admin_token` in all three e2e gateway configs, and update test.sh to send `Authorization: Bearer` on every admin-RPC call. Adds a Phase-3 "Admin token auth" check that exercises the auth fairing end-to-end: missing token -> 401, wrong token -> 401, correct token -> 200. This makes the e2e suite a regression test for the auth path itself (not just for the certbot/cert-sync flow that follows), so future changes to admin auth can't silently break ingress without a test failure. Verified locally against the gateway binary (out of band of the docker compose harness, which depends on an external TDX endpoint): the 8 exhaustive curl cases (no token / wrong / Bearer / X-Admin-Token / ?token= GET / ?token= POST / dashboard root unauthed+authed) all return the expected status codes. Env-var fallback (DSTACK_GATEWAY_ADMIN_TOKEN) and fail-by-default policy also confirmed working. 
--- gateway/test-run/e2e/configs/gateway-1.toml | 2 +- gateway/test-run/e2e/configs/gateway-2.toml | 2 +- gateway/test-run/e2e/configs/gateway-3.toml | 2 +- gateway/test-run/e2e/test.sh | 62 +++++++++++++++++---- 4 files changed, 53 insertions(+), 15 deletions(-) diff --git a/gateway/test-run/e2e/configs/gateway-1.toml b/gateway/test-run/e2e/configs/gateway-1.toml index f56fd04a..dc8e6f3b 100644 --- a/gateway/test-run/e2e/configs/gateway-1.toml +++ b/gateway/test-run/e2e/configs/gateway-1.toml @@ -19,7 +19,7 @@ rpc_domain = "gateway-1" enabled = true port = 9016 address = "0.0.0.0" -insecure_no_auth = true +admin_token = "e2e-admin-token" [core.debug] insecure_enable_debug_rpc = true diff --git a/gateway/test-run/e2e/configs/gateway-2.toml b/gateway/test-run/e2e/configs/gateway-2.toml index 3e205c38..c733710b 100644 --- a/gateway/test-run/e2e/configs/gateway-2.toml +++ b/gateway/test-run/e2e/configs/gateway-2.toml @@ -19,7 +19,7 @@ rpc_domain = "gateway-2" enabled = true port = 9016 address = "0.0.0.0" -insecure_no_auth = true +admin_token = "e2e-admin-token" [core.debug] insecure_enable_debug_rpc = true diff --git a/gateway/test-run/e2e/configs/gateway-3.toml b/gateway/test-run/e2e/configs/gateway-3.toml index 6bb97ae3..b02a57fc 100644 --- a/gateway/test-run/e2e/configs/gateway-3.toml +++ b/gateway/test-run/e2e/configs/gateway-3.toml @@ -19,7 +19,7 @@ rpc_domain = "gateway-3" enabled = true port = 9016 address = "0.0.0.0" -insecure_no_auth = true +admin_token = "e2e-admin-token" [core.debug] insecure_enable_debug_rpc = true diff --git a/gateway/test-run/e2e/test.sh b/gateway/test-run/e2e/test.sh index 9c1db1a2..4275d618 100755 --- a/gateway/test-run/e2e/test.sh +++ b/gateway/test-run/e2e/test.sh @@ -22,6 +22,10 @@ GATEWAY_PROXIES="gateway-1:9014 gateway-2:9014 gateway-3:9014" GATEWAY_DEBUG_URLS="http://gateway-1:9015 http://gateway-2:9015 http://gateway-3:9015" GATEWAY_ADMIN="http://gateway-1:9016" +# Must match `admin_token` in configs/gateway-*.toml 
+ADMIN_TOKEN="e2e-admin-token" +ADMIN_AUTH_HEADER="Authorization: Bearer ${ADMIN_TOKEN}" + # External services MOCK_CF_API="http://mock-cf-dns-api:8080" PEBBLE_DIR="http://pebble:14000/dir" @@ -183,6 +187,7 @@ setup_certbot_config() { # Set ACME URL log_info "Setting ACME URL: ${ACME_URL}" if ! curl -sf -X POST "${GATEWAY_ADMIN}/prpc/Admin.SetCertbotConfig" \ + -H "${ADMIN_AUTH_HEADER}" \ -H "Content-Type: application/json" \ -d '{"acme_url": "'"${ACME_URL}"'"}' > /dev/null; then log_error "Failed to set certbot config" @@ -192,6 +197,7 @@ setup_certbot_config() { # Create DNS credential log_info "Creating DNS credential..." if ! curl -sf -X POST "${GATEWAY_ADMIN}/prpc/Admin.CreateDnsCredential" \ + -H "${ADMIN_AUTH_HEADER}" \ -H "Content-Type: application/json" \ -d '{ "name": "test-cloudflare", @@ -210,11 +216,13 @@ setup_certbot_config() { for domain in $CERT_DOMAINS; do log_info "Adding domain: $domain" curl -sf -X POST "${GATEWAY_ADMIN}/prpc/Admin.AddZtDomain" \ + -H "${ADMIN_AUTH_HEADER}" \ -H "Content-Type: application/json" \ -d '{"domain": "'"${domain}"'"}' > /dev/null || true log_info "Triggering renewal for: $domain" curl -sf -X POST "${GATEWAY_ADMIN}/prpc/Admin.RenewZtDomainCert" \ + -H "${ADMIN_AUTH_HEADER}" \ -H "Content-Type: application/json" \ -d '{"domain": "'"${domain}"'", "force": true}' > /dev/null || \ log_warn "Renewal request failed for $domain (may retry)" @@ -223,6 +231,31 @@ setup_certbot_config() { return 0 } +# Returns 0 if HTTP status code from $1 args equals $2. +http_status_eq() { + local expected="$1" + shift + local actual + actual=$(curl -s -o /dev/null -w '%{http_code}' "$@") + [ "$actual" = "$expected" ] +} + +# Returns 0 if all three admin auth checks pass: missing 401, wrong 401, right 200. 
+test_admin_auth() { + log_info "checking admin auth on ${GATEWAY_ADMIN}" + # Missing token → 401 + http_status_eq 401 "${GATEWAY_ADMIN}/prpc/Admin.Status" \ + || { log_error "no-token request did not return 401"; return 1; } + # Wrong token → 401 + http_status_eq 401 "${GATEWAY_ADMIN}/prpc/Admin.Status" \ + -H "Authorization: Bearer wrong-token" \ + || { log_error "wrong-token request did not return 401"; return 1; } + # Correct token → 200 + http_status_eq 200 "${GATEWAY_ADMIN}/prpc/Admin.Status" \ + -H "${ADMIN_AUTH_HEADER}" \ + || { log_error "valid-token request did not return 200"; return 1; } +} + # ==================== Main ==================== main() { @@ -241,14 +274,19 @@ main() { i=$((i + 1)) done - # Phase 3: Configure certbot - log_phase 3 "Configure certbot" + # Phase 3: Admin auth gating + log_phase 3 "Admin token auth" + run_test "Admin endpoint accepts valid token and rejects missing/wrong" \ + "$(test_admin_auth; echo $?)" + + # Phase 4: Configure certbot + log_phase 4 "Configure certbot" if ! setup_certbot_config; then log_error "Failed to setup certbot configuration" fi - # Phase 4: Certificate issuance - log_phase 4 "Certificate issuance" + # Phase 5: Certificate issuance + log_phase 5 "Certificate issuance" local first_domain=$(echo "$CERT_DOMAINS" | cut -d' ' -f1) local first_sni=$(get_test_sni "$first_domain") local first_proxy=$(echo "$GATEWAY_PROXIES" | cut -d' ' -f1) @@ -274,8 +312,8 @@ main() { log_info "Waiting 20s for cluster sync..." 
sleep 20 - # Phase 5: Certificate consistency - log_phase 5 "Certificate consistency" + # Phase 6: Certificate consistency + log_phase 6 "Certificate consistency" for domain in $CERT_DOMAINS; do local sni=$(get_test_sni "$domain") run_test "All gateways have same cert for $domain" \ @@ -284,8 +322,8 @@ main() { "$(test_certificate_from_pebble "$sni"; echo $?)" done - # Phase 6: SNI-based selection - log_phase 6 "SNI-based certificate selection" + # Phase 7: SNI-based selection + log_phase 7 "SNI-based certificate selection" for domain in $CERT_DOMAINS; do local sni=$(get_test_sni "$domain") local wildcard=$(get_wildcard_domain "$domain") @@ -293,8 +331,8 @@ main() { "$(test_sni_cert_selection "$first_proxy" "$sni" "$wildcard"; echo $?)" done - # Phase 7: Proxy TLS health - log_phase 7 "Proxy TLS health endpoint" + # Phase 8: Proxy TLS health + log_phase 8 "Proxy TLS health endpoint" for domain in $CERT_DOMAINS; do local sni=$(get_test_sni "$domain") local i=1 @@ -305,8 +343,8 @@ main() { done done - # Phase 8: DNS records (informational) - log_phase 8 "DNS-01 challenge records" + # Phase 9: DNS records (informational) + log_phase 9 "DNS-01 challenge records" local records=$(curl -sf "${MOCK_CF_API}/api/records" 2>/dev/null || echo "") if echo "$records" | grep -q "TXT"; then log_success "DNS TXT records found" From 17474fe7c4004d3ccc1b4838b5504d26a85d8695 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 11 May 2026 08:45:37 -0700 Subject: [PATCH 4/6] gw: fix shellcheck warnings in e2e test.sh (prek CI) prek's shellcheck-py started scanning e2e/test.sh once the previous commit modified it, and surfaced two pre-existing classes of warnings: - SC3043: `local` is undefined in POSIX sh. The script uses bash-only `local` throughout, so the `#!/bin/sh` shebang was already wrong. Switch to `#!/bin/bash`. - SC2155: `local foo=$(cmd)` masks the inner command's exit status. 
Split into `local foo; foo=$(cmd)` at every site (one inside `test_certificate_from_pebble`, the rest inside `main`). Remaining shellcheck findings are info-level only (SC2086, SC2317); the prek CI run only failed on warning-level findings. --- gateway/test-run/e2e/test.sh | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/gateway/test-run/e2e/test.sh b/gateway/test-run/e2e/test.sh index 4275d618..2639823b 100755 --- a/gateway/test-run/e2e/test.sh +++ b/gateway/test-run/e2e/test.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-FileCopyrightText: 2024-2025 Phala Network # # SPDX-License-Identifier: Apache-2.0 @@ -162,7 +162,8 @@ test_certificates_match() { test_certificate_from_pebble() { local sni="$1" - local proxy=$(echo "$GATEWAY_PROXIES" | cut -d' ' -f1) + local proxy + proxy=$(echo "$GATEWAY_PROXIES" | cut -d' ' -f1) get_cert_issuer "$proxy" "$sni" | grep -qi "pebble" } @@ -287,9 +288,10 @@ main() { # Phase 5: Certificate issuance log_phase 5 "Certificate issuance" - local first_domain=$(echo "$CERT_DOMAINS" | cut -d' ' -f1) - local first_sni=$(get_test_sni "$first_domain") - local first_proxy=$(echo "$GATEWAY_PROXIES" | cut -d' ' -f1) + local first_domain first_sni first_proxy + first_domain=$(echo "$CERT_DOMAINS" | cut -d' ' -f1) + first_sni=$(get_test_sni "$first_domain") + first_proxy=$(echo "$GATEWAY_PROXIES" | cut -d' ' -f1) log_info "Waiting for certificates (up to 120s)..." local waited=0 @@ -303,8 +305,9 @@ main() { log_info "Waiting... 
(${waited}s)" done + local sni wildcard for domain in $CERT_DOMAINS; do - local sni=$(get_test_sni "$domain") + sni=$(get_test_sni "$domain") run_test "Certificate issued for $domain" \ "$(test_certificate_issued "$first_proxy" "$sni"; echo $?)" done @@ -315,7 +318,7 @@ main() { # Phase 6: Certificate consistency log_phase 6 "Certificate consistency" for domain in $CERT_DOMAINS; do - local sni=$(get_test_sni "$domain") + sni=$(get_test_sni "$domain") run_test "All gateways have same cert for $domain" \ "$(test_certificates_match "$sni"; echo $?)" run_test "Cert for $domain issued by Pebble" \ @@ -325,17 +328,18 @@ main() { # Phase 7: SNI-based selection log_phase 7 "SNI-based certificate selection" for domain in $CERT_DOMAINS; do - local sni=$(get_test_sni "$domain") - local wildcard=$(get_wildcard_domain "$domain") + sni=$(get_test_sni "$domain") + wildcard=$(get_wildcard_domain "$domain") run_test "SNI $sni returns $wildcard cert" \ "$(test_sni_cert_selection "$first_proxy" "$sni" "$wildcard"; echo $?)" done # Phase 8: Proxy TLS health log_phase 8 "Proxy TLS health endpoint" + local i for domain in $CERT_DOMAINS; do - local sni=$(get_test_sni "$domain") - local i=1 + sni=$(get_test_sni "$domain") + i=1 for proxy in $GATEWAY_PROXIES; do run_test "Gateway $i TLS health ($sni)" \ "$(test_proxy_tls_health "$proxy" "$sni"; echo $?)" @@ -345,7 +349,8 @@ main() { # Phase 9: DNS records (informational) log_phase 9 "DNS-01 challenge records" - local records=$(curl -sf "${MOCK_CF_API}/api/records" 2>/dev/null || echo "") + local records + records=$(curl -sf "${MOCK_CF_API}/api/records" 2>/dev/null || echo "") if echo "$records" | grep -q "TXT"; then log_success "DNS TXT records found" else From 69974990007784a7b54b3681a388a732f0e29200 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 11 May 2026 08:47:29 -0700 Subject: [PATCH 5/6] gw: remove unused wait_for_service helper from e2e test.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit shellcheck-py in prek treats info-level findings as failures, and the previous lint pass left two info-level classes (SC2317 unreachable code, SC2086 unquoted in `[ ]`) — both located inside the `wait_for_service` function. The function is defined but never called (the actual readiness wait lives in run-e2e.sh's docker-compose-driven healthchecks), so deleting it clears the findings without altering test behaviour. --- gateway/test-run/e2e/test.sh | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/gateway/test-run/e2e/test.sh b/gateway/test-run/e2e/test.sh index 2639823b..1ed9df3f 100755 --- a/gateway/test-run/e2e/test.sh +++ b/gateway/test-run/e2e/test.sh @@ -79,27 +79,6 @@ run_test() { fi } -# Wait for HTTP service to respond -wait_for_service() { - local url="$1" - local name="$2" - local max_wait="${3:-60}" - local waited=0 - - log_info "Waiting for $name..." - while [ $waited -lt $max_wait ]; do - if curl -sf "$url" > /dev/null 2>&1; then - log_info "$name is ready" - return 0 - fi - sleep 2 - waited=$((waited + 2)) - done - - log_error "$name failed to become ready within ${max_wait}s" - return 1 -} - # ==================== Domain Helpers ==================== # Convert base domain to test SNI: test0.local -> gateway.test0.local From 6f3792944c8c1129a5bdfdf05608e3369f7eec53 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Mon, 11 May 2026 21:40:50 -0700 Subject: [PATCH 6/6] gw: support HTTP Basic Auth on admin server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plain browsers can't set custom headers when typing a URL, so the existing `X-Admin-Token` / `Bearer` / `?token=` paths leave dashboard operators with only the query-param option — which leaks the token via URL bar, history, and Referer. Add `Authorization: Basic <base64(user:pass)>` as a fourth accepted transport.
The token may sit in either the username or password field (operators often paste into either side of the native browser prompt), and the 401 sentinel now emits `WWW-Authenticate: Basic realm="..."` so the browser actually shows the prompt instead of a plain error page. RPC clients are unaffected: the new header just goes alongside the existing transports and they ignore the WWW-Authenticate header. Adds 5 tests: password field, username field, wrong password, malformed base64, and the WWW-Authenticate challenge header on 401. --- gateway/src/admin_auth.rs | 155 ++++++++++++++++++++++++++++++++------ 1 file changed, 133 insertions(+), 22 deletions(-) diff --git a/gateway/src/admin_auth.rs b/gateway/src/admin_auth.rs index 5b4000e4..27fa3025 100644 --- a/gateway/src/admin_auth.rs +++ b/gateway/src/admin_auth.rs @@ -8,24 +8,31 @@ //! present the configured shared secret. The token is accepted via, in order: //! 1. `X-Admin-Token` header (any method) //! 2. `Authorization: Bearer ` header (any method) -//! 3. `?token=` query parameter (GET only, for dashboard links) +//! 3. `Authorization: Basic ` (any method; token may be +//! in either the user or password field — needed so plain browsers can +//! authenticate to the dashboard via the native HTTP-auth prompt) +//! 4. `?token=` query parameter (GET only, for dashboard links) //! -//! For (3), the `token` query parameter is stripped from the request URI after +//! For (4), the `token` query parameter is stripped from the request URI after //! successful validation so it doesn't propagate to access logs, downstream //! handlers, or the Referer header. //! -//! Rejected requests are forwarded to a sentinel route that returns HTTP 401, -//! so all admin routes (prpc-generated and dashboard) are protected by a single -//! attachment without modifying the route declarations. +//! Rejected requests are forwarded to a sentinel route that returns HTTP 401 +//! 
with `WWW-Authenticate: Basic realm="dstack-gateway admin"` so browsers +//! show the native login prompt. All admin routes (prpc-generated and +//! dashboard) are protected by this single fairing attachment without +//! modifying the route declarations. //! //! The token is only ever held in memory as its SHA-256 hash; the configured //! plaintext is dropped right after the fairing is constructed. use anyhow::{bail, Result}; +use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; use rocket::{ fairing::{Fairing, Info, Kind}, - http::{uri::Origin, Method, Status}, - Data, Request, Route, + http::{uri::Origin, Header, Method, Status}, + response::Responder, + Data, Request, Response, Route, }; use sha2::{Digest, Sha256}; use subtle::ConstantTimeEq; @@ -37,6 +44,7 @@ const HEADER_NAME: &str = "X-Admin-Token"; const QUERY_PARAM: &str = "token"; const ENV_ADMIN_TOKEN: &str = "DSTACK_GATEWAY_ADMIN_TOKEN"; const ENV_ADMIN_TOKEN_COMPAT: &str = "ADMIN_API_TOKEN"; +const BASIC_REALM: &str = "dstack-gateway admin"; pub struct AdminAuthFairing { /// SHA-256 of the configured token. `None` = auth disabled (insecure mode). @@ -83,7 +91,12 @@ impl AdminAuthFairing { } if let Some(auth) = req.headers().get_one("Authorization") { if let Some(t) = auth.strip_prefix("Bearer ") { - return Some(t.to_string()); + return Some(t.trim().to_string()); + } + if let Some(b64) = auth.strip_prefix("Basic ") { + if let Some(t) = basic_auth_token(b64.trim()) { + return Some(t); + } } } // Query token is intended for browser links to the dashboard, so only @@ -103,6 +116,37 @@ fn sha256(bytes: &[u8]) -> [u8; 32] { Sha256::digest(bytes).into() } +/// Decode a `Basic` credential and return whichever of user/password is +/// non-empty (we accept either so the browser prompt's two fields are +/// interchangeable for the operator). 
+fn basic_auth_token(b64: &str) -> Option { + let decoded = BASE64.decode(b64).ok()?; + let text = std::str::from_utf8(&decoded).ok()?; + let (user, pass) = text.split_once(':').unwrap_or((text, "")); + if !pass.is_empty() { + return Some(pass.to_string()); + } + if !user.is_empty() { + return Some(user.to_string()); + } + None +} + +/// 401 response that triggers the browser's native HTTP-auth prompt. +struct Unauthorized; + +impl<'r> Responder<'r, 'static> for Unauthorized { + fn respond_to(self, _req: &'r Request<'_>) -> rocket::response::Result<'static> { + Response::build() + .status(Status::Unauthorized) + .header(Header::new( + "WWW-Authenticate", + format!("Basic realm=\"{BASIC_REALM}\""), + )) + .ok() + } +} + /// Rebuild the request URI without the `token` query parameter, if present. /// Returns `None` when there is nothing to strip. fn strip_token_query(uri: &Origin<'_>) -> Option> { @@ -166,38 +210,38 @@ impl Fairing for AdminAuthFairing { // enumerate them because Rocket doesn't support a method-agnostic route. 
#[rocket::get("/__admin_unauthorized")] -fn unauth_get() -> Status { - Status::Unauthorized +fn unauth_get() -> Unauthorized { + Unauthorized } #[rocket::post("/__admin_unauthorized", data = "<_data>")] -fn unauth_post(_data: Data<'_>) -> Status { - Status::Unauthorized +fn unauth_post(_data: Data<'_>) -> Unauthorized { + Unauthorized } #[rocket::put("/__admin_unauthorized", data = "<_data>")] -fn unauth_put(_data: Data<'_>) -> Status { - Status::Unauthorized +fn unauth_put(_data: Data<'_>) -> Unauthorized { + Unauthorized } #[rocket::patch("/__admin_unauthorized", data = "<_data>")] -fn unauth_patch(_data: Data<'_>) -> Status { - Status::Unauthorized +fn unauth_patch(_data: Data<'_>) -> Unauthorized { + Unauthorized } #[rocket::delete("/__admin_unauthorized")] -fn unauth_delete() -> Status { - Status::Unauthorized +fn unauth_delete() -> Unauthorized { + Unauthorized } #[rocket::options("/__admin_unauthorized")] -fn unauth_options() -> Status { - Status::Unauthorized +fn unauth_options() -> Unauthorized { + Unauthorized } #[rocket::head("/__admin_unauthorized")] -fn unauth_head() -> Status { - Status::Unauthorized +fn unauth_head() -> Unauthorized { + Unauthorized } pub fn routes() -> Vec { @@ -444,4 +488,71 @@ mod tests { ); } } + + fn basic_header(user: &str, pass: &str) -> Header<'static> { + let creds = format!("{user}:{pass}"); + Header::new("Authorization", format!("Basic {}", BASE64.encode(creds))) + } + + #[rocket::async_test] + async fn basic_auth_password_field_accepted() { + let client = make_client("s3cret").await; + let resp = client + .get("/protected") + .header(basic_header("admin", "s3cret")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Ok); + } + + #[rocket::async_test] + async fn basic_auth_user_field_accepted_when_password_empty() { + let client = make_client("s3cret").await; + // Some browser users paste the token into the username field by mistake. 
+ let resp = client + .get("/protected") + .header(basic_header("s3cret", "")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Ok); + } + + #[rocket::async_test] + async fn basic_auth_wrong_password_rejected() { + let client = make_client("s3cret").await; + let resp = client + .get("/protected") + .header(basic_header("admin", "wrong")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Unauthorized); + } + + #[rocket::async_test] + async fn basic_auth_malformed_rejected() { + let client = make_client("s3cret").await; + // Not valid base64 at all. + let resp = client + .get("/protected") + .header(Header::new("Authorization", "Basic !!not-base64!!")) + .dispatch() + .await; + assert_eq!(resp.status(), Status::Unauthorized); + } + + #[rocket::async_test] + async fn unauthorized_response_includes_www_authenticate() { + let client = make_client("s3cret").await; + let resp = client.get("/protected").dispatch().await; + assert_eq!(resp.status(), Status::Unauthorized); + let www = resp + .headers() + .get_one("WWW-Authenticate") + .expect("missing WWW-Authenticate header"); + assert!( + www.starts_with("Basic realm="), + "expected Basic challenge, got {www:?}" + ); + assert!(www.contains("dstack-gateway admin")); + } }