From f490e39a14725e84b987fd7ad1e3ce25363495dd Mon Sep 17 00:00:00 2001 From: Aleksei Sviridkin Date: Tue, 12 May 2026 18:42:38 +0300 Subject: [PATCH 1/2] fix(commands): dedupe stdinIsTTY between init.go and tui_handler.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two independently-developed branches both introduced an injection-seam package var named stdinIsTTY — one for the init --update non-tty UX gate, one for the dashboard/edit TUI refusal. The branches landed in separate merges; git's 3-way merge accepted both because the declarations live in different files, but the resulting package fails to build with 'stdinIsTTY redeclared in this block'. Keep the tui_handler.go declaration as canonical (the godoc there spells out the function-type-injection rationale that applies equally to both callers) and drop the init.go copy. The remaining caller in resolveOverwritePolicy resolves against the package-level var unchanged. Signed-off-by: Aleksei Sviridkin --- pkg/commands/init.go | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pkg/commands/init.go b/pkg/commands/init.go index ebcaa69..0d3f9f4 100644 --- a/pkg/commands/init.go +++ b/pkg/commands/init.go @@ -31,7 +31,6 @@ import ( "github.com/cozystack/talm/pkg/generated" "github.com/cozystack/talm/pkg/secureperm" "github.com/spf13/cobra" - "golang.org/x/term" "gopkg.in/yaml.v3" "github.com/siderolabs/talos/cmd/talosctl/cmd/mgmt/gen" @@ -956,19 +955,6 @@ const ( overwritePolicyNonInteractive ) -// stdinIsTTY reports whether process stdin is connected to a -// terminal. Var-typed so the unit tests can swap a fake. -// -// term.IsTerminal correctly returns false for /dev/null and pipes — -// the naive os.Stdin.Stat()&ModeCharDevice check accepted /dev/null -// (it's a character device) and led the previous version to prompt -// in cron / scripted shells, EOFing the read. -// -//nolint:gochecknoglobals // injection seam for testability; matches stdinReader below. -var stdinIsTTY = func() bool { - return term.IsTerminal(int(os.Stdin.Fd())) -} - // stdinReader is the io.Reader the interactive prompt reads from. // Var-typed so unit tests can supply canned input. // From a5d61ea159b337eeb2c570cc7615b4c3ce47a234 Mon Sep 17 00:00:00 2001 From: Aleksei Sviridkin Date: Tue, 12 May 2026 21:13:48 +0300 Subject: [PATCH 2/2] feat(commands): preserve META by default on talm reset (#185) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flip the talm reset default away from upstream's destructive --wipe-mode=all toward the META-preserving recipe (--system-labels-to-wipe=STATE,EPHEMERAL). The flip only fires when the operator passed neither --wipe-mode nor --system-labels-to-wipe on the CLI; explicit operator intent on either axis is honored unchanged. - 'talm reset' (no wipe flags): wrapper populates --system-labels-to-wipe=STATE,EPHEMERAL. The server-side reset codepath, when SystemPartitionsToWipe is non-empty, takes the label-driven path and keeps other partitions intact per the upstream --system-labels-to-wipe flag doc in cmd/talosctl/cmd/talos/reset.go. META survives; on the next boot Talos rejoins the cluster from META without operator intervention (verified on a 3-node v1.12.6 stand: node returns to etcd with placeholder hostname and a new member id from META within ~90s of the post-reboot apply). - 'talm reset --wipe-mode=all' or '--wipe-mode=system-disk': explicit destructive opt-in. Both values land in the Mode-driven server-side branch when SystemPartitionsToWipe is empty and wipe the full system disk including META. The wrapper does NOT add the safety override; operator intent wins. --wipe-mode=user-disks is safe — it doesn't touch system partitions either way. - 'talm reset --system-labels-to-wipe=STATE': operator's narrower list is honored byte-for-byte. The wrapper does NOT silently expand to STATE,EPHEMERAL — operators choosing a narrower scope are doing so deliberately. Help-text overrides on both flags spell out the divergence so 'talm reset --help' carries the operator-facing story. The --wipe-mode override names both destructive opt-out values so operators picking system-disk to 'be less destructive than all' don't walk into the same trap. Talm already diverges from upstream defaults where the friendly path should be the default — kubeconfig --force=true is the established precedent. This commit extends that pattern to reset. Test coverage (pkg/commands/reset_handler_test.go): - TestWrapResetCommand_NoFlags_AppliesSafeDefault — headline behaviour change pin. - TestWrapResetCommand_ExplicitWipeModeAll_SkipsDefault — opt-out via --wipe-mode=all. - TestWrapResetCommand_ExplicitWipeModeSystemDisk_SkipsDefault — opt-out via --wipe-mode=system-disk (value-agnostic Changed() gate; explicit test guards against a future refactor adding value-conditional logic). - TestWrapResetCommand_ExplicitLabels_PreservesOperatorChoice — operator's narrower list honored byte-for-byte. - TestWrapResetCommand_BothFlagsChanged_SkipsDefault — mixed intent no-interference. - TestWrapResetCommand_HelpTextMentionsMetaPreservation — help text contract pin (substrings 'preserves META' + resetSafeDefaultLabels literal + 'system-disk'). - TestWrapResetCommand_RealUpstreamResetDispatched — end-to-end via taloscommands.Commands -> wrapTalosCommand so a dispatch refactor that silently drops the wrapper fails this test while the synthetic-cobra tests above stay green. README gains a 'talm reset — META-preserving default' subsection under 'Using talosctl commands' with all three call shapes (safe default, explicit destructive opt-in, operator-specified narrower scope) so the divergence is discoverable from operator docs. docs/manual-test-plan.md section H is rewritten around the new default. H2 covers the safe-default headline path; H2a-H2d add boundary cases (explicit destructive --wipe-mode=all or --wipe-mode=system-disk, narrower operator list, --graceful=false, modeline-bearing project root). Each carries an explicit regression-anchor line so future operators running the plan know what a wrapper-side regression would look like. Assisted-By: Claude Signed-off-by: Aleksei Sviridkin --- README.md | 17 ++ docs/manual-test-plan.md | 64 +++++- pkg/commands/reset_handler.go | 98 ++++++++ pkg/commands/reset_handler_test.go | 345 +++++++++++++++++++++++++++++ pkg/commands/talosctl_wrapper.go | 10 + 5 files changed, 529 insertions(+), 5 deletions(-) create mode 100644 pkg/commands/reset_handler.go create mode 100644 pkg/commands/reset_handler_test.go diff --git a/README.md b/README.md index 0e7a119..15f7b7d 100644 --- a/README.md +++ b/README.md @@ -224,6 +224,23 @@ For example, to run a dashboard for three nodes: talm dashboard -f node1.yaml -f node2.yaml -f node3.yaml ``` +### `talm reset` — META-preserving default + +`talm reset` diverges from upstream `talosctl reset` on one default. Upstream defaults to `--wipe-mode=all`, which wipes the Talos META partition along with STATE and EPHEMERAL — the node cannot self-recover and comes up in maintenance mode requiring a full re-apply. Talm instead populates `--system-labels-to-wipe=STATE,EPHEMERAL` when neither `--wipe-mode` nor `--system-labels-to-wipe` was passed, which preserves META so the node rejoins the cluster from its META-stored bootstrap config on the next boot. + +Explicit operator intent is honored unchanged: + +```bash +# talm default — preserves META, node self-recovers. +talm reset --reboot --graceful=true --nodes $NODE --endpoints $OTHER_NODE + +# Explicit destructive opt-in (upstream's default). +talm reset --wipe-mode=all --reboot --nodes $NODE --endpoints $OTHER_NODE + +# Operator-specified narrower scope is honored byte-for-byte. +talm reset --system-labels-to-wipe=STATE --reboot --nodes $NODE --endpoints $OTHER_NODE +``` + ## Customization You're free to edit template files in `./templates` directory. diff --git a/docs/manual-test-plan.md b/docs/manual-test-plan.md index b746b88..c29af61 100644 --- a/docs/manual-test-plan.md +++ b/docs/manual-test-plan.md @@ -285,18 +285,72 @@ Expected: delete succeeds; read returns `NotFound`. Expected: refuses with `etcd data directory is not empty`. -### H2. Reset a control-plane node (graceful + reboot, system labels only) +### H2. Reset a control-plane node (talm safe default — preserves META) -⚠️ Destructive. Run only against a cluster you can afford to lose one node from. Requires `--system-labels-to-wipe=STATE` (and optionally `EPHEMERAL`) for a recoverable reset — `--wipe-mode=all` (the default) removes META too, which makes self-recovery impossible. +⚠️ Destructive. Run only against a cluster you can afford to lose one node from. The talm default populates `--system-labels-to-wipe=STATE,EPHEMERAL` automatically when neither `--wipe-mode` nor `--system-labels-to-wipe` was passed, so META survives and the node self-recovers on the next boot. Upstream `talosctl reset` defaults to `--wipe-mode=all`, which destroys META; that path is exposed in talm as the explicit `--wipe-mode=all` opt-in (see H2a). ```bash /tmp/talm-safety reset --graceful=true --reboot \ - --system-labels-to-wipe=STATE \ - --system-labels-to-wipe=EPHEMERAL \ --nodes $NODE --endpoints $OTHER_NODE ``` -Expected: etcd member departs (`talm etcd members` from another node shows 2 members), node reboots, `post check passed`. +Expected: etcd member departs (`talm etcd members` from another node shows 2 members), node reboots, `post check passed`. After the reboot the node returns to etcd as a fresh member with placeholder hostname `talos-XXXXX` within ~90s; the next `talm apply` refreshes the hostname. + +Regression anchors: + +- `talm reset --help` must show the talm-divergence note on both `--wipe-mode` ("preserves META") and `--system-labels-to-wipe` ("STATE,EPHEMERAL"). Without the help text, the default flip is invisible to operators reading the CLI surface. +- The reset request must succeed without the operator having to type `--system-labels-to-wipe` manually. If the node comes back in maintenance mode requiring fresh apply, the wrapper did not apply the safe default and META was wiped — that is a regression. + +### H2a. Reset with explicit destructive opt-in (`--wipe-mode=all` or `--wipe-mode=system-disk`) + +⚠️ Highly destructive — META wiped, node CANNOT self-recover and requires fresh apply against `--insecure` maintenance mode. Run only on a cluster where the multi-day re-bootstrap cost is acceptable. + +Two opt-out values land in the same destructive server-side branch: `--wipe-mode=all` (full system disk + user disks) and `--wipe-mode=system-disk` (system disk only). Both bypass the safety override and wipe META. `--wipe-mode=user-disks` is safe — it doesn't touch system partitions. + +```bash +# Equivalent destructive paths: +/tmp/talm-safety reset --wipe-mode=all --graceful=true --reboot \ + --nodes $NODE --endpoints $OTHER_NODE +/tmp/talm-safety reset --wipe-mode=system-disk --graceful=true --reboot \ + --nodes $NODE --endpoints $OTHER_NODE +``` + +Expected: same as H2 up to the reboot; after the reboot the node comes up in maintenance mode (no machine config). `talm get hostnames -i --nodes $NODE` succeeds via the insecure path but the node is not yet a cluster member. + +Regression anchor: when EITHER of these commands is run the wrapper MUST NOT silently add `--system-labels-to-wipe=STATE,EPHEMERAL` (which would override the operator's stated intent and quietly turn a destructive reset into a selective one). Verify via `talm reset --wipe-mode=all --help` or by observing that the reset request actually destroys META. + +### H2b. Reset with operator-specified narrower scope (`--system-labels-to-wipe=STATE` only) + +```bash +/tmp/talm-safety reset --system-labels-to-wipe=STATE --graceful=true --reboot \ + --nodes $NODE --endpoints $OTHER_NODE +``` + +Expected: only STATE wiped, EPHEMERAL kept (containerd image cache survives the reset), node returns. The operator's explicit narrower list must be honored byte-for-byte; the wrapper MUST NOT silently expand to `STATE,EPHEMERAL`. + +Regression anchor: after the node returns, `talm dmesg --nodes $NODE | grep -i ephemeral` should show no fresh-format markers for the EPHEMERAL partition. If the wrapper silently expanded the operator's list, EPHEMERAL would have been wiped too. + +### H2c. Reset with `--graceful=false` (ungraceful, preserves safe default) + +```bash +/tmp/talm-safety reset --graceful=false --reboot \ + --nodes $NODE --endpoints $OTHER_NODE +``` + +Expected: ungraceful reset (no etcd leave), but the wrapper's safe default still fires (STATE+EPHEMERAL labels populated by talm because no wipe flag was passed). Node reboots; etcd cluster recovers via remaining quorum; rejoining member appears within ~120s. + +Regression anchor: the default-flip MUST be independent of `--graceful`. A change that conditions the flip on `--graceful=true` is a regression — operators on ungraceful reset are the ones who most need the safe default. + +### H2d. Reset triggered from modeline-bearing project root + +```bash +cd $PROJECT # directory with nodes/$NODE.yaml carrying the modeline +/tmp/talm-safety reset --reboot --graceful=true +``` + +Expected: same outcome as H2 — modeline supplies `--nodes` / `--endpoints` from `nodes/$NODE.yaml`, no wipe flags on the CLI, wrapper applies the safe default, META preserved. + +Regression anchor: the default-flip is gated on `Changed("wipe-mode") && Changed("system-labels-to-wipe")` only — it is independent of where in the PreRunE chain it runs. A refactor that reorders the dispatch chain must keep this path working (modeline-supplied `--nodes` / `--endpoints` plus no operator-supplied wipe flags must still produce the safe default). ### H3. Etcd quorum after reset diff --git a/pkg/commands/reset_handler.go b/pkg/commands/reset_handler.go new file mode 100644 index 0000000..54a31c5 --- /dev/null +++ b/pkg/commands/reset_handler.go @@ -0,0 +1,98 @@ +// Copyright Cozystack Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package commands + +import ( + "github.com/cockroachdb/errors" + "github.com/spf13/cobra" +) + +const ( + resetCmdName = "reset" + + // resetSafeDefaultLabels are the system partition labels talm's + // wrapper populates into `--system-labels-to-wipe` when an + // operator runs `talm reset` without explicitly choosing a wipe + // scope. Wiping STATE clears node-specific persistent state + // (machine config, identity); wiping EPHEMERAL clears the + // container/runtime layer. Leaving META untouched is the key + // property: META carries the bootstrap config Talos uses to + // rejoin the cluster on the next boot, so a reset with only + // these two labels self-recovers without operator intervention. + resetSafeDefaultLabels = "STATE,EPHEMERAL" +) + +// wrapResetCommand flips talm's `talm reset` default away from +// upstream's destructive `--wipe-mode=all` toward the META-preserving +// selective-wipe recipe. The flip only fires when the operator passed +// neither `--wipe-mode` nor `--system-labels-to-wipe` on the CLI: +// +// - No wipe flags: PreRunE pre-populates +// `--system-labels-to-wipe=STATE,EPHEMERAL`. The server-side +// reset codepath, when SystemPartitionsToWipe is non-empty, +// takes the label-driven path and "keep[s] other partitions +// intact" per upstream's `--system-labels-to-wipe` flag doc in +// `cmd/talosctl/cmd/talos/reset.go`. META survives; on the next +// boot Talos rejoins the cluster from META without operator +// intervention. +// - Operator passed `--wipe-mode=...`: the safety override is +// skipped. `--wipe-mode=all` remains the explicit destructive +// opt-in; `--wipe-mode=system-disk` / `--wipe-mode=user-disks` +// also bypass the flip on the assumption that the operator +// stated wipe-scope intent. +// - Operator passed `--system-labels-to-wipe=...`: the operator's +// list is honored byte-for-byte. The wrapper does not silently +// expand a narrower selection (e.g. STATE alone) to the safe +// default — operators choosing a narrower scope are doing so +// deliberately. +// +// Help-text overrides on both flags spell out the divergence so +// `talm reset --help` carries the operator-facing story. +// +// Chain order: capture the wrapTalosCommand-installed PreRunE first, +// run the flip BEFORE chaining. Order is not load-bearing here +// (modeline does not touch wipe flags), but matching the shape of +// the crashdump / rotate-ca wrappers keeps the dispatch site +// readable. +func wrapResetCommand(wrappedCmd *cobra.Command) { + if wipeFlag := wrappedCmd.Flag("wipe-mode"); wipeFlag != nil { + wipeFlag.Usage = "disk reset mode (talm default: --system-labels-to-wipe=" + resetSafeDefaultLabels + + " preserves META so the node self-recovers; pass --wipe-mode=all or --wipe-mode=system-disk explicitly for upstream's destructive behaviour — both destroy META)" + } + + if labelsFlag := wrappedCmd.Flag("system-labels-to-wipe"); labelsFlag != nil { + labelsFlag.Usage = "wipe selected system disk partitions by label, keeping others intact (talm default when no wipe flag is set: " + + resetSafeDefaultLabels + ")" + } + + originalPreRunE := wrappedCmd.PreRunE + + wrappedCmd.PreRunE = func(cmd *cobra.Command, args []string) error { + if !cmd.Flags().Changed("wipe-mode") && !cmd.Flags().Changed("system-labels-to-wipe") { + if err := cmd.Flags().Set("system-labels-to-wipe", resetSafeDefaultLabels); err != nil { + return errors.WithHint( + errors.Wrap(err, "applying talm safe-default wipe labels"), + "this should not happen at runtime; if it does, fall back to passing --system-labels-to-wipe=STATE,EPHEMERAL explicitly", + ) + } + } + + if originalPreRunE != nil { + return originalPreRunE(cmd, args) + } + + return nil + } +} diff --git a/pkg/commands/reset_handler_test.go b/pkg/commands/reset_handler_test.go new file mode 100644 index 0000000..56dfa26 --- /dev/null +++ b/pkg/commands/reset_handler_test.go @@ -0,0 +1,345 @@ +// Copyright Cozystack Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package commands + +import ( + "strings" + "testing" + + taloscommands "github.com/siderolabs/talos/cmd/talosctl/cmd/talos" + "github.com/spf13/cobra" +) + +// registerResetFlagsForTest stands up the two reset flags talm cares +// about in a shape that matches upstream's registrations in +// `cmd/talosctl/cmd/talos/reset.go`: +// +// - `--wipe-mode` accepts a free-form string here (upstream uses a +// custom pflag.Value with an enum, but the wrapper only consults +// Changed() and the string value, so a plain StringVar with the +// upstream default `"all"` is enough to exercise the flip gate). +// - `--system-labels-to-wipe` is a StringSlice (repeatable + comma- +// split) to match upstream's `StringSliceVar`. +func registerResetFlagsForTest(cmd *cobra.Command, wipeModeStore *string, labelsStore *[]string) { + cmd.Flags().StringVar(wipeModeStore, "wipe-mode", "all", "disk reset mode") + cmd.Flags().StringSliceVar(labelsStore, "system-labels-to-wipe", nil, "system disk partitions to wipe by label") +} + +// TestWrapResetCommand_NoFlags_AppliesSafeDefault pins the headline +// behaviour change for #185: when an operator runs `talm reset` +// without choosing a wipe scope, the wrapper's PreRunE pre-populates +// `--system-labels-to-wipe` with STATE,EPHEMERAL so the META +// partition is preserved and the node self-recovers on reboot. +// +// `--wipe-mode` is intentionally left unchanged (still upstream's +// default). The server-side reset codepath, when SystemPartitionsToWipe +// is non-empty, takes the label-driven path regardless of Mode — see +// the upstream `--system-labels-to-wipe` flag doc in +// `cmd/talosctl/cmd/talos/reset.go`: "if set, just wipe selected +// system disk partitions by label but keep other partitions intact". +// The issue's manual reproduction confirms this empirically. +func TestWrapResetCommand_NoFlags_AppliesSafeDefault(t *testing.T) { + cmd := &cobra.Command{Use: resetCmdName} + + var ( + wipeMode string + systemLabelsToWipe []string + ) + + registerResetFlagsForTest(cmd, &wipeMode, &systemLabelsToWipe) + + wrapResetCommand(cmd) + + if err := cmd.PreRunE(cmd, nil); err != nil { + t.Fatalf("PreRunE returned: %v", err) + } + + got, err := cmd.Flags().GetStringSlice("system-labels-to-wipe") + if err != nil { + t.Fatalf("get --system-labels-to-wipe: %v", err) + } + + want := strings.Split(resetSafeDefaultLabels, ",") + if len(got) != len(want) { + t.Fatalf("safe default must populate --system-labels-to-wipe=%s; got %v", resetSafeDefaultLabels, got) + } + + for i := range want { + if got[i] != want[i] { + t.Errorf("safe default labels[%d]: got %q, want %q", i, got[i], want[i]) + } + } + + if cmd.Flags().Changed("wipe-mode") { + t.Errorf("safe default must not touch --wipe-mode (operator opt-in via --wipe-mode=all stays the only destructive path); got Changed()=true") + } +} + +// TestWrapResetCommand_ExplicitWipeModeAll_SkipsDefault pins the +// opt-out contract: an operator who explicitly passes +// `--wipe-mode=all` is asking for upstream's destructive behaviour +// (wipe META along with everything else). The wrapper MUST NOT +// silently add `--system-labels-to-wipe=STATE,EPHEMERAL` in that +// case — doing so would override the operator's stated intent and +// quietly turn a destructive reset into a selective one. +func TestWrapResetCommand_ExplicitWipeModeAll_SkipsDefault(t *testing.T) { + cmd := &cobra.Command{Use: resetCmdName} + + var ( + wipeMode string + systemLabelsToWipe []string + ) + + registerResetFlagsForTest(cmd, &wipeMode, &systemLabelsToWipe) + + wrapResetCommand(cmd) + + if err := cmd.Flags().Set("wipe-mode", "all"); err != nil { + t.Fatalf("set --wipe-mode=all: %v", err) + } + + if err := cmd.PreRunE(cmd, nil); err != nil { + t.Fatalf("PreRunE returned: %v", err) + } + + got, err := cmd.Flags().GetStringSlice("system-labels-to-wipe") + if err != nil { + t.Fatalf("get --system-labels-to-wipe: %v", err) + } + + if len(got) != 0 { + t.Errorf("explicit --wipe-mode=all must skip the safety override (operator opted into destructive); got --system-labels-to-wipe=%v", got) + } +} + +// TestWrapResetCommand_ExplicitLabels_PreservesOperatorChoice pins +// the operator-list-honored contract: when the operator already +// passed `--system-labels-to-wipe=...`, the wrapper MUST NOT silently +// expand the list (e.g. by appending EPHEMERAL). An operator who +// passes a narrower list than the safe default is doing so +// deliberately and the wrapper must respect it byte-for-byte. +func TestWrapResetCommand_ExplicitLabels_PreservesOperatorChoice(t *testing.T) { + cmd := &cobra.Command{Use: resetCmdName} + + var ( + wipeMode string + systemLabelsToWipe []string + ) + + registerResetFlagsForTest(cmd, &wipeMode, &systemLabelsToWipe) + + wrapResetCommand(cmd) + + if err := cmd.Flags().Set("system-labels-to-wipe", "STATE"); err != nil { + t.Fatalf("set --system-labels-to-wipe=STATE: %v", err) + } + + if err := cmd.PreRunE(cmd, nil); err != nil { + t.Fatalf("PreRunE returned: %v", err) + } + + got, err := cmd.Flags().GetStringSlice("system-labels-to-wipe") + if err != nil { + t.Fatalf("get --system-labels-to-wipe: %v", err) + } + + if len(got) != 1 || got[0] != "STATE" { + t.Errorf("operator's explicit narrower list must be honored byte-for-byte; got %v, want [STATE]", got) + } +} + +// TestWrapResetCommand_ExplicitWipeModeSystemDisk_SkipsDefault pins +// the value-agnostic gate: the wrapper checks `Changed("wipe-mode")` +// only, not the value. Any --wipe-mode=... selection counts as +// "operator stated wipe-scope intent" and disables the safety +// override. Verified for --wipe-mode=system-disk (the other +// destructive lane besides --wipe-mode=all); --wipe-mode=user-disks +// would behave the same way at the gate level but doesn't touch +// system partitions server-side, so it's safe either way. +func TestWrapResetCommand_ExplicitWipeModeSystemDisk_SkipsDefault(t *testing.T) { + cmd := &cobra.Command{Use: resetCmdName} + + var ( + wipeMode string + systemLabelsToWipe []string + ) + + registerResetFlagsForTest(cmd, &wipeMode, &systemLabelsToWipe) + + wrapResetCommand(cmd) + + if err := cmd.Flags().Set("wipe-mode", "system-disk"); err != nil { + t.Fatalf("set --wipe-mode=system-disk: %v", err) + } + + if err := cmd.PreRunE(cmd, nil); err != nil { + t.Fatalf("PreRunE returned: %v", err) + } + + got, err := cmd.Flags().GetStringSlice("system-labels-to-wipe") + if err != nil { + t.Fatalf("get --system-labels-to-wipe: %v", err) + } + + if len(got) != 0 { + t.Errorf("explicit --wipe-mode=system-disk must skip the safety override (operator stated wipe-scope intent); got --system-labels-to-wipe=%v", got) + } +} + +// TestWrapResetCommand_BothFlagsChanged_SkipsDefault pins the +// no-interference contract for the mixed case: when the operator +// passed BOTH `--wipe-mode=...` and `--system-labels-to-wipe=...`, +// the wrapper MUST NOT touch either flag. The operator has stated +// intent on both axes; the wrapper passes the request through to +// upstream as-is. +func TestWrapResetCommand_BothFlagsChanged_SkipsDefault(t *testing.T) { + cmd := &cobra.Command{Use: resetCmdName} + + var ( + wipeMode string + systemLabelsToWipe []string + ) + + registerResetFlagsForTest(cmd, &wipeMode, &systemLabelsToWipe) + + wrapResetCommand(cmd) + + if err := cmd.Flags().Set("wipe-mode", "system-disk"); err != nil { + t.Fatalf("set --wipe-mode=system-disk: %v", err) + } + + if err := cmd.Flags().Set("system-labels-to-wipe", "STATE"); err != nil { + t.Fatalf("set --system-labels-to-wipe=STATE: %v", err) + } + + if err := cmd.PreRunE(cmd, nil); err != nil { + t.Fatalf("PreRunE returned: %v", err) + } + + got, err := cmd.Flags().GetStringSlice("system-labels-to-wipe") + if err != nil { + t.Fatalf("get --system-labels-to-wipe: %v", err) + } + + if len(got) != 1 || got[0] != "STATE" { + t.Errorf("both flags changed: wrapper must not interfere; got --system-labels-to-wipe=%v, want [STATE]", got) + } +} + +// TestWrapResetCommand_HelpTextMentionsMetaPreservation pins the +// operator-facing help-text contract against drift. The default +// flip is invisible without a clear `talm reset --help` story: +// operators relying on the help text to learn what changed will +// otherwise be surprised when `talm reset` (no flags) suddenly +// preserves META instead of wiping it. +// +// Both flags get a wrapper-side Usage override: +// - `--wipe-mode`: mentions the talm safe default + the explicit +// opt-out (`--wipe-mode=all`) for the destructive case. +// - `--system-labels-to-wipe`: spells out the safe default labels +// so an operator inspecting either flag's help finds the same +// story. +// +// The substrings asserted here are narrow ("preserves META" and the +// `resetSafeDefaultLabels` literal) — enough to pin intent without +// over-fitting to exact prose. +func TestWrapResetCommand_HelpTextMentionsMetaPreservation(t *testing.T) { + cmd := &cobra.Command{Use: resetCmdName} + + var ( + wipeMode string + systemLabelsToWipe []string + ) + + registerResetFlagsForTest(cmd, &wipeMode, &systemLabelsToWipe) + + wrapResetCommand(cmd) + + wipeFlag := cmd.Flag("wipe-mode") + if wipeFlag == nil { + t.Fatal("--wipe-mode flag missing after wrap") + } + + if !strings.Contains(wipeFlag.Usage, "preserves META") { + t.Errorf("--wipe-mode help must mention that the talm default preserves META; got: %q", wipeFlag.Usage) + } + + // Both --wipe-mode=all and --wipe-mode=system-disk land in the + // destructive server-side branch when --system-labels-to-wipe is + // empty. Operators picking system-disk to "be less destructive" + // are walking into the same trap — pin both names in the help + // text so the surface they read describes both opt-out lanes. + if !strings.Contains(wipeFlag.Usage, "system-disk") { + t.Errorf("--wipe-mode help must mention --wipe-mode=system-disk as an opt-out lane (also destroys META); got: %q", wipeFlag.Usage) + } + + labelsFlag := cmd.Flag("system-labels-to-wipe") + if labelsFlag == nil { + t.Fatal("--system-labels-to-wipe flag missing after wrap") + } + + if !strings.Contains(labelsFlag.Usage, resetSafeDefaultLabels) { + t.Errorf("--system-labels-to-wipe help must mention the safe default labels (%s); got: %q", resetSafeDefaultLabels, labelsFlag.Usage) + } +} + +// TestWrapResetCommand_RealUpstreamResetDispatched is the end-to-end +// pin that talm's dispatch in wrapTalosCommand actually routes the +// real upstream `reset` command through wrapResetCommand. Without +// this test a refactor of the dispatch chain could silently drop +// the wrapper and the unit tests above (synthetic-cobra) would all +// keep passing while production stops applying the safe default. +// +// Pattern mirrors TestWrapCrashdumpCommand-style real-upstream tests: +// pull the real cobra.Command out of taloscommands.Commands, run it +// through talm's wrapTalosCommand, then invoke the wrapped PreRunE +// and assert the META-preserving default landed on the slice. +func TestWrapResetCommand_RealUpstreamResetDispatched(t *testing.T) { + var resetCmd *cobra.Command + + for _, cmd := range taloscommands.Commands { + if cmd.Name() == resetCmdName { + resetCmd = cmd + + break + } + } + + if resetCmd == nil { + t.Skipf("upstream taloscommands.Commands has no %q command — schema changed", resetCmdName) + } + + wrapped := wrapTalosCommand(resetCmd, resetCmdName) + + if err := wrapped.PreRunE(wrapped, nil); err != nil { + t.Fatalf("wrapped reset PreRunE returned: %v", err) + } + + got, err := wrapped.Flags().GetStringSlice("system-labels-to-wipe") + if err != nil { + t.Fatalf("get --system-labels-to-wipe on wrapped reset: %v", err) + } + + want := strings.Split(resetSafeDefaultLabels, ",") + if len(got) != len(want) { + t.Fatalf("dispatch must route real upstream reset through wrapResetCommand (expected --system-labels-to-wipe=%s after PreRunE); got %v", resetSafeDefaultLabels, got) + } + + for i := range want { + if got[i] != want[i] { + t.Errorf("dispatched safe default labels[%d]: got %q, want %q", i, got[i], want[i]) + } + } +} diff --git a/pkg/commands/talosctl_wrapper.go b/pkg/commands/talosctl_wrapper.go index 4b0f3ea..be51d03 100644 --- a/pkg/commands/talosctl_wrapper.go +++ b/pkg/commands/talosctl_wrapper.go @@ -318,6 +318,16 @@ func wrapTalosCommand(cmd *cobra.Command, cmdName string) *cobra.Command { wrapTUICommand(wrappedCmd, baseCmdName) } + // Special handling for reset: flip talm's default to the + // META-preserving selective wipe when the operator passed no + // wipe-related flags. Upstream's --wipe-mode=all destroys META + // and prevents self-recovery; the safer default matches the + // recipe operators reach for anyway. See wrapResetCommand + // godoc for the precedence rules. + if baseCmdName == resetCmdName { + wrapResetCommand(wrappedCmd) + } + // Copy all subcommands for _, subCmd := range cmd.Commands() { wrappedCmd.AddCommand(wrapTalosCommand(subCmd, subCmd.Name()))