Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package org.zstack.compute.vm;

import org.springframework.beans.factory.annotation.Autowired;
import org.zstack.core.cloudbus.CloudBusCallBack;
import org.zstack.core.componentloader.PluginRegistry;
import org.zstack.core.db.Q;
import org.zstack.core.gc.GC;
import org.zstack.core.gc.GCCompletion;
import org.zstack.core.gc.TimeBasedGarbageCollector;
import org.zstack.header.host.HostVO;
import org.zstack.header.message.MessageReply;
import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg;
import org.zstack.header.storage.primary.PrimaryStorageConstant;
import org.zstack.header.storage.primary.PrimaryStorageVO;
import org.zstack.header.storage.primary.PrimaryStorageVO_;
import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint;
import org.zstack.utils.Utils;
import org.zstack.utils.logging.CLogger;

public class CleanupVmInstanceMetadataOnPrimaryStorageGC extends TimeBasedGarbageCollector {
private static final CLogger logger = Utils.getLogger(CleanupVmInstanceMetadataOnPrimaryStorageGC.class);

@Autowired
private PluginRegistry pluginRgty;

@GC
public String primaryStorageUuid;
@GC
public String vmUuid;
@GC
public String rootVolumeUuid;
@GC
public String metadataPath;
@GC
public String hostUuid;

public static String getGCName(String vmUuid) {
return String.format("gc-cleanup-vm-metadata-%s", vmUuid);
}

@Override
protected void triggerNow(GCCompletion completion) {
if (!dbf.isExist(primaryStorageUuid, PrimaryStorageVO.class)) {
logger.debug(String.format("[MetadataCleanupGC] primary storage[uuid:%s] no longer exists, " +
"cancel gc for vm[uuid:%s]", primaryStorageUuid, vmUuid));
completion.cancel();
return;
}

String psType = Q.New(PrimaryStorageVO.class).select(PrimaryStorageVO_.type).eq(PrimaryStorageVO_.uuid, primaryStorageUuid).findValue();
if (psType == null) {
logger.debug(String.format("[MetadataCleanupGC] primary storage[uuid:%s] type not found, " +
"cancel gc for vm[uuid:%s]", primaryStorageUuid, vmUuid));
completion.cancel();
return;
}

VmMetadataPathBuildExtensionPoint ext = pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class);
boolean requireHost = ext != null && ext.requireHostForCleanup();

if (hostUuid == null && requireHost) {
logger.debug(String.format("[MetadataCleanupGC] hostUuid is null and ps[uuid:%s, type:%s] " +
"requires host for cleanup, cancel gc for vm[uuid:%s]",
primaryStorageUuid, psType, vmUuid));
completion.cancel();
return;
}

if (hostUuid != null && !dbf.isExist(hostUuid, HostVO.class)) {
if (requireHost) {
logger.debug(String.format("[MetadataCleanupGC] host[uuid:%s] no longer exists " +
"and ps[uuid:%s, type:%s] requires host for cleanup, cancel gc for vm[uuid:%s]",
hostUuid, primaryStorageUuid, psType, vmUuid));
completion.cancel();
return;
}

logger.info(String.format("[MetadataCleanupGC] host[uuid:%s] no longer exists for vm[uuid:%s], " +
"clear hostUuid and let the primary storage backend pick an available host", hostUuid, vmUuid));
hostUuid = null;
Comment on lines +69 to +80
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

仅检查 HostVO 是否存在不够,GC 会在失效宿主上永久重试。

这里把“host 记录还在”当成“host 还能执行 cleanup”,但 LocalStorage 的后续路径还会校验该 host 是否仍然挂着当前 PS。宿主机只是从 PS 脱离时,这里会继续带着旧 hostUuid 发送消息,随后失败又走 completion.fail(),定时 GC 会一直拿同一个失效 host 重试。发送前至少要校验 host/PS 关联,或者在关联失效时清空 hostUuid 再走回退逻辑。

Also applies to: 98-101

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@compute/src/main/java/org/zstack/compute/vm/CleanupVmInstanceMetadataOnPrimaryStorageGC.java`
around lines 69 - 80, The code only checks host existence (dbf.isExist(hostUuid,
HostVO.class)) but must also verify the host is still associated with the
primary storage so GC doesn't keep retrying on a detached/invalid host; update
the logic in CleanupVmInstanceMetadataOnPrimaryStorageGC where hostUuid is
validated (the block referencing hostUuid, primaryStorageUuid, psType, vmUuid,
requireHost, completion) to additionally check the Host-PrimaryStorage
association (e.g., query HostPrimaryStorageRefVO or an equivalent relation for
hostUuid + primaryStorageUuid) instead of only HostVO existence; if the
association is missing treat the host as invalid: if requireHost keep the
existing cancel flow, otherwise set hostUuid = null and proceed with the
fallback branch (and apply the same association check and hostUuid-clear logic
at the other occurrence around lines 98-101) so we clear detached hostUuid
before sending cleanup messages and avoid permanent retries.

}

CleanupVmInstanceMetadataOnPrimaryStorageMsg msg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg();
msg.setPrimaryStorageUuid(primaryStorageUuid);
msg.setVmInstanceUuid(vmUuid);
msg.setRootVolumeUuid(rootVolumeUuid);
msg.setMetadataPath(metadataPath);
msg.setHostUuid(hostUuid);

bus.makeTargetServiceIdByResourceUuid(msg, PrimaryStorageConstant.SERVICE_ID, primaryStorageUuid);
bus.send(msg, new CloudBusCallBack(completion) {
@Override
public void run(MessageReply reply) {
if (reply.isSuccess()) {
logger.info(String.format("[MetadataCleanupGC] successfully cleaned up metadata " +
"for vm[uuid:%s] on ps[uuid:%s]", vmUuid, primaryStorageUuid));
completion.success();
} else {
logger.warn(String.format("[MetadataCleanupGC] failed to clean up metadata " +
"for vm[uuid:%s] on ps[uuid:%s]: %s", vmUuid, primaryStorageUuid, reply.getError()));
completion.fail(reply.getError());
}
}
});
}
}
135 changes: 135 additions & 0 deletions compute/src/main/java/org/zstack/compute/vm/VmExpungeMetadataFlow.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package org.zstack.compute.vm;

import org.springframework.beans.factory.annotation.Autowire;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Configurable;
import org.zstack.core.cloudbus.CloudBus;
import org.zstack.core.cloudbus.CloudBusCallBack;
import org.zstack.core.componentloader.PluginRegistry;
import org.zstack.core.db.Q;
import org.zstack.header.core.workflow.FlowTrigger;
import org.zstack.header.core.workflow.NoRollbackFlow;
import org.zstack.header.message.MessageReply;
import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg;
import org.zstack.header.storage.primary.PrimaryStorageConstant;
import org.zstack.header.storage.primary.PrimaryStorageVO;
import org.zstack.header.storage.primary.PrimaryStorageVO_;
import org.zstack.header.vm.VmInstanceConstant;
import org.zstack.header.vm.VmInstanceSpec;
import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint;
import org.zstack.header.volume.VolumeInventory;
import org.zstack.utils.Utils;
import org.zstack.utils.logging.CLogger;

import java.util.Map;
import java.util.concurrent.TimeUnit;

@Configurable(preConstruction = true, autowire = Autowire.BY_TYPE)
public class VmExpungeMetadataFlow extends NoRollbackFlow {
private static final CLogger logger = Utils.getLogger(VmExpungeMetadataFlow.class);

@Autowired
private CloudBus bus;
@Autowired
private PluginRegistry pluginRgty;

@Override
public void run(FlowTrigger trigger, Map data) {
final VmInstanceSpec spec = (VmInstanceSpec) data.get(VmInstanceConstant.Params.VmInstanceSpec.toString());
if (spec == null || spec.getVmInventory() == null) {
logger.warn("[MetadataExpunge] missing VmInstanceSpec or VmInventory, skip metadata cleanup");
trigger.next();
return;
}

final String vmUuid = spec.getVmInventory().getUuid();

VolumeInventory rootVolume = spec.getVmInventory().getRootVolume();
String psUuid = rootVolume != null ? rootVolume.getPrimaryStorageUuid() : null;
if (psUuid == null) {
logger.debug(String.format("[MetadataExpunge] vm[uuid:%s] root volume has no primaryStorageUuid, " +
"skipping metadata cleanup", vmUuid));
trigger.next();
return;
}


String psType = Q.New(PrimaryStorageVO.class).select(PrimaryStorageVO_.type).eq(PrimaryStorageVO_.uuid, psUuid).findValue();
if (psType == null) {
logger.warn(String.format("[MetadataExpunge] primary storage[uuid:%s] not found for vm[uuid:%s], " +
"skip metadata cleanup", psUuid, vmUuid));
trigger.next();
return;
}

VmMetadataPathBuildExtensionPoint ext = pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class);
if (ext == null) {
logger.warn(String.format("[MetadataExpunge] no VmMetadataPathBuildExtensionPoint found for ps[uuid:%s, type:%s], " +
"skip metadata cleanup", psUuid, psType));
trigger.next();
return;
}
final String metadataPath;
try {
metadataPath = ext.buildVmMetadataPath(psUuid, vmUuid);
} catch (Exception e) {
logger.warn(String.format("[MetadataExpunge] failed to build metadata path for vm[uuid:%s] on ps[uuid:%s], " +
"skip metadata cleanup: %s", vmUuid, psUuid, e.getMessage()));
trigger.next();
return;
}

String hostUuid = spec.getVmInventory().getHostUuid();
if (hostUuid == null) {
hostUuid = spec.getVmInventory().getLastHostUuid();
}

if (hostUuid == null && ext.requireHostForCleanup()) {
logger.warn(String.format("[MetadataExpunge] vm[uuid:%s] hostUuid is null, " +
"ps[uuid:%s, type:%s] requires host for cleanup, skip without submitting GC",
vmUuid, psUuid, psType));
trigger.next();
return;
}

String rootVolumeUuid = rootVolume.getUuid();
CleanupVmInstanceMetadataOnPrimaryStorageMsg cmsg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg();
cmsg.setPrimaryStorageUuid(psUuid);
cmsg.setVmInstanceUuid(vmUuid);
cmsg.setMetadataPath(metadataPath);
cmsg.setRootVolumeUuid(rootVolumeUuid);
cmsg.setHostUuid(hostUuid);
final String finalPsUuid = psUuid;
final String finalHostUuid = hostUuid;

bus.makeTargetServiceIdByResourceUuid(cmsg, PrimaryStorageConstant.SERVICE_ID, psUuid);
bus.send(cmsg, new CloudBusCallBack(trigger) {
@Override
public void run(MessageReply reply) {
if (reply.isSuccess()) {
logger.info(String.format("[MetadataExpunge] successfully deleted metadata for vm[uuid:%s] on ps[uuid:%s]",
vmUuid, finalPsUuid));
} else {
logger.warn(String.format("[MetadataExpunge] failed to delete metadata for vm[uuid:%s] on ps[uuid:%s]: %s, " +
"submitting GC job for retry", vmUuid, finalPsUuid, reply.getError()));
submitGC(finalPsUuid, vmUuid, rootVolumeUuid, metadataPath, finalHostUuid);
}
trigger.next();
}
});
}

private void submitGC(String psUuid, String vmUuid, String rootVolumeUuid, String metadataPath, String hostUuid) {
CleanupVmInstanceMetadataOnPrimaryStorageGC gc = new CleanupVmInstanceMetadataOnPrimaryStorageGC();
gc.NAME = CleanupVmInstanceMetadataOnPrimaryStorageGC.getGCName(vmUuid, psUuid);
gc.primaryStorageUuid = psUuid;
gc.vmUuid = vmUuid;
gc.rootVolumeUuid = rootVolumeUuid;
gc.metadataPath = metadataPath;
gc.hostUuid = hostUuid;
long gcIntervalSec = TimeUnit.HOURS.toSeconds(VmGlobalConfig.VM_METADATA_CLEANUP_GC_INTERVAL.value(Long.class));
gc.deduplicateSubmit(gcIntervalSec, TimeUnit.SECONDS);
Comment on lines +122 to +131
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

deduplicateSubmit() 会保留旧的 GC 上下文。

这里的 GC 名字只由 vmUuidpsUuid 组成,但真正的清理目标还受 metadataPathrootVolumeUuidhostUuid 影响。deduplicateSubmit() 命中已有任务后不会刷新这些 @GC 字段,所以同一 VM 后续如果发生 metadata rebasing、root volume 变化或 host 变化,新的提交会被静默丢弃,GC 继续重试第一次保存的旧目标。需要先更新已有 GC 上下文,或者把去重键改成真正唯一的清理目标。

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@compute/src/main/java/org/zstack/compute/vm/VmExpungeMetadataFlow.java`
around lines 122 - 131, submitGC currently sets GC.NAME using only vmUuid and
psUuid via CleanupVmInstanceMetadataOnPrimaryStorageGC.getGCName(vmUuid, psUuid)
so deduplicateSubmit() can skip updating context fields (metadataPath,
rootVolumeUuid, hostUuid) when a previous GC exists; either ensure the dedupe
key is truly unique to the cleanup target by including
metadataPath/rootVolumeUuid/hostUuid (or a hash thereof) in GC.NAME/getGCName,
or look up an existing CleanupVmInstanceMetadataOnPrimaryStorageGC by GC.NAME
inside submitGC and update its metadataPath, rootVolumeUuid and hostUuid fields
before calling deduplicateSubmit(); modify submitGC and/or getGCName accordingly
so deduplicateSubmit() will not retain stale cleanup context.


logger.info(String.format("[MetadataExpunge] submitted GC job [%s] for vm[uuid:%s] on ps[uuid:%s]", gc.NAME, vmUuid, psUuid));
}
}
48 changes: 48 additions & 0 deletions compute/src/main/java/org/zstack/compute/vm/VmGlobalConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,52 @@ public class VmGlobalConfig {
@GlobalConfigDef(defaultValue = "false", type = Boolean.class, description = "allowed TPM VM start without KMS")
@GlobalConfigValidation(validValues = {"true", "false"})
public static GlobalConfig ALLOWED_TPM_VM_WITHOUT_KMS = new GlobalConfig(CATEGORY, "allowed.tpm.vm.without.kms");

@GlobalConfigValidation(validValues = {"true", "false"})
public static GlobalConfig VM_METADATA_ENABLED = new GlobalConfig(CATEGORY, "vm.metadata.enabled");

@GlobalConfigValidation()
public static GlobalConfig VM_METADATA_LAST_REFRESH_VERSION = new GlobalConfig(CATEGORY, "vm.metadata.lastRefreshVersion");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 100)
public static GlobalConfig VM_METADATA_FLUSH_CONCURRENCY = new GlobalConfig(CATEGORY, "vm.metadata.flush.concurrency");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 300)
public static GlobalConfig VM_METADATA_FLUSH_POLL_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.flush.pollInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 1000)
public static GlobalConfig VM_METADATA_FLUSH_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.flush.batchSize");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 168)
public static GlobalConfig VM_METADATA_CLEANUP_GC_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.cleanup.gc.interval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 20)
public static GlobalConfig VM_METADATA_FLUSH_MAX_RETRY = new GlobalConfig(CATEGORY, "vm.metadata.flush.maxRetry");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 120)
public static GlobalConfig VM_METADATA_FLUSH_ZOMBIE_CLAIM_THRESHOLD = new GlobalConfig(CATEGORY, "vm.metadata.flush.zombieClaimThreshold");

@GlobalConfigValidation(numberGreaterThan = 21599, numberLessThan = 172801)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 86400)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 1000)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_MAX_CYCLES = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryMaxCycles");

@GlobalConfigValidation(numberGreaterThan = 0)
public static GlobalConfig VM_METADATA_PAYLOAD_REJECT_THRESHOLD = new GlobalConfig(CATEGORY, "vm.metadata.payload.rejectThreshold");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 86400)
public static GlobalConfig VM_METADATA_MAINTENANCE_ORPHAN_CHECK_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.orphanCheckInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 20)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryBatchSize");

@GlobalConfigValidation(numberGreaterThan = 9, numberLessThan = 201)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftBatchSize");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 31)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_BATCH_SLEEP_SEC = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftBatchSleepSec");
}
Loading