Skip to content

Commit

Permalink
core: RemoveImage: handle failure in VDS command
Browse files Browse the repository at this point in the history
RemoveImageCommand used to call for DeleteImageGroup VDS command, and
perform the required DB operations without polling on the Async Task.

One issue that it led to is when disk is configured with "Wipe on
Delete", and the wipe action took more than 50 hours (default max
async task polling time) - the task was interrupted, the image is not
deleted in VDSM, but removed from DB, not having any tracking for them
in the engine.

In this patch we add async task polling to RemoveImageCommand and
handle the removal of the image from the DB only if it succeeded,
otherwise we mark the image as ILLEGAL, and add an Audit Log, logging
the error and the IDs of the volumes that might need manual removal.
Additional removal of the now ILLEGAL disk will succeed, as the command
will not find any volumes associated with it in VDSM

Bug-Url: https://bugzilla.redhat.com/1836318
  • Loading branch information
mkemel authored and ahadas committed Sep 22, 2022
1 parent a95002c commit 0b92d02
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,19 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import javax.enterprise.inject.Instance;
import javax.enterprise.inject.Typed;
import javax.inject.Inject;

import org.apache.commons.lang.StringUtils;
import org.ovirt.engine.core.bll.InternalCommandAttribute;
import org.ovirt.engine.core.bll.NonTransactiveCommandAttribute;
import org.ovirt.engine.core.bll.context.CommandContext;
import org.ovirt.engine.core.bll.storage.domain.PostDeleteActionHandler;
import org.ovirt.engine.core.bll.tasks.interfaces.CommandCallback;
import org.ovirt.engine.core.common.AuditLogType;
import org.ovirt.engine.core.common.VdcObjectType;
import org.ovirt.engine.core.common.action.ActionType;
import org.ovirt.engine.core.common.action.RemoveImageParameters;
Expand All @@ -32,6 +38,7 @@
import org.ovirt.engine.core.common.vdscommands.VDSReturnValue;
import org.ovirt.engine.core.compat.Guid;
import org.ovirt.engine.core.compat.TransactionScopeOption;
import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector;
import org.ovirt.engine.core.dao.BaseDiskDao;
import org.ovirt.engine.core.dao.DiskImageDao;
import org.ovirt.engine.core.dao.DiskImageDynamicDao;
Expand All @@ -56,6 +63,8 @@ public class RemoveImageCommand<T extends RemoveImageParameters> extends BaseIma
ActionType.RemoveVmTemplateFromImportExport,
ActionType.RemoveUnregisteredVmTemplate,
ActionType.RemoveUnregisteredVm));
@Inject
private AuditLogDirector auditLogDirector;

@Inject
private PostDeleteActionHandler postDeleteActionHandler;
Expand All @@ -79,6 +88,11 @@ public class RemoveImageCommand<T extends RemoveImageParameters> extends BaseIma
@Inject
private VmDao vmDao;

@Inject
@Typed(RemoveImageCommandCallback.class)
private Instance<RemoveImageCommandCallback> callbackProvider;


public RemoveImageCommand(T parameters, CommandContext cmdContext) {
super(parameters, cmdContext);
}
Expand Down Expand Up @@ -120,12 +134,13 @@ protected void executeCommand() {
Guid taskId = persistAsyncTaskPlaceHolder(getParameters().getParentCommand());

VDSReturnValue vdsReturnValue = performDeleteImageVdsmOperation();
getTaskIdList().add(
createTask(taskId,
vdsReturnValue.getCreationInfo(),
getParameters().getParentCommand(),
VdcObjectType.Storage,
getStorageDomainId()));
Guid vdsmTaskId = createTask(taskId,
vdsReturnValue.getCreationInfo(),
getParameters().getParentCommand(),
VdcObjectType.Storage,
getStorageDomainId());
getTaskIdList().add(vdsmTaskId);
getReturnValue().getVdsmTaskIdList().add(vdsmTaskId);
} catch (EngineException e) {
if (e.getErrorCode() == EngineError.ImageDoesNotExistInDomainError) {
log.info("Disk '{}' doesn't exist on storage domain '{}', rolling forward",
Expand All @@ -140,10 +155,6 @@ protected void executeCommand() {
throw e;
}
}

if (!ACTIONS_NOT_REQUIRED_DB_OPERATION.contains(getParameters().getParentCommand())) {
performImageDbOperations();
}
} else {
log.warn("DiskImage is null, nothing to remove");
}
Expand Down Expand Up @@ -295,6 +306,9 @@ private void removeImageMapping() {
}

private void performImageDbOperations() {
if (ACTIONS_NOT_REQUIRED_DB_OPERATION.contains(getParameters().getParentCommand())) {
return;
}
switch (getParameters().getDbOperationScope()) {
case IMAGE:
removeImageFromDB();
Expand Down Expand Up @@ -323,4 +337,33 @@ protected VDSReturnValue performDeleteImageVdsmOperation() {
getDiskImage().isWipeAfterDelete(), getStorageDomain().getDiscardAfterDelete(),
getParameters().isForceDelete())));
}

@Override
public CommandCallback getCallback() {
return callbackProvider.get();
}

public void onSucceeded() {
performImageDbOperations();
}

public void onFailed() {
if (getParameters().getAsyncTaskErrorMessage() != null) {
Guid diskId = getParameters().getDiskImage().getId();
List<Guid> snapshotIds = diskImageDao
.getAllSnapshotsForImageGroup(diskId).stream()
.map(DiskImage::getImageId)
.collect(Collectors.toList());
addCustomValue("DiskId", diskId.toString());
addCustomValue("VolumeIds", StringUtils.join(snapshotIds, ','));
addCustomValue("ErrorMessage", getParameters().getAsyncTaskErrorMessage());
auditLog(this, AuditLogType.DELETE_IMAGE_GROUP_TASK_FAILED);
}
imageDao.updateStatusOfImagesByImageGroupId(getImageGroupId(), ImageStatus.ILLEGAL);
}

@Override
protected void setSucceeded(boolean value) {
super.setSucceeded(value);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package org.ovirt.engine.core.bll.storage.disk.image;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.enterprise.inject.Typed;
import javax.inject.Inject;

import org.ovirt.engine.core.bll.tasks.CommandCoordinatorUtil;
import org.ovirt.engine.core.bll.tasks.interfaces.CommandCallback;
import org.ovirt.engine.core.common.action.RemoveImageParameters;
import org.ovirt.engine.core.common.businessentities.AsyncTaskStatus;
import org.ovirt.engine.core.compat.CommandStatus;
import org.ovirt.engine.core.compat.Guid;
import org.ovirt.engine.core.vdsbroker.vdsbroker.VDSErrorException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Typed(RemoveImageCommandCallback.class)
public class RemoveImageCommandCallback implements CommandCallback {
private static final Logger log = LoggerFactory.getLogger(RemoveImageCommandCallback.class);

@Inject
private CommandCoordinatorUtil commandCoordinatorUtil;

@Override
public void doPolling(Guid cmdId, List<Guid> childCmdIds) {
RemoveImageCommand<RemoveImageParameters> command = getRemoveImageCommand(cmdId);
Set<Guid> taskIds = new HashSet<>(command.getReturnValue().getVdsmTaskIdList());
Map<Guid, AsyncTaskStatus> idToTaskStatusMap = commandCoordinatorUtil.pollTasks(taskIds);
for (Map.Entry<Guid, AsyncTaskStatus> idToTaskStatus : idToTaskStatusMap.entrySet()) {
Guid id = idToTaskStatus.getKey();
AsyncTaskStatus status = idToTaskStatus.getValue();
if (status.getTaskIsRunning()) {
log.info("Waiting on remove image command to complete the task '{}'", id);
return;
}
}

List<AsyncTaskStatus> failedTasks = new ArrayList<>();
idToTaskStatusMap.forEach((id, status) -> {
if (!status.getTaskEndedSuccessfully()) {
failedTasks.add(status);
}
});
if (failedTasks.isEmpty()) {
command.setSucceeded(true);
command.setCommandStatus(CommandStatus.SUCCEEDED);
command.persistCommand(command.getParameters().getParentCommand());
log.info("Remove image command has completed successfully for disk '{}' with async task(s) '{}'.",
command.getParameters().getDiskImage().getId(), taskIds);
} else {
RuntimeException exception = failedTasks.get(0).getException();
if (exception instanceof VDSErrorException) {
command.getParameters()
.setAsyncTaskErrorMessage(((VDSErrorException) exception).getVdsError().getMessage());
}
command.setSucceeded(false);
command.setCommandStatus(CommandStatus.FAILED);
log.info("Remove image command has failed for disk '{}' with async task(s) '{}'.",
command.getParameters().getDiskImage().getId(), failedTasks);
}
command.persistCommand(command.getParameters().getParentCommand());
}

@Override
public void onSucceeded(Guid cmdId, List<Guid> childCmdIds) {
getRemoveImageCommand(cmdId).onSucceeded();
}

@Override
public void onFailed(Guid cmdId, List<Guid> childCmdIds) {
getRemoveImageCommand(cmdId).onFailed();
}

private RemoveImageCommand<RemoveImageParameters> getRemoveImageCommand(Guid cmdId) {
return commandCoordinatorUtil.retrieveCommand(cmdId);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ public enum AuditLogType {
MEMORY_HOT_UNPLUG_SUCCESSFULLY_REQUESTED_PLUS_MEMORY_INFO(2049),
NO_MEMORY_DEVICE_TO_HOT_UNPLUG(2050),
NO_SUITABLE_MEMORY_DEVICE_TO_HOT_UNPLUG(2051),
DELETE_IMAGE_GROUP_TASK_FAILED(2053, AuditLogSeverity.ERROR),

// Used only from SQL script, therefor should not have severity & message
USER_RUN_UNLOCK_ENTITY_SCRIPT(2024),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public class RemoveImageParameters extends ImagesContainterParametersBase implem
private ImageDbOperationScope dbOperationScope;
private boolean removeFromSnapshots;
private boolean shouldLockImage;
private String asyncTaskErrorMessage;

public RemoveImageParameters(Guid imageId) {
super(imageId, null);
Expand Down Expand Up @@ -55,4 +56,12 @@ public boolean isRemoveFromSnapshots() {
public void setRemoveFromSnapshots(boolean removeFromSnapshots) {
this.removeFromSnapshots = removeFromSnapshots;
}

public String getAsyncTaskErrorMessage() {
return asyncTaskErrorMessage;
}

public void setAsyncTaskErrorMessage(String asyncTaskErrorMessage) {
this.asyncTaskErrorMessage = asyncTaskErrorMessage;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1359,4 +1359,5 @@ VM_BACKUP_SCRATCH_DISK_CREATION_FAILED=Failed to create scratch disk for VM ${Vm
VNICS_OUT_OF_SYNC_ON_NETWORK_UPDATE=Update to network ${NetworkName} was not applied to virtual network interfaces [${VnicNames}]. The actual configuration on the interfaces may differ from the displayed one.
VNICS_OUT_OF_SYNC_ON_PROFILE_UPDATE=Update to network interface profile ${ProfileName} was not applied to virtual network interfaces [${VnicNames}]. The actual configuration on the interfaces may differ from the displayed one.
VM_BACKUP_SNAPSHOT_POSSIBLE_LEFTOVER=Possible failure while removing a snapshot that was generated automatically during backup of VM '${VmName}'. The snapshot may be removed manually
VM_BACKUP_FAILED_UNMONITORED=Backup ${BackupId} for VM ${VmName} was marked as failed since it is no longer monitored, cleanup may be required.
VM_BACKUP_FAILED_UNMONITORED=Backup ${BackupId} for VM ${VmName} was marked as failed since it is no longer monitored, cleanup may be required.
DELETE_IMAGE_GROUP_TASK_FAILED=Image group deletion task for disk ${DiskId} with volume(s) [${VolumeIds}] failed. Manual volume removal might be required. Underlying error message: ${ErrorMessage}

0 comments on commit 0b92d02

Please sign in to comment.