Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Image pruning fixes #16717

Merged
merged 3 commits into from
Oct 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 128 additions & 73 deletions pkg/image/prune/prune.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/golang/glog"
gonum "github.com/gonum/graph"

kerrapi "k8s.io/apimachinery/pkg/api/errors"
kmeta "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -75,7 +76,9 @@ type ImageStreamDeleter interface {
GetImageStream(stream *imageapi.ImageStream) (*imageapi.ImageStream, error)
// UpdateImageStream removes all references to the image from the image
// stream's status.tags. The updated image stream is returned.
UpdateImageStream(stream *imageapi.ImageStream, image *imageapi.Image, updatedTags []string) (*imageapi.ImageStream, error)
UpdateImageStream(stream *imageapi.ImageStream) (*imageapi.ImageStream, error)
// NotifyImageStreamPrune shows notification about updated image stream.
NotifyImageStreamPrune(stream *imageapi.ImageStream, updatedTags []string, deletedTags []string)
}

// BlobDeleter knows how to delete a blob from the Docker registry.
Expand Down Expand Up @@ -208,7 +211,7 @@ var _ Pruner = &pruner{}
//
// Also automatically remove any image layer that is no longer referenced by any
// images.
func NewPruner(options PrunerOptions) Pruner {
func NewPruner(options PrunerOptions) (Pruner, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

godoc

glog.V(1).Infof("Creating image pruner with keepYoungerThan=%v, keepTagRevisions=%s, pruneOverSizeLimit=%s, allImages=%s",
options.KeepYoungerThan, getValue(options.KeepTagRevisions), getValue(options.PruneOverSizeLimit), getValue(options.AllImages))

Expand All @@ -233,16 +236,20 @@ func NewPruner(options PrunerOptions) Pruner {
addImageStreamsToGraph(g, options.Streams, options.LimitRanges, algorithm)
addPodsToGraph(g, options.Pods, algorithm)
addReplicationControllersToGraph(g, options.RCs)
addBuildConfigsToGraph(g, options.BCs)
addBuildsToGraph(g, options.Builds)
if err := addBuildConfigsToGraph(g, options.BCs); err != nil {
return nil, err
}
if err := addBuildsToGraph(g, options.Builds); err != nil {
return nil, err
}
addDeploymentConfigsToGraph(g, options.DCs)

return &pruner{
g: g,
algorithm: algorithm,
registryClient: options.RegistryClient,
registryURL: options.RegistryURL,
}
}, nil
}

func getValue(option interface{}) string {
Expand Down Expand Up @@ -484,25 +491,31 @@ func addDeploymentConfigsToGraph(g graph.Graph, dcs *deployapi.DeploymentConfigL
// addBuildConfigsToGraph adds build configs to the graph.
//
// Edges are added to the graph from each build config to the image specified by its strategy.from.
func addBuildConfigsToGraph(g graph.Graph, bcs *buildapi.BuildConfigList) {
func addBuildConfigsToGraph(g graph.Graph, bcs *buildapi.BuildConfigList) error {
for i := range bcs.Items {
bc := &bcs.Items[i]
glog.V(4).Infof("Examining BuildConfig %s", getName(bc))
bcNode := buildgraph.EnsureBuildConfigNode(g, bc)
addBuildStrategyImageReferencesToGraph(g, bc.Spec.Strategy, bcNode)
if err := addBuildStrategyImageReferencesToGraph(g, bc.Spec.Strategy, bcNode); err != nil {
return fmt.Errorf("unable to add BuildConfig %s to graph: %v", getName(bc), err)
}
}
return nil
}

// addBuildsToGraph adds builds to the graph.
//
// Edges are added to the graph from each build to the image specified by its strategy.from.
func addBuildsToGraph(g graph.Graph, builds *buildapi.BuildList) {
func addBuildsToGraph(g graph.Graph, builds *buildapi.BuildList) error {
for i := range builds.Items {
build := &builds.Items[i]
glog.V(4).Infof("Examining build %s", getName(build))
buildNode := buildgraph.EnsureBuildNode(g, build)
addBuildStrategyImageReferencesToGraph(g, build.Spec.Strategy, buildNode)
if err := addBuildStrategyImageReferencesToGraph(g, build.Spec.Strategy, buildNode); err != nil {
return fmt.Errorf("unable to add Build %s to graph: %v", getName(build), err)
}
}
return nil
}

// addBuildStrategyImageReferencesToGraph ads references from the build strategy's parent node to the image
Expand All @@ -511,11 +524,11 @@ func addBuildsToGraph(g graph.Graph, builds *buildapi.BuildList) {
// Edges are added to the graph from each predecessor (build or build config)
// to the image specified by strategy.from, as long as the image is managed by
// OpenShift.
func addBuildStrategyImageReferencesToGraph(g graph.Graph, strategy buildapi.BuildStrategy, predecessor gonum.Node) {
func addBuildStrategyImageReferencesToGraph(g graph.Graph, strategy buildapi.BuildStrategy, predecessor gonum.Node) error {
from := buildapi.GetInputReference(strategy)
if from == nil {
glog.V(4).Infof("Unable to determine 'from' reference - skipping")
return
return nil
}

glog.V(4).Infof("Examining build strategy with from: %#v", from)
Expand All @@ -526,38 +539,39 @@ func addBuildStrategyImageReferencesToGraph(g graph.Graph, strategy buildapi.Bui
case "ImageStreamImage":
_, id, err := imageapi.ParseImageStreamImageName(from.Name)
if err != nil {
glog.V(2).Infof("Error parsing ImageStreamImage name %q: %v - skipping", from.Name, err)
return
return fmt.Errorf("error parsing ImageStreamImage name %q: %v", from.Name, err)
}
imageID = id
case "DockerImage":
ref, err := imageapi.ParseDockerImageReference(from.Name)
if err != nil {
glog.V(2).Infof("Error parsing DockerImage name %q: %v - skipping", from.Name, err)
return
return fmt.Errorf("error parsing DockerImage name %q: %v", from.Name, err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought I saw agreement to make this a warning, not fail the entire prune process? That would be my inclination.

Same for the ImageStreamImage check above.

If the reference is invalid, it clearly is not going to reference an image and keep us from pruning that image.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought I saw agreement to make this a warning, not fail the entire prune process? That would be my inclination.

@bparees Yes, but my answer was that you can't create such reference.

If the reference is invalid, it clearly is not going to reference an image and keep us from pruning that image.

You can not be sure that this Strategy is not going to reference an image. You just don't know. I believe that in the conditions when we are not sure we need to save user data. In the end the user will see the error and can decide to remove this build/buildconfig or fix reference.

#16717 (comment)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bparees Yes, but my answer was that you can't create such reference.

I think relying on that guarantee is a mistake. Bugs in the buildconfig validation could allow the creation of such a reference and then pruning is broken for no good reason.

You can not be sure that this Strategy is not going to reference an image.

I don't care what it may do in the future. I know it's not currently referencing an image. What the user may update it to reference in the future when they fix their typo is irrelevant. It may have tried to reference an image, but it is not referencing an image (it has an invalid reference, so it's not referencing anything).

I believe that in the conditions when we are not sure we need to save user data.

What are we saving? Can you give me a scenario where a user manages to create an image that we'd prune here despite an invalid reference existing in the system, that would ultimately be a problem?

If your argument is "the user meant to reference something but they did it wrong so we have to save everything until we know what they meant to reference", well what if they meant to reference "ruby" and they typed it as "riby"? It's a "valid" reference, so it won't trip this error, but the "ruby" image is going to get cleaned up in that case too. There's only so much protection we can offer.

In the end the user will see the error and can decide to remove this build/buildconfig or fix reference.

The admin running this is not the same as the user who created the broken reference. So your solution requires the admin to either go edit the user's object themselves, or track down the user and discuss how to resolve it. Neither of those are ideal situations for an admin who's just trying to free up space (not to mention that this is frequently run as an automated job that may or may not have good monitoring in place).

In your referenced comment thread you said "done" but it's not obvious to me what that means. You didn't add the suggested "force" flag and you didn't make this a warning. (And to be clear, I prefer the latter).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think relying on that guarantee is a mistake. Bugs in the buildconfig validation could allow the creation of such a reference and then pruning is broken for no good reason.

@bparees I'll change it, but it's your decision.

In your referenced comment thread you said "done" but it's not obvious to me what that means

My done was about:

If the error is printed, the admin will have the opportunity to act and tune/delete the object. Make sure to include namespace/name of the build config.

and I added the ability to see the name of the object.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll change it, but it's your decision.

I asked a pretty direct question:

What are we saving? Can you give me a scenario where a user manages to create an image that we'd prune here despite an invalid reference existing in the system, that would ultimately be a problem?

You seem very concerned about something here, but I haven't grasped what it is. Can you help me understand the scenario you are envisioning where proceeding with pruning is a mistake and why it is a mistake?

I am trying to make an educated decision.... so far I am not seeing the risk in proceeding with pruning in this situation and i'd like you to help me be confident that I am not overlooking something.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I asked a pretty direct question:

@bparees At first I did not understand correctly. Sorry, of course I will try to explain my point of view.

What are we saving?

Since this operation can not be canceled (we send requests to the registry to delete blobs/manifests), then we need to be very careful with the calculation of references. Otherwise, we will permanently delete objects that the user did not want to lose.

If we have references that we can not parse, it means that there is something wrong in the system (incomplete upgrade or something else). In this case, it is not the best time to perform actions that can not be rolled back.

Ignoring "broken" Build/BuildConfig will lead us to the image pruning if there are no other references in the cluster.

Can you give me a scenario where a user manages to create an image that we'd prune here despite an invalid reference existing in the system, that would ultimately be a problem?

Imagine the situation when in the new version of origin we updated the github.com/openshift/origin/pkg/image/reference in such a way (it can be corner cases) that 10% of the old references stopped parsing. If you ignore the reference parsing error, you will lose 10% of the links in the graph. This will lead to a loss of actual data.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Imagine the situation when in the new version of origin we updated the github.com/openshift/origin/pkg/image/reference in such a way (it can be corner cases) that 10% of the old references stopped parsing. If you ignore the reference parsing error, you will lose 10% of the links in the graph. This will lead to a loss of actual data.

Thanks, that is a compelling argument.

Do we at least have enough information to add the namespace containing the bad buildconfig reference? Right now i'm not convinced we're printing enough information to help an admin go find the bad reference and fix it. They essentially would have to dump all the buildconfigs in all the namespaces and then grep for the string. We should do more to help point them directly to where the problem is, in these cases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we at least have enough information to add the namespace containing the bad buildconfig reference?

@bparees Right now, I output an error to the user and specifying <namespace>/<name> of the Build/BuildConfig. Is this information enough ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, i hadn't seen that you were catching this error and wrapping it w/ additional information. I see it now.

}
imageID = ref.ID
default:
return
glog.V(4).Infof("Unknown kind for name %q: %v", from.Name, from.Kind)
return nil
}

glog.V(4).Infof("Looking for image %q in graph", imageID)
imageNode := imagegraph.FindImage(g, imageID)
if imageNode == nil {
glog.V(4).Infof("Unable to find image %q in graph - skipping", imageID)
return
return nil
}

glog.V(4).Infof("Adding edge from %v to %v", predecessor, imageNode)
g.AddEdge(predecessor, imageNode, ReferencedImageEdgeKind)

return nil
}

// getImageNodes returns only nodes of type ImageNode.
func getImageNodes(nodes []gonum.Node) []*imagegraph.ImageNode {
ret := []*imagegraph.ImageNode{}
func getImageNodes(nodes []gonum.Node) map[string]*imagegraph.ImageNode {
ret := make(map[string]*imagegraph.ImageNode)
for i := range nodes {
if node, ok := nodes[i].(*imagegraph.ImageNode); ok {
ret = append(ret, node)
ret[node.Image.Name] = node
}
}
return ret
Expand Down Expand Up @@ -614,16 +628,20 @@ func imageIsPrunable(g graph.Graph, imageNode *imagegraph.ImageNode, algorithm p

// calculatePrunableImages returns the list of prunable images and a
// graph.NodeSet containing the image node IDs.
func calculatePrunableImages(g graph.Graph, imageNodes []*imagegraph.ImageNode, algorithm pruneAlgorithm) ([]*imagegraph.ImageNode, graph.NodeSet) {
prunable := []*imagegraph.ImageNode{}
func calculatePrunableImages(
g graph.Graph,
imageNodes map[string]*imagegraph.ImageNode,
algorithm pruneAlgorithm,
) (map[string]*imagegraph.ImageNode, graph.NodeSet) {
prunable := make(map[string]*imagegraph.ImageNode)
ids := make(graph.NodeSet)

for _, imageNode := range imageNodes {
glog.V(4).Infof("Examining image %q", imageNode.Image.Name)

if imageIsPrunable(g, imageNode, algorithm) {
glog.V(4).Infof("Image %q is prunable", imageNode.Image.Name)
prunable = append(prunable, imageNode)
prunable[imageNode.Image.Name] = imageNode
ids.Add(imageNode.ID())
}
}
Expand Down Expand Up @@ -671,71 +689,100 @@ func calculatePrunableImageComponents(g graph.Graph) []*imagegraph.ImageComponen
return components
}

func getPrunableComponents(g graph.Graph, prunableImageIDs graph.NodeSet) []*imagegraph.ImageComponentNode {
graphWithoutPrunableImages := subgraphWithoutPrunableImages(g, prunableImageIDs)
return calculatePrunableImageComponents(graphWithoutPrunableImages)
}

// pruneStreams removes references from all image streams' status.tags entries
// to prunable images, invoking streamPruner.UpdateImageStream for each updated
// stream.
func pruneStreams(g graph.Graph, imageNodes []*imagegraph.ImageNode, streamPruner ImageStreamDeleter) []error {
errs := []error{}
func pruneStreams(
g graph.Graph,
prunableImageNodes map[string]*imagegraph.ImageNode,
streamPruner ImageStreamDeleter,
) error {
prunableStreams := make(map[string]*imagegraph.ImageStreamNode)

glog.V(4).Infof("Removing pruned image references from streams")
for _, imageNode := range imageNodes {
for _, imageNode := range prunableImageNodes {
for _, n := range g.To(imageNode) {
streamNode, ok := n.(*imagegraph.ImageStreamNode)
if !ok {
continue
}

if err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
stream, err := streamPruner.GetImageStream(streamNode.ImageStream)
if err != nil {
return err
}
updatedTags := sets.NewString()

glog.V(4).Infof("Checking if ImageStream %s has references to image %s in status.tags", getName(stream), imageNode.Image.Name)
streamName := getName(streamNode.ImageStream)
prunableStreams[streamName] = streamNode
}
}

for tag, history := range stream.Status.Tags {
glog.V(4).Infof("Checking tag %q", tag)
for streamName, streamNode := range prunableStreams {
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
stream, err := streamPruner.GetImageStream(streamNode.ImageStream)
if err != nil {
if kerrapi.IsNotFound(err) {
glog.V(4).Infof("Unable to get image stream %s: removed during prune", streamName)
return nil
}
return err
}

newHistory := imageapi.TagEventList{}
updatedTags := sets.NewString()
deletedTags := sets.NewString()

for i, tagEvent := range history.Items {
glog.V(4).Infof("Checking tag event %d with image %q", i, tagEvent.Image)
for tag, history := range stream.Status.Tags {
newHistory := imageapi.TagEventList{}
for i, tagEvent := range history.Items {
glog.V(4).Infof("Checking tag event %d with image %q", i, tagEvent.Image)

if tagEvent.Image != imageNode.Image.Name {
glog.V(4).Infof("Tag event doesn't match deleted image - keeping")
newHistory.Items = append(newHistory.Items, tagEvent)
} else {
glog.V(4).Infof("Tag event matches deleted image - removing reference")
updatedTags.Insert(tag)
}
}
if len(newHistory.Items) == 0 {
glog.V(4).Infof("Removing tag %q from status.tags of ImageStream %s", tag, getName(stream))
delete(stream.Status.Tags, tag)
if _, ok := prunableImageNodes[tagEvent.Image]; ok {
glog.V(4).Infof("Image stream tag %s:%s revision %d - removing because image %q matches deleted image", streamName, tag, i, tagEvent.Image)
updatedTags.Insert(tag)
} else {
stream.Status.Tags[tag] = newHistory
glog.V(4).Infof("Image stream tag %s:%s revision %d - keeping because image %q is not deleted", streamName, tag, i, tagEvent.Image)
newHistory.Items = append(newHistory.Items, tagEvent)
}
}

updatedStream, err := streamPruner.UpdateImageStream(stream, imageNode.Image, updatedTags.List())
if err == nil {
streamNode.ImageStream = updatedStream
if len(newHistory.Items) == 0 {
glog.V(4).Infof("Image stream tag %s:%s - removing empty tag", streamName, tag)
delete(stream.Status.Tags, tag)
deletedTags.Insert(tag)
} else {
stream.Status.Tags[tag] = newHistory
}
return err
}); err != nil {
errs = append(errs, fmt.Errorf("error removing image %s from stream %s: %v",
imageNode.Image.Name, getName(streamNode.ImageStream), err))
continue
}

if updatedTags.Len() == 0 && deletedTags.Len() == 0 {
return nil
}

updatedStream, err := streamPruner.UpdateImageStream(stream)
if err == nil {
streamPruner.NotifyImageStreamPrune(stream, updatedTags.List(), deletedTags.List())
streamNode.ImageStream = updatedStream
}

if kerrapi.IsNotFound(err) {
glog.V(4).Infof("Unable to update image stream %s: removed during prune", streamName)
return nil
}

return err
})

if err != nil {
return fmt.Errorf("unable to prune stream %s: %v", streamName, err)
}
}

glog.V(4).Infof("Done removing pruned image references from streams")
return errs
return nil
}

// pruneImages invokes imagePruner.DeleteImage with each image that is prunable.
func pruneImages(g graph.Graph, imageNodes []*imagegraph.ImageNode, imagePruner ImageDeleter) []error {
func pruneImages(g graph.Graph, imageNodes map[string]*imagegraph.ImageNode, imagePruner ImageDeleter) []error {
errs := []error{}

for _, imageNode := range imageNodes {
Expand Down Expand Up @@ -766,27 +813,30 @@ func (p *pruner) Prune(

prunableImageNodes, prunableImageIDs := calculatePrunableImages(p.g, imageNodes, p.algorithm)

errs := []error{}
errs = append(errs, pruneStreams(p.g, prunableImageNodes, streamPruner)...)
err := pruneStreams(p.g, prunableImageNodes, streamPruner)
// if namespace is specified prune only ImageStreams and nothing more
if len(p.algorithm.namespace) > 0 {
return kerrors.NewAggregate(errs)
// if we have any errors after ImageStreams pruning this may mean that
// we still have references to images.
if len(p.algorithm.namespace) > 0 || err != nil {
return err
}

graphWithoutPrunableImages := subgraphWithoutPrunableImages(p.g, prunableImageIDs)
prunableComponents := calculatePrunableImageComponents(graphWithoutPrunableImages)
prunableComponents := getPrunableComponents(p.g, prunableImageIDs)

var errs []error

errs = append(errs, pruneImageComponents(p.g, p.registryClient, p.registryURL, prunableComponents, layerLinkPruner)...)
errs = append(errs, pruneBlobs(p.g, p.registryClient, p.registryURL, prunableComponents, blobPruner)...)
errs = append(errs, pruneManifests(p.g, p.registryClient, p.registryURL, prunableImageNodes, manifestPruner)...)

if len(errs) > 0 {
// If we had any errors removing image references from image streams or deleting
// layers, blobs, or manifest data from the registry, stop here and don't
// delete any images. This way, you can rerun prune and retry things that failed.
// If we had any errors deleting layers, blobs, or manifest data from the registry,
// stop here and don't delete any images. This way, you can rerun prune and retry
// things that failed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to get smarter about this and still prune the images for which we successfully pruned all the layers/blobs/manifests.

return kerrors.NewAggregate(errs)
}

errs = append(errs, pruneImages(p.g, prunableImageNodes, imagePruner)...)
errs = pruneImages(p.g, prunableImageNodes, imagePruner)
return kerrors.NewAggregate(errs)
}

Expand Down Expand Up @@ -871,7 +921,7 @@ func pruneManifests(
g graph.Graph,
registryClient *http.Client,
registryURL *url.URL,
imageNodes []*imagegraph.ImageNode,
imageNodes map[string]*imagegraph.ImageNode,
manifestPruner ManifestDeleter,
) []error {
errs := []error{}
Expand Down Expand Up @@ -933,7 +983,7 @@ func (p *imageStreamDeleter) GetImageStream(stream *imageapi.ImageStream) (*imag
return p.streams.ImageStreams(stream.Namespace).Get(stream.Name, metav1.GetOptions{})
}

func (p *imageStreamDeleter) UpdateImageStream(stream *imageapi.ImageStream, image *imageapi.Image, updatedTags []string) (*imageapi.ImageStream, error) {
func (p *imageStreamDeleter) UpdateImageStream(stream *imageapi.ImageStream) (*imageapi.ImageStream, error) {
glog.V(4).Infof("Updating ImageStream %s", getName(stream))
is, err := p.streams.ImageStreams(stream.Namespace).UpdateStatus(stream)
if err == nil {
Expand All @@ -942,6 +992,11 @@ func (p *imageStreamDeleter) UpdateImageStream(stream *imageapi.ImageStream, ima
return is, err
}

// NotifyImageStreamPrune shows notification about updated image stream.
func (p *imageStreamDeleter) NotifyImageStreamPrune(stream *imageapi.ImageStream, updatedTags []string, deletedTags []string) {
return
}

// deleteFromRegistry uses registryClient to send a DELETE request to the
// provided url. It attempts an https request first; if that fails, it fails
// back to http.
Expand Down
Loading