Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Connection limit and Created new Failed Manifest #197

Open
asyba opened this issue Jun 4, 2019 · 2 comments
Open

Connection limit and Created new Failed Manifest #197

asyba opened this issue Jun 4, 2019 · 2 comments

Comments

@asyba
Copy link

asyba commented Jun 4, 2019

Hi, I have two kinds of errors:

How can I make these batches reprocess automatically (not manually)?
I have the failureTopicARN configured, but it is not called for this kind of error, right?
I also have the failedManifestKey, and the failed manifests are saved in the S3 bucket, but the flow ends there.

Cluster Load Failure error: abort query on Cluster xxxxxx.amazonaws.com
...
 error: abort query\n code: 1020\n context: system requested abort\n query: 10119908\n location: queryabort.hpp:62\n process: padbmaster [pid=32211]\n 
...

Cluster Load Failure error: connection limit "500" exceeded for non-bootstrap users on Cluster xxxxxxx.redshift.amazonaws.com

end of lambda:
Created new Failed Manifest xxxxxx
{
    "xxxxxxxxxx.redshift.amazonaws.com": {
        "status": -1,
        "error": {
            "name": "error",
            "length": 157,
            "severity": "FATAL",
            "code": "53300",
            "file": "/home/ec2-user/padb/src/pg/src/backend/utils/init/postinit.c",
            "line": "571",
            "routine": "InitPostgres"
        }
    }
}

2019-06-04T06:06:15.613Z d4f668ac-7193-4a16-8123-0a50c4eff75c
{
    "errorMessage": "error"
}
@IanMeyers
Copy link
Contributor

Hello,

The failureTopicArn should be being used to send notifications in this case. Does anything appear in the logs to this effect? For automatic reprocessing, there is an additional Lambda function in the project which can be hooked up to the failure SNS topic and handles a few different scenarios.

Ian

@asyba
Copy link
Author

asyba commented Jun 5, 2019

@IanMeyers Not in this case. I am running a copy of the code from 11/10/2018; could there have been an issue/bug at that time that prevents this from working?

The only thing that appears in the logs is the text I posted above; for some reason SNS is not called. (I have failureTopicArn in the configuration.)

Here is the related code for that call as I currently have it:

exports.failBatch = function(loadState, config, thisBatchId, s3Info, manifestInfo) {
	console.log(loadState);

	if (config.failedManifestKey && manifestInfo) {
	    // copy the manifest to the failed location
	    manifestInfo.failedManifestPrefix = manifestInfo.manifestPrefix.replace(manifestInfo.manifestKey + '/', config.failedManifestKey.S + '/');
	    manifestInfo.failedManifestPath = manifestInfo.manifestBucket + '/' + manifestInfo.failedManifestPrefix;

	    var copySpec = {
		Bucket : manifestInfo.manifestBucket,
		Key : manifestInfo.failedManifestPrefix,
		CopySource : manifestInfo.manifestPath,
		Metadata : {
		    'x-amz-meta-load-date' : common.readableTime(common.now())
		}
	    };
	    s3.copyObject(copySpec, function(err, data) {
		if (err) {
		    console.log(err);
		    exports.closeBatch(err, config, thisBatchId, s3Info, manifestInfo);
		} else {
		    console.log('Created new Failed Manifest ' + manifestInfo.failedManifestPath);

		    // update the batch entry showing the failed
		    // manifest location
		    var manifestModification = {
			Key : {
			    batchId : {
				S : thisBatchId
			    },
			    s3Prefix : {
				S : s3Info.prefix
			    }
			},
			TableName : batchTable,
			AttributeUpdates : {
			    manifestFile : {
				Action : 'PUT',
				Value : {
				    S : manifestInfo.failedManifestPath
				}
			    },
			    lastUpdate : {
				Action : 'PUT',
				Value : {
				    N : '' + common.now()
				}
			    }
			}
		    };
		    common.retryableUpdate(dynamoDB, manifestModification, function(err, data) {
			if (err) {
			    console.log(err);
			    exports.closeBatch(err, config, thisBatchId, s3Info, manifestInfo);
			} else {
			    // close the batch with the original
			    // calling error
			    exports.closeBatch(loadState, config, thisBatchId, s3Info, manifestInfo);
			}
		    });
		}
	    });
	} else {
	    console.log('Not requesting copy of Manifest to Failed S3 Location');
	    exports.closeBatch(loadState, config, thisBatchId, s3Info, manifestInfo);
	}
    };

    exports.closeBatch = function(batchError, config, thisBatchId, s3Info, manifestInfo) {
	var batchEndStatus;

	if (batchError && batchError !== null) {
	    batchEndStatus = error;
	} else {
	    batchEndStatus = complete;
	}

	var item = {
	    Key : {
		batchId : {
		    S : thisBatchId
		},
		s3Prefix : {
		    S : s3Info.prefix
		}
	    },
	    TableName : batchTable,
	    AttributeUpdates : {
		status : {
		    Action : 'PUT',
		    Value : {
			S : batchEndStatus
		    }
		},
		lastUpdate : {
		    Action : 'PUT',
		    Value : {
			N : '' + common.now()
		    }
		}
	    }
	};

	// add the error message to the updates if we had one
	if (batchError && batchError !== null) {
	    item.AttributeUpdates.errorMessage = {
		Action : 'PUT',
		Value : {
		    S : JSON.stringify(batchError)
		}
	    };
	}

	// mark the batch as closed
	common.retryableUpdate(dynamoDB, item, function(err, data) {
	    // ugh, the batch closure didn't finish - this is not a good
	    // place to be
	    if (err) {
		console.log(JSON.stringify(err));
		context.done(error, JSON.stringify(err));
	    } else {
		// send notifications
		exports.notify(config, thisBatchId, s3Info, manifestInfo, batchError);
	    }
	});
    };

    /** send an SNS message to a topic */
    exports.sendSNS = function(topic, subj, msg, successCallback, failureCallback) {
	var m = {
	    Message : JSON.stringify(msg),
	    Subject : subj,
	    TopicArn : topic
	};

	sns.publish(m, function(err, data) {
	    if (err) {
		if (failureCallback) {
		    failureCallback(err);
		} else {
		    console.log(err);
		}
	    } else {
		if (successCallback) {
		    successCallback();
		}
	    }
	});
    };

    /** Send SNS notifications if configured for OK vs Failed status */
    exports.notify = function(config, thisBatchId, s3Info, manifestInfo, batchError) {
	var statusMessage = batchError ? 'error' : 'ok';
	var errorMessage = batchError ? JSON.stringify(batchError) : null;
	var messageBody = {
	    error : errorMessage,
	    status : statusMessage,
	    batchId : thisBatchId,
	    s3Prefix : s3Info.prefix,
	    key : s3Info.key
	};

	if (manifestInfo) {
	    messageBody.originalManifest = manifestInfo.manifestPath;
	    messageBody.failedManifest = manifestInfo.failedManifestPath;
	}

	if (batchError && batchError !== null) {
	    console.log(JSON.stringify(batchError));

	    if (config.failureTopicARN) {
		exports.sendSNS(config.failureTopicARN.S, "Lambda Redshift Batch Load " + thisBatchId + " Failure", messageBody, function() {
		    context.done(error, JSON.stringify(batchError));
		}, function(err) {
		    console.log(JSON.stringify(err));
		    context.done(error, JSON.stringify(err));
		});
	    } else {
		context.done(error, JSON.stringify(batchError));
	    }
	} else {
	    if (config.successTopicARN) {
		exports.sendSNS(config.successTopicARN.S, "Lambda Redshift Batch Load " + thisBatchId + " OK", messageBody, function() {
		    context.done(null, null);
		}, function(err) {
		    console.log(JSON.stringify(err));
		    context.done(error, JSON.stringify(err));
		});
	    } else {
		// finished OK - no SNS notifications for
		// success
		console.log("Batch Load " + thisBatchId + " Complete");
		context.done(null, null);
	    }
	}
    };

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants