diff --git a/README.rst b/README.rst index 1a016c0..2722d39 100644 --- a/README.rst +++ b/README.rst @@ -17,6 +17,7 @@ It supports the following features: - Plan based backups - Custom command run pre-backup - Storing to S3 +- Calculating MD5 hashes of the backup set to avoid uploading duplicate backup sets - Emailing the result of the backup plans - Python standard logging framework @@ -54,6 +55,7 @@ file "AWS_REGION": "this is a region", "EMAIL_FROM": "source@address.com", "EMAIL_TO": "recipient@address.com", + "HASH_CHECK_FILE": "plan_hashes.txt", "Plans": [ { "Name": "MySQL Backup", @@ -108,7 +110,28 @@ Run the backup tool using the following method: s3backup.run_plans() -See ``test.py`` in the ``src`` folder for an example. +See ``test.py`` for an example. + +File Hashing +------------ + +After a backup set is created an MD5 hash is calculated for it. This is then compared against a previously calculated +hash for that particular plan name. + +**NOTE:** Do not change the generated HASH_CHECK_FILE! + +Finally, be aware of a "gotcha" - the hashes are keyed on the *plan name* - therefore changing the plan name will +cause the backup script to think it needs to upload a new backup set. + +Emails +------ + +An email will be sent after each plan runs. The email will either report a success or a failure. In the event +of a success, it will be reported if there was a new uploaded backup set (and the file name), otherwise it will +state that no changes were detected and no upload was made. + +If there was a failure while running the backup, the exception message will be emailed, and the logs can be +referred to for further information. Future Improvements ------------------- @@ -116,3 +139,5 @@ Future Improvements These are some of the planned future improvements: - Run multiple pre-backup commands (by providing an array) +- Allow custom format strings for the output files (instead of the default date/time format) +- Modification of the glob2 library to allow hidden files to be included \ No newline at end of file diff --git a/S3Backup/__init__.py b/S3Backup/__init__.py index e987af7..5d53fe3 100644 --- a/S3Backup/__init__.py +++ b/S3Backup/__init__.py @@ -31,7 +31,7 @@ class S3BackupTool: - def __init__(self, config_file="config.json", log_file="s3backup.log"): + def __init__(self, config_file="config.json"): logger.info('Initialising...') try: @@ -52,17 +52,42 @@ def run_plans(self): logger.info('Executing plan %d of %d', counter, len(self.PLANS)) try: - plan.run() - self.__send_status_email(plan, True) + updated, output_file = plan.run() + self.__send_success_email(plan, updated, output_file) except Exception, e: logger.error('Failed to run plan: %s', e) - self.__send_status_email(plan, False, e) + self.__send_failure_email(plan, e) counter += 1 logger.info('Finished running backup plans') - def __send_status_email(self, plan, success, exception=None): + def __send_success_email(self, plan, updated, output_file): + subject = '[S3-Backup] [SUCCESS] - Plan: %s' % plan.name + + body = 'The backup plan, %s, run at %s was SUCCESSFUL\n\n' % ( + plan.name, + strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())) + + if updated: + body += 'The backup set had changed, so a new backup was uploaded: %s' % output_file + else: + body += 'The backup set had not changed. No new backup uploaded' + + self.__send_status_email(subject, body) + + def __send_failure_email(self, plan, exception): + subject = '[S3-Backup] [FAILURE] - Plan: %s' % plan.name + + body = 'The backup plan, %s, run at %s was a FAILURE\n\n' % ( + plan.name, + strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())) + + body += '\n\nDetailed failure information:\n\n%s' % exception + + self.__send_status_email(subject, body) + + def __send_status_email(self, subject, body): if self.CONFIGURATION['EMAIL_FROM'] is None or self.CONFIGURATION['EMAIL_TO'] is None: logger.debug('Email not provided, so status update not sent') return @@ -72,25 +97,13 @@ def __send_status_email(self, plan, success, exception=None): aws_access_key_id=self.CONFIGURATION['AWS_KEY'], aws_secret_access_key=self.CONFIGURATION['AWS_SECRET']) - result = 'SUCCESS' - if not success: - result = 'FAILURE' - - body = 'The backup plan, %s, run at %s was %s' % ( - plan.name, - strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()), - result) - - if exception is not None: - body += '\n\nDetailed failure information:\n\n%s' % exception - try: conn.send_email( self.CONFIGURATION['EMAIL_FROM'], - '[S3-Backup] [%s] - Plan: %s' % (result, plan.name), + subject, body, [self.CONFIGURATION['EMAIL_TO']]) except Exception, e: - logger.error('Failed to send email to {0:s} regarding plan: {1:s}'.format(self.CONFIGURATION['EMAIL_TO'], - plan.name), + logger.error('Failed to send email to {0:s} with subject {1:s}'.format(self.CONFIGURATION['EMAIL_TO'], + subject), e) diff --git a/S3Backup/config_loader.py b/S3Backup/config_loader.py index 1711fdb..7d39100 100644 --- a/S3Backup/config_loader.py +++ b/S3Backup/config_loader.py @@ -28,7 +28,7 @@ logger = logging.getLogger(name='config_loader') -required_root_values = ['AWS_KEY', 'AWS_SECRET', 'AWS_BUCKET', 'AWS_REGION', 'Plans'] +required_root_values = ['AWS_KEY', 'AWS_SECRET', 'AWS_BUCKET', 'AWS_REGION', 'HASH_CHECK_FILE', 'Plans'] optional_root_values = ['EMAIL_FROM', 'EMAIL_TO'] def config_setup(config_file): @@ -39,6 +39,7 @@ def config_setup(config_file): 'AWS_SECRET': '', 'AWS_BUCKET': '', 'AWS_REGION': '', + 'HASH_CHECK_FILE': '', 'EMAIL_FROM': None, 'EMAIL_TO': None } diff --git a/S3Backup/hash_file.py b/S3Backup/hash_file.py new file mode 100644 index 0000000..2856ca0 --- /dev/null +++ b/S3Backup/hash_file.py @@ -0,0 +1,64 @@ +import hashlib +import os +from shutil import move +from tempfile import mkstemp + +BLOCKSIZE=65535 + +def find_hash(hash_file, plan_name): + # Try to find the hash in the hash file + filename = os.path.normpath(hash_file) + if os.path.isfile(filename): + plan_hashes = open(filename, 'r').readlines() + for line in plan_hashes: + parts = line.strip().split('=') + if len(parts) == 2 and parts[0] == plan_name: + return parts[1] + + return None + +def update_hash(hash_file, plan_name, hash_value): + # Do the update (create the file if it doesn't exist) + filename = os.path.normpath(hash_file) + + # If it doesn't exist, we shortcut this + if not os.path.isfile(hash_file): + with open(hash_file, 'w') as new_file: + new_file.write('%s=%s\n' % (plan_name, hash_value)) + return + + # Otherwise, we need to rebuild the file + fh, abs_path = mkstemp() + is_written = False + + with open(abs_path, 'w') as new_file: + with open(filename, 'r') as old_file: + # Handle existing entries in the file + for line in old_file: + parts = line.strip().split('=') + if parts[0] == plan_name: + is_written = True + new_file.write('%s=%s\n' % (plan_name, hash_value)) + else: + new_file.write(line) + + # If the hash wasn't already in the file + if not is_written: + new_file.write('%s=%s\n' % (plan_name, hash_value)) + + os.close(fh) + + # Remove original file + os.remove(hash_file) + + # Move new file + move(abs_path, hash_file) + +def calc_hash(filename): + hasher = hashlib.md5() + with open(filename, 'rb') as afile: + buf = afile.read(BLOCKSIZE) + while len(buf) > 0: + hasher.update(buf) + buf = afile.read(BLOCKSIZE) + return hasher.hexdigest() diff --git a/S3Backup/plan.py b/S3Backup/plan.py index 0836412..e8b5da9 100644 --- a/S3Backup/plan.py +++ b/S3Backup/plan.py @@ -29,6 +29,7 @@ from zipfile import ZipFile import time import boto.ses +from S3Backup import hash_file required_plan_values = ['Name', 'Src', 'OutputPrefix'] optional_plan_values = ['Command'] @@ -52,6 +53,8 @@ def __init__(self, raw_plan, configuration): self.command = None self.output_file = '%s_%s.zip' % (raw_plan['OutputPrefix'], time.strftime("%Y-%m-%d_%H-%M-%S")) + self.new_hash = None + if 'Command' in raw_plan: self.command = raw_plan['Command'] @@ -63,7 +66,9 @@ def run(self): The plan is run in the following order: 1) (if applicable) Run the external command provided 2) Zip source file(s) to destination file - 3) Upload destination file to S3 bucket + 3) Perform hash check to see if there are any changes (which would require an upload) + 4) Upload destination file to S3 bucket + 5) Update hash file with new hash """ logger.info('Running plan "%s"', self.name) @@ -74,12 +79,24 @@ def run(self): # 2) Zip the source file to the destination file self.__zip_files() - # 3) Upload destination file to S3 bucket + updated = False + try: - self.__upload() + # 3) Perform hash check to see if there are any changes (which would require an upload) + if not self.__hash_check(): + # 4) Upload destination file to S3 bucket + self.__upload() + + # 5) Update hash file with new hash + self.__update_hash() + + updated = True + finally: self.__cleanup() + return updated, self.output_file + def __run_command(self): logger.info('Executing custom command...') @@ -137,6 +154,27 @@ def __upload(self): logger.error('Failed to upload backup file to S3: %s', e) raise + def __hash_check(self): + previous_hash = hash_file.find_hash(self.CONFIGURATION['HASH_CHECK_FILE'], self.name) + + if previous_hash is None: + logger.debug('No previous hash found for plan %s', self.name) + else: + logger.debug('Got a previous hash for plan %s of %s', self.name, previous_hash) + + self.new_hash = hash_file.calc_hash(self.output_file) + + logger.debug('New hash for plan %s of %s', self.name, self.new_hash) + + return previous_hash == self.new_hash + + def __update_hash(self): + if self.new_hash is None: + logger.error('Could not update hash as no hash was found') + return + + hash_file.update_hash(self.CONFIGURATION['HASH_CHECK_FILE'], self.name, self.new_hash) + def __cleanup(self): logger.info('Cleaning up temporary file: %s', self.output_file) try: diff --git a/TODO.md b/TODO.md index 57d768a..5d03711 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,5 @@ # Features to be added 1. Ability to choose custom format strings on output file (rather than automatically appending date/time) -2. Ability to run multiple commands \ No newline at end of file +2. Ability to run multiple commands +3. Modify the glob2 library to support hidden files diff --git a/config.json b/config.json index 2e3915b..df27bec 100644 --- a/config.json +++ b/config.json @@ -5,6 +5,7 @@ "AWS_REGION": "this is a region", "EMAIL_FROM": "source@address.com", "EMAIL_TO": "recipient@address.com", + "HASH_CHECK_FILE": "plan_hashes.txt", "Plans": [ { "Name": "MySQL Backup",