#!/usr/bin/env python
# s3_content_type_fixer.py
# Forked from mediafinance/s3-content-type-fixer
import argparse
import mimetypes
import multiprocessing
import sys
from Queue import Empty

from boto.s3.connection import S3Connection

# How long (in seconds) a worker blocks on the queue before giving up
BLOCK_TIME = 60 * 60

def find_matching_files(bucket, prefixes):
    """
    Returns the set of keys in a given S3 bucket that match the specified
    file path prefixes
    """
    return set(key for prefix in prefixes for key in bucket.list(prefix))

def get_bucket(access_key, secret_key, bucket):
    """Gets an S3 bucket"""
    return S3Connection(access_key, secret_key).get_bucket(bucket)

def check_headers(bucket, queue, verbose):
    """
    Callback used by sub-processes to check the headers of candidate files in
    a multiprocessing queue
    """
    while True:
        try:
            key_name = queue.get(timeout=BLOCK_TIME)
        except Empty:
            break
        if key_name is None:
            # Sentinel value: no more work for this process
            break
        key = bucket.lookup(key_name)
        if not key:
            print >> sys.stderr, "%s: Could not look up" % key_name
            continue
        content_type = key.content_type
        if key.name.endswith('woff2'):
            # mimetypes does not know about WOFF2, so special-case it
            expected_content_type = 'application/font-woff2'
        else:
            expected_content_type, _ = mimetypes.guess_type(key.name)
        if not expected_content_type:
            print >> sys.stderr, "%s: Could not guess content type" % key.name
            continue
        if content_type == expected_content_type:
            if verbose:
                print "%s: Matches expected content type" % key.name
        else:
            print "%s: Current content type (%s) does not match expected (%s); fixing" % (
                key.name, content_type, expected_content_type)
            # Copying a key onto itself with new metadata is how S3 rewrites
            # headers in place; preserve the ACL and any Content-Disposition
            metadata = {'Content-Type': expected_content_type}
            if key.content_disposition:
                metadata['Content-Disposition'] = key.content_disposition
            key.copy(key.bucket.name, key.name, preserve_acl=True,
                     metadata=metadata)
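
# An alternative to the endswith('woff2') special case above would be to
# register the type with the mimetypes module once at startup (a sketch,
# not part of the original script):
#
#   mimetypes.add_type('application/font-woff2', '.woff2')
#
# after which mimetypes.guess_type() would cover .woff2 keys as well.
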
def main():
    parser = argparse.ArgumentParser(description="Fixes the content-type of assets on S3")
    parser.add_argument("--access-key", "-a", type=str, required=True, help="The AWS access key")
    parser.add_argument("--secret-key", "-s", type=str, required=True, help="The AWS secret key")
    parser.add_argument("--bucket", "-b", type=str, required=True, help="The S3 bucket to check")
    parser.add_argument("--prefixes", "-p", type=str, default=[""], required=False, nargs="*", help="File path prefixes to check")
    parser.add_argument("--workers", "-w", type=int, default=4, required=False, help="The number of workers")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    args = parser.parse_args()

    queue = multiprocessing.Queue()
    processes = []
    bucket = get_bucket(args.access_key, args.secret_key, args.bucket)

    # Start the workers
    for _ in xrange(args.workers):
        p = multiprocessing.Process(target=check_headers, args=(bucket, queue, args.verbose))
        p.start()
        processes.append(p)

    # Add the items to the queue
    for key in find_matching_files(bucket, args.prefixes):
        queue.put(key.name)

    # Add one None per worker to the end of the queue; each acts as a
    # signal for a process to finish
    for _ in xrange(args.workers):
        queue.put(None)

    # Wait for the processes to finish
    for p in processes:
        try:
            p.join()
        except KeyboardInterrupt:
            pass


if __name__ == "__main__":
    main()
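
# Example invocation (a sketch; the credential values, bucket name, and
# prefixes below are hypothetical placeholders, not values from this
# repository):
#
#   python s3_content_type_fixer.py \
#       --access-key AKIA... \
#       --secret-key ... \
#       --bucket my-bucket \
#       --prefixes static/fonts/ static/img/ \
#       --workers 8 --verbose
#
# Keys whose stored Content-Type already matches the guessed type are left
# untouched; mismatched keys are rewritten in place via an S3 copy.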