Added the ability to stream data using cp #903
Changes from all commits: c57fa91, 1461190, 995dc93, f21837a, 4716948
@@ -11,7 +11,7 @@
 from botocore.compat import quote
 from awscli.customizations.s3.utils import find_bucket_key, \
     check_etag, check_error, operate, uni_print, \
-    guess_content_type, MD5Error
+    guess_content_type, MD5Error, bytes_print


 class CreateDirectoryError(Exception):
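For context, bytes_print is a new helper imported from the utils module; its body is not shown in this diff. A minimal sketch of what such a helper could look like, assuming it only needs to write raw bytes to stdout on both Python 2 and Python 3:

import sys

def bytes_print(statement):
    # Hypothetical sketch: on Python 3, raw bytes must go through the
    # underlying binary buffer; on Python 2, sys.stdout accepts bytes.
    if hasattr(sys.stdout, 'buffer'):
        sys.stdout.buffer.write(statement)
    else:
        sys.stdout.write(statement)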
@@ -26,7 +26,7 @@ def read_file(filename):
         return in_file.read()


-def save_file(filename, response_data, last_update):
+def save_file(filename, response_data, last_update, is_stream=False):
     """
     This writes to the file upon downloading. It reads the data in the
     response. Makes a new directory if needed and then writes the
@@ -35,31 +35,57 @@ def save_file(filename, response_data, last_update):
     """
     body = response_data['Body']
     etag = response_data['ETag'][1:-1]
-    d = os.path.dirname(filename)
-    try:
-        if not os.path.exists(d):
-            os.makedirs(d)
-    except OSError as e:
-        if not e.errno == errno.EEXIST:
-            raise CreateDirectoryError(
-                "Could not create directory %s: %s" % (d, e))
+    if not is_stream:
+        d = os.path.dirname(filename)
+        try:
+            if not os.path.exists(d):
+                os.makedirs(d)
+        except OSError as e:
+            if not e.errno == errno.EEXIST:
+                raise CreateDirectoryError(
+                    "Could not create directory %s: %s" % (d, e))
     md5 = hashlib.md5()
     file_chunks = iter(partial(body.read, 1024 * 1024), b'')
-    with open(filename, 'wb') as out_file:
-        if not _is_multipart_etag(etag):
-            for chunk in file_chunks:
-                md5.update(chunk)
-                out_file.write(chunk)
-        else:
-            for chunk in file_chunks:
-                out_file.write(chunk)
+    if is_stream:
+        # Need to save the data to be able to check the etag for a stream
+        # because once the data is written to the stream there is no
+        # undoing it.
+        payload = write_to_file(None, etag, md5, file_chunks, True)
+    else:
+        with open(filename, 'wb') as out_file:
+            write_to_file(out_file, etag, md5, file_chunks)

     if not _is_multipart_etag(etag):
         if etag != md5.hexdigest():
-            os.remove(filename)
+            if not is_stream:
+                os.remove(filename)
             raise MD5Error(filename)
-    last_update_tuple = last_update.timetuple()
-    mod_timestamp = time.mktime(last_update_tuple)
-    os.utime(filename, (int(mod_timestamp), int(mod_timestamp)))
+
+    if not is_stream:
+        last_update_tuple = last_update.timetuple()
+        mod_timestamp = time.mktime(last_update_tuple)
+        os.utime(filename, (int(mod_timestamp), int(mod_timestamp)))
+    else:
+        # Now write the output to stdout since the md5 is correct.
+        bytes_print(payload)
+        sys.stdout.flush()
+
+
+def write_to_file(out_file, etag, md5, file_chunks, is_stream=False):
+    """
+    Updates the etag for each file chunk. It will write to the file if it
+    is a file, but if it is a stream it will return a byte string to be
+    later written to a stream.
+    """
+    body = b''
+    for chunk in file_chunks:
+        if not _is_multipart_etag(etag):
+            md5.update(chunk)
+        if is_stream:
+            body += chunk
+        else:
+            out_file.write(chunk)
+    return body


 def _is_multipart_etag(etag):
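Taken together, the new stream path buffers the whole body while hashing, verifies the md5, and only then prints. As a standalone illustration of that pattern (hypothetical function name; Python 3 stdout handling assumed):

import hashlib
import sys

def print_after_md5_check(file_chunks, etag):
    # Buffer while hashing: bytes written to stdout cannot be un-written,
    # so nothing is printed until the digest matches the expected ETag.
    md5 = hashlib.md5()
    payload = b''
    for chunk in file_chunks:
        md5.update(chunk)
        payload += chunk
    if md5.hexdigest() != etag:
        raise ValueError("md5 mismatch; the download should be retried")
    sys.stdout.buffer.write(payload)
    sys.stdout.flush()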
@@ -140,7 +166,7 @@ class FileInfo(TaskInfo):
     def __init__(self, src, dest=None, compare_key=None, size=None,
                  last_update=None, src_type=None, dest_type=None,
                  operation_name=None, service=None, endpoint=None,
-                 parameters=None, source_endpoint=None):
+                 parameters=None, source_endpoint=None, is_stream=False):
         super(FileInfo, self).__init__(src, src_type=src_type,
                                        operation_name=operation_name,
                                        service=service,
@@ -157,6 +183,18 @@ def __init__(self, src, dest=None, compare_key=None, size=None,
             self.parameters = {'acl': None,
                                'sse': None}
         self.source_endpoint = source_endpoint
+        self.is_stream = is_stream
+
+    def set_size_from_s3(self):
+        """
+        This runs a ``HeadObject`` on the s3 object and sets the size.
+        """
+        bucket, key = find_bucket_key(self.src)
+        params = {'endpoint': self.endpoint,
+                  'bucket': bucket,
+                  'key': key}
+        response_data, http = operate(self.service, 'HeadObject', params)
+        self.size = int(response_data['ContentLength'])

     def _permission_to_param(self, permission):
         if permission == 'read':
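set_size_from_s3 exists because a streamed source or destination has no local file to stat; the size has to come from S3 itself. A rough equivalent using plain boto3 (an assumption for illustration only; the CLI code above goes through its internal operate() helper instead):

import boto3

s3 = boto3.client('s3')

def object_size(bucket, key):
    # HeadObject returns the object's metadata, including ContentLength,
    # without transferring the object body.
    response = s3.head_object(Bucket=bucket, Key=key)
    return int(response['ContentLength'])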
@@ -204,24 +242,30 @@ def _handle_object_params(self, params):
         if self.parameters['expires']:
             params['expires'] = self.parameters['expires'][0]

-    def upload(self):
+    def upload(self, payload=None):
         """
         Redirects the file to the multipart upload function if the file is
         large. If it is small enough, it puts the file as an object in s3.
         """
-        with open(self.src, 'rb') as body:
-            bucket, key = find_bucket_key(self.dest)
-            params = {
-                'endpoint': self.endpoint,
-                'bucket': bucket,
-                'key': key,
-                'body': body,
-            }
-            self._handle_object_params(params)
-            response_data, http = operate(self.service, 'PutObject', params)
-            etag = response_data['ETag'][1:-1]
-            body.seek(0)
-            check_etag(etag, body)
+        if payload:
+            self._handle_upload(payload)
+        else:
+            with open(self.src, 'rb') as body:
+                self._handle_upload(body)
+
+    def _handle_upload(self, body):
+        bucket, key = find_bucket_key(self.dest)
+        params = {
+            'endpoint': self.endpoint,
+            'bucket': bucket,
+            'key': key,
+            'body': body,
+        }
+        self._handle_object_params(params)
+        response_data, http = operate(self.service, 'PutObject', params)
+        etag = response_data['ETag'][1:-1]
+        body.seek(0)
+        check_etag(etag, body)

     def _inject_content_type(self, params, filename):
         # Add a content type param if we can guess the type.

Review thread on the body.seek(0) line:

Reviewer: What if a non-seekable stream is passed? Should the call fail?

Author: It will not, because in another function the parts of the original stream are cut up and packaged into …

Reviewer: Oh I see, okay that makes sense.
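One plausible way the payload path and body.seek(0) fit together, sketched with hypothetical wiring (the PR's actual plumbing for building the payload is not shown in this diff): stdin is read once and wrapped in a seekable in-memory buffer before upload() is called.

import sys
from io import BytesIO

# Read the incoming stream once and wrap it so that body.seek(0) in
# _handle_upload succeeds even though stdin itself is not seekable.
data = sys.stdin.buffer.read()   # Python 3 binary stdin; an assumption here
payload = BytesIO(data)
# file_info.upload(payload)      # file_info: a FileInfo built for the stream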
@@ -237,7 +281,8 @@ def download(self):
         bucket, key = find_bucket_key(self.src)
         params = {'endpoint': self.endpoint, 'bucket': bucket, 'key': key}
         response_data, http = operate(self.service, 'GetObject', params)
-        save_file(self.dest, response_data, self.last_update)
+        save_file(self.dest, response_data, self.last_update,
+                  self.is_stream)

     def copy(self):
         """
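If this lands as described, a download can presumably be streamed straight to a pipe, along the lines of `aws s3 cp s3://mybucket/mykey -` with `-` standing in for stdout (the exact CLI syntax is not shown in this diff and is assumed here); that is the case in which is_stream reaches save_file.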
Review thread on buffering the stream in save_file:

Reviewer: It's a little unclear to me here: is this actually reading in the entire contents of the file to be printed later?

Author: Yes, it is, if the object is being streamed to standard out. This is needed because if you are writing an object to stdout while doing the md5 calculation, there is no way to erase data already sent to stdout if there is an MD5 error and the download needs to be retried. Therefore, I write to a buffer that is written to stdout only once I have ensured the md5 is correct. For a file, on the other hand, I write to the file as I calculate the md5, because I can delete the file if the md5s do not match.

Reviewer: This is kind of concerning to me given the size of the files people put into S3. Have you considered using a temporary file? You could use temp files only when the download is large, and it would have the same behavior as a normal file except that it is eventually written to stdout and removed from disk. What about writing a message to stderr and returning a non-zero exit code (leaving retries up to the calling script if they want to use stdout)? Any other ideas you considered?

Author: This is for a single download, and the cutoff for the multipart threshold is 8 MB, so there will be at most that much in memory (for a non-multipart download), since you can only perform an operation on one file when streaming. This memory issue is more concerning with multipart operations, which I will discuss at the bottom of the comment section. On a side note, I like the idea of temporary files.
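The reviewer's temporary-file idea could be sketched as follows (a hypothetical alternative, not what the PR implements): spool chunks to a temp file while hashing, verify the digest, then replay the file to stdout and let it be deleted on close.

import hashlib
import shutil
import sys
import tempfile

def stream_via_tempfile(file_chunks, etag):
    md5 = hashlib.md5()
    with tempfile.TemporaryFile() as tmp:
        for chunk in file_chunks:
            md5.update(chunk)
            tmp.write(chunk)
        if md5.hexdigest() != etag:
            raise ValueError("md5 mismatch; retry the download")
        # Digest verified: replay the spooled bytes to stdout.
        tmp.seek(0)
        shutil.copyfileobj(tmp, sys.stdout.buffer)
    sys.stdout.flush()

This keeps memory use flat regardless of object size, at the cost of one extra round trip through the filesystem.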