aws · kyleknap · Dec 2, 2014 · Dec 1, 2014 · Dec 2, 2014
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,6 +8,8 @@ Next Release (TBD)
 * bugfix:Timestamp Arguments: Fix issue where certain timestamps were not
   being accepted as valid input
   (`botocore issue 389 <https://github.com/boto/botocore/pull/389>`__)
+* bugfix:``aws s3``: Skip files whose names cannot be properly decoded
+  (`issue 1038 <https://github.com/aws/aws-cli/pull/1038>`__)
 
 
 1.6.5

diff --git a/awscli/customizations/s3/filegenerator.py b/awscli/customizations/s3/filegenerator.py
@@ -85,9 +85,9 @@ def __init__(self, directory, filename):
         self.directory = directory
         self.file_name = filename
         self.error_message = (
-            'There was an error trying to decode the the file "%s" in '
-            'directory "%s". \n%s' % (self.file_name,
-                                      self.directory.encode('utf-8'),
+            'There was an error trying to decode the the file %s in '
+            'directory "%s". \n%s' % (repr(self.file_name),
+                                      self.directory,
                                       self.ADVICE)
         )
         super(FileDecodingError, self).__init__(self.error_message)
@@ -170,28 +170,30 @@ def list_files(self, path, dir_op):
                 # contents and adding the directory separator to any
                 # directories.  We can then sort the contents,
                 # and ensure byte order.
-                names = listdir(path)
-                self._check_paths_decoded(path, names)
-                for i, name in enumerate(names):
-                    file_path = join(path, name)
-                    if isdir(file_path):
-                        names[i] = name + os.path.sep
+                listdir_names = listdir(path)
+                names = []
+                for name in listdir_names:
+                    if not self.should_ignore_file_with_decoding_warnings(
+                            path, name):
+                        file_path = join(path, name)
+                        if isdir(file_path):
+                            name = name + os.path.sep
+                        names.append(name)
                 self.normalize_sort(names, os.sep, '/')
                 for name in names:
                     file_path = join(path, name)
-                    if not self.should_ignore_file(file_path):
-                        if isdir(file_path):
-                            # Anything in a directory will have a prefix of
-                            # this current directory and will come before the
-                            # remaining contents in this directory.  This
-                            # means we need to recurse into this sub directory
-                            # before yielding the rest of this directory's
-                            # contents.
-                            for x in self.list_files(file_path, dir_op):
-                                yield x
-                        else:
-                            size, last_update = get_file_stat(file_path)
-                            yield file_path, size, last_update
+                    if isdir(file_path):
+                        # Anything in a directory will have a prefix of
+                        # this current directory and will come before the
+                        # remaining contents in this directory.  This
+                        # means we need to recurse into this sub directory
+                        # before yielding the rest of this directory's
+                        # contents.
+                        for x in self.list_files(file_path, dir_op):
+                            yield x
+                    else:
+                        size, last_update = get_file_stat(file_path)
+                        yield file_path, size, last_update
 
     def normalize_sort(self, names, os_sep, character):
         """
@@ -202,16 +204,23 @@ def normalize_sort(self, names, os_sep, character):
         """
         names.sort(key=lambda item: item.replace(os_sep, character))
 
-    def _check_paths_decoded(self, path, names):
-        # We can get a UnicodeDecodeError if we try to listdir(<unicode>) and
-        # can't decode the contents with sys.getfilesystemencoding().  In this
-        # case listdir() returns the bytestring, which means that
-        # join(<unicode>, <str>) could raise a UnicodeDecodeError.  When this
-        # happens we raise a FileDecodingError that provides more information
-        # into what's going on.
-        for name in names:
-            if not isinstance(name, six.text_type):
-                raise FileDecodingError(path, name)
+    def should_ignore_file_with_decoding_warnings(self, dirname, filename):
+        """
+        Return True if the file should be skipped.  listdir(<unicode>)
+        returns a bytestring for any name it cannot decode with
+        sys.getfilesystemencoding(), so join(<unicode>, <str>) could raise
+        a UnicodeDecodeError.  Such a file is skipped and a warning built
+        from a FileDecodingError is put on the result queue; any other
+        file is skipped only if should_ignore_file() says so.
+        """
+        if not isinstance(filename, six.text_type):
+            decoding_error = FileDecodingError(dirname, filename)
+            warning = create_warning(repr(filename),
+                                     decoding_error.error_message)
+            self.result_queue.put(warning)
+            return True
+        path = os.path.join(dirname, filename)
+        return self.should_ignore_file(path)
 
     def should_ignore_file(self, path):
         """

diff --git a/tests/unit/customizations/s3/test_executor.py b/tests/unit/customizations/s3/test_executor.py
@@ -23,6 +23,7 @@
 from awscli.customizations.s3.executor import IOWriterThread
 from awscli.customizations.s3.executor import ShutdownThreadRequest
 from awscli.customizations.s3.executor import Executor, PrintThread
+from awscli.customizations.s3.filegenerator import FileDecodingError
 from awscli.customizations.s3.utils import IORequest, IOCloseRequest, \
     PrintTask
 
@@ -204,3 +205,16 @@ def test_print_warning_only_show_errors(self):
                              quiet=False, only_show_errors=True)
         self.assert_expected_output(print_task, 'Bad File.', thread,
                                     'sys.stderr')
+
+    def test_print_decoding_error_message(self):
+        # This ensures that if the message being passed to the print task
+        # comes from a decoding error, it can still be output properly.
+        decoding_error = FileDecodingError('temp',
+                                           b'\xe2\x9c\x93')
+        print_task = PrintTask(message=decoding_error.error_message,
+                               warning=True)
+        thread = PrintThread(result_queue=self.result_queue,
+                             quiet=False, only_show_errors=False)
+        self.assert_expected_output(print_task,
+                                    decoding_error.error_message,
+                                    thread, 'sys.stderr')
diff --git a/tests/unit/customizations/s3/test_filegenerator.py b/tests/unit/customizations/s3/test_filegenerator.py
@@ -415,8 +415,14 @@ def test_error_raised_on_decoding_error(self, listdir_mock):
         file_generator = FileGenerator(None, None, None)
         # utf-8 encoding for U+2713.
         listdir_mock.return_value = [b'\xe2\x9c\x93']
-        with self.assertRaises(FileDecodingError):
-            list(file_generator.list_files(self.directory, dir_op=True))
+        list(file_generator.list_files(self.directory, dir_op=True))
+        # Ensure the warning was added to the result queue and that the
+        # file is being skipped.
+        self.assertFalse(file_generator.result_queue.empty())
+        warning_message = file_generator.result_queue.get()
+        self.assertIn("warning: Skipping file ", warning_message.message)
+        self.assertIn("Please check your locale settings.",
+                      warning_message.message)
 
     def test_list_files_is_in_sorted_order(self):
         p = os.path.join