Fix bug when filtering s3 locations
The pattern is evaluated against the entire bucket path.  This doesn't
matter for suffix searches, but for a prefix search, you'd have to
include the bucket name.  This is inconsistent with filtering local
files.  Now they're both consistent.  Given:

  --exclude 'foo*'

This will filter any full path that starts with foo.  So locally:

  rootdir/
    foo.txt      # yes
    foo1.txt     # yes
    foo/bar.txt  # yes
    bar.txt      # no

And on s3, we now have the same results:

  bucket/
    foo.txt      # yes
    foo1.txt     # yes
    foo/bar.txt  # yes
    bar.txt      # no
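
A minimal sketch with plain fnmatch (not the code path the CLI actually
runs; the paths are the hypothetical ones from the listings above)
reproduces these results:

  import fnmatch

  pattern = 'foo*'
  paths = ['rootdir/foo.txt', 'rootdir/foo1.txt', 'rootdir/foo/bar.txt',
           'rootdir/bar.txt', 'bucket/foo.txt', 'bucket/foo1.txt',
           'bucket/foo/bar.txt', 'bucket/bar.txt']
  for path in paths:
      # Join the root (local dir or bucket name) onto the user's pattern,
      # then match against the full path.  Note that '*' in fnmatch also
      # crosses '/', which is why foo/bar.txt is filtered as well.
      root = path.split('/')[0]
      print(path, fnmatch.fnmatch(path, root + '/' + pattern))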

I also added debug logs to help customers troubleshoot why their
filters aren't working the way they expect.
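
The new debug messages come from the standard logging module, so they can
be surfaced with an ordinary logging setup (a sketch, not part of this
commit; the logger name follows from logging.getLogger(__name__) in
awscli/customizations/s3/filters.py):

  import logging

  # Turn up only the filters module so the output stays readable; the
  # CLI's global --debug flag should also surface these messages.
  logging.basicConfig()
  logging.getLogger('awscli.customizations.s3.filters').setLevel(logging.DEBUG)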
jamesls committed Dec 5, 2013
1 parent 8c2941a commit 5063e56
Showing 2 changed files with 60 additions and 4 deletions.
18 changes: 15 additions & 3 deletions awscli/customizations/s3/filters.py
@@ -10,10 +10,14 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import logging
 import fnmatch
 import os
 
 
+LOG = logging.getLogger(__name__)
+
+
 class Filter(object):
     """
     This is a universal exclude/include filter.
@@ -55,13 +59,21 @@ def call(self, file_infos):

                 else:
                     path_pattern = pattern[1].replace(os.sep, '/')
-                    full_path_pattern = path_pattern
-
+                    full_path_pattern = os.path.join(file_path.split('/')[0],
+                                                     path_pattern)
                 is_match = fnmatch.fnmatch(file_path, full_path_pattern)
                 if is_match and pattern_type == '--include':
                     file_status = (file_info, True)
+                    LOG.debug("%s matched include filter: %s",
+                              file_path, full_path_pattern)
                 elif is_match and pattern_type == '--exclude':
                     file_status = (file_info, False)
-
+                    LOG.debug("%s matched exclude filter: %s",
+                              file_path, full_path_pattern)
+                else:
+                    LOG.debug("%s did not match %s filter: %s",
+                              file_path, pattern_type[2:], full_path_pattern)
+            LOG.debug("=%s final filtered status, should_include: %s",
+                      file_path, file_status[1])
             if file_status[1]:
                 yield file_info
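
The effect of the new full_path_pattern line in isolation (a sketch,
assuming an s3-style src path of the form 'bucket/key'):

  import fnmatch
  import os

  file_path = 'bucket/foo.txt'   # s3 src: bucket name first, no leading '/'
  path_pattern = 'foo*'          # user-supplied --exclude/--include pattern

  # Before: the pattern was matched against the full 'bucket/key' string,
  # so a prefix pattern only matched if it also named the bucket.
  print(fnmatch.fnmatch(file_path, path_pattern))        # False

  # After: the bucket (first path component) is joined onto the pattern.
  full_path_pattern = os.path.join(file_path.split('/')[0], path_pattern)
  print(fnmatch.fnmatch(file_path, full_path_pattern))   # True
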
46 changes: 45 additions & 1 deletion tests/unit/customizations/s3/test_filters.py
@@ -32,10 +32,11 @@ def setUp(self):

     def file_info(self, filename, src_type='local'):
         if src_type == 'local':
+            filename = os.path.abspath(filename)
             dest_type = 's3'
         else:
             dest_type = 'local'
-        return FileInfo(src=os.path.abspath(filename), dest='',
+        return FileInfo(src=filename, dest='',
                         compare_key='', size=10,
                         last_update=0, src_type=src_type,
                         dest_type=dest_type, operation_name='',
@@ -84,6 +85,49 @@ def test_include_exclude(self):
         matched_files = list(exclude_all_filter.call(self.s3_files))
         self.assertEqual(matched_files, [])
 
+    def test_prefix_filtering_consistent(self):
+        # The same filter should work for both local and remote files.
+        # So if I have a directory with 2 files:
+        local_files = [
+            self.file_info('test1.txt'),
+            self.file_info('nottest1.txt'),
+        ]
+        # And the same 2 files remote (note that the way FileInfo objects
+        # are constructed, we'll have the bucket name but no leading '/'
+        # character):
+        remote_files = [
+            self.file_info('bucket/test1.txt', src_type='s3'),
+            self.file_info('bucket/nottest1.txt', src_type='s3'),
+        ]
+        # If I apply the filter to the local files:
+        exclude_filter = Filter({'filters': [['--exclude', 't*']]})
+        filtered_files = list(exclude_filter.call(local_files))
+        self.assertEqual(len(filtered_files), 1)
+        self.assertEqual(os.path.basename(filtered_files[0].src),
+                         'nottest1.txt')
+
+        # I should get the same result if I apply the same filter to s3
+        # objects.
+        same_filtered_files = list(exclude_filter.call(remote_files))
+        self.assertEqual(len(same_filtered_files), 1)
+        self.assertEqual(os.path.basename(same_filtered_files[0].src),
+                         'nottest1.txt')
+
+    def test_bucket_exclude_with_prefix(self):
+        s3_files = [
+            self.file_info('bucket/dir1/key1.txt', src_type='s3'),
+            self.file_info('bucket/dir1/key2.txt', src_type='s3'),
+            self.file_info('bucket/dir1/notkey3.txt', src_type='s3'),
+        ]
+        filtered_files = list(
+            Filter({'filters': [['--exclude', 'dir1/*']]}).call(s3_files))
+        self.assertEqual(filtered_files, [])
+
+        key_files = list(
+            Filter({'filters': [['--exclude', 'dir1/key*']]}).call(s3_files))
+        self.assertEqual(len(key_files), 1)
+        self.assertEqual(key_files[0].src, 'bucket/dir1/notkey3.txt')
+
 
 if __name__ == "__main__":
     unittest.main()
