Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[build_manager] add support for remote zip #4263

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 27 additions & 21 deletions src/clusterfuzz/_internal/bot/fuzzers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
# limitations under the License.
"""Fuzzer utils."""

import functools
import os
import re
import stat
import tempfile
from typing import Callable
from typing import Optional

from clusterfuzz._internal.base import utils
from clusterfuzz._internal.metrics import logs
Expand All @@ -30,7 +33,7 @@
EXTRA_BUILD_DIR = '__extra_build'


def is_fuzz_target_local(file_path, file_handle=None):
def is_fuzz_target(file_path, file_opener: Optional[Callable] = None):
"""Returns whether |file_path| is a fuzz target binary (local path)."""
if '@' in file_path:
# GFT targets often have periods in the name that get misinterpreted as an
Expand All @@ -53,7 +56,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
# Ignore files with disallowed extensions (to prevent opening e.g. .zips).
return False

if not file_handle and not os.path.exists(file_path):
if not file_opener and not os.path.exists(file_path):
# Ignore non-existent files for cases when we don't have a file handle.
return False

Expand All @@ -72,24 +75,27 @@ def is_fuzz_target_local(file_path, file_handle=None):
logs.warning('Tried to read from non-regular file: %s.' % file_path)
return False

# Use already provided file handle or open the file.
local_file_handle = file_handle or open(file_path, 'rb')

result = False
for pattern in FUZZ_TARGET_SEARCH_BYTES:
# TODO(metzman): Bound this call so we don't read forever if something went
# wrong.
local_file_handle.seek(0)
result = utils.search_bytes_in_file(pattern, local_file_handle)
if result:
break

if not file_handle:
# If this local file handle is owned by our function, close it now.
# Otherwise, it is caller's responsibility.
local_file_handle.close()

return result
# Either use the file opener or open the file ourselves.
if not file_opener:
file_opener = functools.partial(open, mode='rb')
try:
with file_opener(file_path) as file_handle:
result = False
for pattern in FUZZ_TARGET_SEARCH_BYTES:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note for future work: If searching these bytes takes a long time, we might want to search for them "in parallel" by pushing the pattern list into search_bytes_in_file, which can scan each chunk of the file for all patterns (avoiding the need to re-read the file X times) or even reach for something like Aho-Corasick: https://pyahocorasick.readthedocs.io/en/latest/

# TODO(metzman): Bound this call so we don't read forever if something
# went wrong.
file_handle.seek(0)
result = utils.search_bytes_in_file(pattern, file_handle)
if result:
break

file_handle.close()

return result
except Exception as e:
# In case we could not open the file, we consider it's not a fuzzer.
logs.warning(f'Could not open {file_path}: {e}')
return False
paulsemel marked this conversation as resolved.
Show resolved Hide resolved


def get_fuzz_targets_local(path):
Expand All @@ -103,7 +109,7 @@ def get_fuzz_targets_local(path):
continue

file_path = os.path.join(root, filename)
if is_fuzz_target_local(file_path):
if is_fuzz_target(file_path):
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
fuzz_target_paths.append(file_path)

return fuzz_target_paths
Expand Down
60 changes: 45 additions & 15 deletions src/clusterfuzz/_internal/build_management/build_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,14 +190,10 @@ def list_fuzz_targets(self) -> List[str]:
from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils

for archive_file in self.list_members():
file_content = self.try_open(archive_file.name)
if fuzzer_utils.is_fuzz_target_local(archive_file.name, file_content):
if fuzzer_utils.is_fuzz_target(archive_file.name, self.open):
fuzz_target = fuzzer_utils.normalize_target_name(archive_file.name)
self._fuzz_targets[fuzz_target] = archive_file.name

if file_content:
file_content.close()

return list(self._fuzz_targets.keys())

def unpacked_size(self, fuzz_target: Optional[str] = None) -> int:
Expand Down Expand Up @@ -299,23 +295,19 @@ def get_target_dependencies(
return res


# pylint: disable=redefined-builtin
def open(archive_path: str) -> BuildArchive:
"""Opens the archive and gets the appropriate build archive based on the
`archive_path`. The resulting object is usable as a normal archive reader,
but provides additional feature related to build handling.
def open_with_reader(reader: archive.ArchiveReader) -> BuildArchive:
"""Opens the archive and gets the appropriate build archive based on the
provided archive information.

Args:
archive_path: the path to the archive.
reader: the archive reader.

Raises:
If the file could not be opened or if the archive type cannot be handled.
If the archive reader cannot be handled.

Returns:
the build archive.
The build archive.
"""
reader = archive.open(archive_path)

# Unfortunately, there is no good heuristic for determining which build
# archive implementation to use.
# Hopefully, we can search in the archive whether some files are present and
Expand All @@ -328,3 +320,41 @@ def open(archive_path: str) -> BuildArchive:
if reader.file_exists(args_gn_path):
return ChromeBuildArchive(reader)
return DefaultBuildArchive(reader)


def open(archive_path: str) -> BuildArchive:  # pylint: disable=redefined-builtin
  """Opens a build archive from a local path.

  Convenience wrapper that first opens a generic archive reader for
  `archive_path`, then wraps it in the appropriate `BuildArchive`
  implementation. The result behaves like a normal archive reader while
  adding build-specific helpers.

  Args:
    archive_path: the path to the archive.

  Raises:
    If the file could not be opened or if the archive type cannot be handled.

  Returns:
    The build archive.
  """
  return open_with_reader(archive.open(archive_path))


def open_uri(uri: str) -> BuildArchive:
  """Opens a build archive served over HTTP. This is only compatible with
  chromium as of now.

  The remote zip is accessed through `archive.HttpZipFile`, so the archive
  contents are read on demand instead of being downloaded up front.

  Args:
    uri: the URI pointing to the zip file.

  Returns:
    The build archive.
  """
  http_file = archive.HttpZipFile(uri)
  return open_with_reader(archive.ZipArchiveReader(http_file))


def unzip_over_http_compatible(build_url: str) -> bool:
  """Returns whether the build URL is compatible with unzipping over HTTP.

  Args:
    build_url: the URL of the build archive.

  Returns:
    True if the archive at `build_url` can be read remotely over HTTP.
  """
  compatible = archive.HttpZipFile.is_uri_compatible(build_url)
  return compatible
128 changes: 102 additions & 26 deletions src/clusterfuzz/_internal/build_management/build_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
"""Build manager."""

from collections import namedtuple
import contextlib
import os
import re
import shutil
import subprocess
import time
from typing import Optional

from clusterfuzz._internal.base import errors
from clusterfuzz._internal.base import utils
Expand Down Expand Up @@ -402,23 +404,19 @@ def _post_setup_success(self, update_revision=True):
if instrumented_library_paths:
self._patch_rpaths(instrumented_library_paths)

def _unpack_build(self, base_build_dir, build_dir, build_url):
"""Unpacks a build from a build url into the build directory."""
# Track time taken to unpack builds so that it doesn't silently regress.
start_time = time.time()

logs.info(f'Unpacking build from {build_url} into {build_dir}.')
@contextlib.contextmanager
def _download_and_open_build_archive(self, base_build_dir: str,
build_dir: str, build_url: str):
"""Downloads the build archive at `build_url` and opens it.

# Free up memory.
utils.python_gc()

# Remove the current build.
logs.info(f'Removing build directory {build_dir}.')
if not shell.remove_directory(build_dir, recreate=True):
logs.error(f'Unable to clear build directory {build_dir}.')
_handle_unrecoverable_error_on_windows()
return False
Args:
base_build_dir: the base build directory
build_dir: the current build directory
build_url: the build URL

Yields:
the build archive
"""
# Download build archive locally.
build_local_archive = os.path.join(build_dir, os.path.basename(build_url))

Expand All @@ -431,15 +429,83 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
'Failed to make space for download. '
'Cleared all data directories to free up space, exiting.')

logs.info(f'Downloading build from {build_url}.')
logs.info(f'Downloading build from {build_url} to {build_local_archive}.')
try:
storage.copy_file_from(build_url, build_local_archive)
except Exception as e:
logs.error(f'Unable to download build from {build_url}: {e}')
return False
raise

try:
with build_archive.open(build_local_archive) as build:
yield build
finally:
shell.remove_file(build_local_archive)

def _open_build_archive(self, base_build_dir: str, build_dir: str,
                        build_url: str, http_build_url: Optional[str],
                        unpack_everything: Optional[bool]):
  """Gets a handle on a build archive for the current build.

  Depending on the provided parameters, this either uses the remote HTTP
  archive directly (no download) or downloads the build archive into the
  build directory first.

  Args:
    base_build_dir: the base build directory.
    build_dir: the current build directory.
    build_url: the build URL.
    http_build_url: the HTTP build URL.
    unpack_everything: whether we should unpack the whole archive or try
      selective unpacking.

  Raises:
    if an error occurred while accessing the file over HTTP or while
    downloading the file on disk.

  Returns:
    the build archive.
  """
  # Remote unzipping is only worthwhile when we unpack selectively; it must
  # also be explicitly enabled and the HTTP URL must support it.
  remote_allowed = environment.get_value(
      'ALLOW_UNPACK_OVER_HTTP', default_value=False)
  use_remote_zip = (
      remote_allowed and not unpack_everything and http_build_url and
      build_archive.unzip_over_http_compatible(http_build_url))

  if use_remote_zip:
    logs.info("Opening an archive over HTTP, skipping archive download.")
    assert http_build_url
    return build_archive.open_uri(http_build_url)

  return self._download_and_open_build_archive(base_build_dir, build_dir,
                                               build_url)

def _unpack_build(self,
base_build_dir,
build_dir,
build_url,
http_build_url=None):
"""Unpacks a build from a build url into the build directory."""
# Track time taken to unpack builds so that it doesn't silently regress.
start_time = time.time()

unpack_everything = environment.get_value(
'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')

logs.info(f'Unpacking build from {build_url} into {build_dir}.')

# Free up memory.
utils.python_gc()

# Remove the current build.
logs.info(f'Removing build directory {build_dir}.')
if not shell.remove_directory(build_dir, recreate=True):
logs.error(f'Unable to clear build directory {build_dir}.')
_handle_unrecoverable_error_on_windows()
return False

try:
with self._open_build_archive(base_build_dir, build_dir, build_url,
http_build_url, unpack_everything) as build:
unpack_everything = environment.get_value(
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')

Expand All @@ -463,8 +529,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
'Cleared all data directories to free up space, exiting.')

# Unpack the local build archive.
logs.info(
f'Unpacking build archive {build_local_archive} to {build_dir}.')
logs.info(f'Unpacking build archive {build_url} to {build_dir}.')
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
trusted = not utils.is_oss_fuzz()

build.unpack(
Expand All @@ -473,7 +538,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
trusted=trusted)

except Exception as e:
logs.error(f'Unable to unpack build archive {build_local_archive}: {e}')
logs.error(f'Unable to unpack build archive {build_url}: {e}')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this pass linting? I had the linter recently tell me to use lazy string interpolation for logging calls:

logs.error('Unable to unpack build archive %s: %s', build_url, e)

Maybe because this uses the logs module you're safe.

return False

if unpack_everything:
Expand All @@ -484,9 +549,6 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
partial_build_file_path = os.path.join(build_dir, PARTIAL_BUILD_FILE)
utils.write_data_to_file('', partial_build_file_path)

# No point in keeping the archive around.
shell.remove_file(build_local_archive)

elapsed_time = time.time() - start_time
elapsed_mins = elapsed_time / 60.
log_func = logs.warning if elapsed_time > UNPACK_TIME_LIMIT else logs.info
Expand Down Expand Up @@ -605,10 +667,20 @@ def __init__(self,
revision,
build_url,
build_prefix='',
fuzz_target=None):
fuzz_target=None,
http_build_url=None):
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
"""RegularBuild constructor. See Build constructor for other parameters.

Args:
http_build_url: the http build URL. E.g.
http://storage.com/foo/bar.zip. Defaults to None.
build_url: the GCS bucket URL where the build is stored. E.g.
gs://foo/bar.zip.
"""
super().__init__(
base_build_dir, revision, build_prefix, fuzz_target=fuzz_target)
self.build_url = build_url
self.http_build_url = http_build_url

if build_prefix:
self.build_dir_name = build_prefix.lower()
Expand All @@ -630,7 +702,7 @@ def setup(self):
build_update = not self.exists()
if build_update:
if not self._unpack_build(self.base_build_dir, self.build_dir,
self.build_url):
self.build_url, self.http_build_url):
return False

logs.info('Retrieved build r%d.' % self.revision)
Expand Down Expand Up @@ -1116,6 +1188,9 @@ def setup_regular_build(revision,

return None

# build_url points to a GCP bucket, and we're only converting it to its HTTP
# endpoint so that we can use remote unzipping.
http_build_url = build_url.replace('gs://', 'https://storage.googleapis.com/')
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
base_build_dir = _base_build_dir(bucket_path)

build_class = RegularBuild
Expand All @@ -1133,7 +1208,8 @@ def setup_regular_build(revision,
revision,
build_url,
build_prefix=build_prefix,
fuzz_target=fuzz_target)
fuzz_target=fuzz_target,
http_build_url=http_build_url)
if build.setup():
result = build
else:
Expand Down
Loading
Loading