Skip to content

Commit

Permalink
[build_manager] add support for remote zip
Browse files Browse the repository at this point in the history
This adds support for remote ZIP.

As of now, performance is quite good locally, and the read-ahead
mechanism should keep performance reasonable. Also, given that the
ClusterFuzz bots have HDDs, the numbers might be even better there, as
we only write to disk when unpacking the build.

The memory consumption of this new feature is constant: it uses at most
(and most of the time) 50 MB of RAM.
  • Loading branch information
paulsemel committed Sep 20, 2024
1 parent 7684841 commit cb662bd
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 33 deletions.
18 changes: 10 additions & 8 deletions src/clusterfuzz/_internal/bot/fuzzers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
EXTRA_BUILD_DIR = '__extra_build'


def is_fuzz_target_local(file_path, file_handle=None):
def is_fuzz_target_local(file_path, file_opener=None):
"""Returns whether |file_path| is a fuzz target binary (local path)."""
if '@' in file_path:
# GFT targets often have periods in the name that get misinterpreted as an
Expand All @@ -53,7 +53,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
# Ignore files with disallowed extensions (to prevent opening e.g. .zips).
return False

if not file_handle and not os.path.exists(file_path):
if not file_opener and not os.path.exists(file_path):
# Ignore non-existent files for cases when we don't have a file handle.
return False

Expand All @@ -72,8 +72,13 @@ def is_fuzz_target_local(file_path, file_handle=None):
logs.warning('Tried to read from non-regular file: %s.' % file_path)
return False

# Use already provided file handle or open the file.
local_file_handle = file_handle or open(file_path, 'rb')
# Either use the file opener or open the file ourselves.
if file_opener:
local_file_handle = file_opener.try_open(file_path)
if not local_file_handle:
return False
else:
local_file_handle = open(file_path, 'rb')

result = False
for pattern in FUZZ_TARGET_SEARCH_BYTES:
Expand All @@ -84,10 +89,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
if result:
break

if not file_handle:
# If this local file handle is owned by our function, close it now.
# Otherwise, it is caller's responsibility.
local_file_handle.close()
local_file_handle.close()

return result

Expand Down
42 changes: 38 additions & 4 deletions src/clusterfuzz/_internal/build_management/build_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import abc
import os
import re
from typing import BinaryIO
from typing import Callable
from typing import List
Expand Down Expand Up @@ -51,6 +52,8 @@
'src_root',
]

CHROMIUM_GS_MATCHER = re.compile(r'gs://(chromium-.*\.zip)')


class BuildArchive(archive.ArchiveReader):
"""Abstract class for representing a build archive. This is mostly an
Expand Down Expand Up @@ -190,13 +193,13 @@ def list_fuzz_targets(self) -> List[str]:
from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils

for archive_file in self.list_members():
file_content = self.try_open(archive_file.name)
if fuzzer_utils.is_fuzz_target_local(archive_file.name, file_content):
# file_content = self.try_open(archive_file.name)
if fuzzer_utils.is_fuzz_target_local(archive_file.name, self):
fuzz_target = fuzzer_utils.normalize_target_name(archive_file.name)
self._fuzz_targets[fuzz_target] = archive_file.name

if file_content:
file_content.close()
# if file_content:
# file_content.close()

return list(self._fuzz_targets.keys())

Expand Down Expand Up @@ -328,3 +331,34 @@ def open(archive_path: str) -> BuildArchive:
if reader.file_exists(args_gn_path):
return ChromeBuildArchive(reader)
return DefaultBuildArchive(reader)


def open_uri(uri: str) -> BuildArchive:
  """Opens a build archive over HTTP. This is only compatible with chromium as
  of now.

  Args:
    uri: the gs:// storage URI.

  Returns:
    the build archive.

  Raises:
    ValueError: if |uri| is not a chromium gs:// zip URI, or if the
      corresponding HTTP endpoint cannot be read with range requests.
  """
  # Validate explicitly instead of with `assert`: assertions are stripped when
  # Python runs with -O, which would let a non-matching URI reach
  # `match.group(1)` below and fail with an unrelated AttributeError.
  match = CHROMIUM_GS_MATCHER.match(uri)
  if match is None or not unzip_over_http_compatible(uri):
    raise ValueError(f'{uri} cannot be opened as a remote ZIP archive.')

  http_uri = f'https://commondatastorage.googleapis.com/{match.group(1)}'
  reader = archive.ZipArchiveReader(archive.HttpZipFile(http_uri))
  # The presence of an `args.gn` file at the archive root is what selects the
  # Chrome-specific archive implementation.
  args_gn_path = os.path.join(reader.root_dir(), 'args.gn')
  if reader.file_exists(args_gn_path):
    return ChromeBuildArchive(reader)
  return DefaultBuildArchive(reader)


def unzip_over_http_compatible(build_url: str) -> bool:
  """Tells whether the build at |build_url| can be unzipped over HTTP.

  For now, only chromium-style gs:// zip URLs are considered; the matching
  HTTP endpoint is then probed through `HttpZipFile.is_uri_compatible`.
  """
  gs_match = CHROMIUM_GS_MATCHER.match(build_url)
  if gs_match is None:
    return False
  object_path = gs_match.group(1)
  return archive.HttpZipFile.is_uri_compatible(
      f'https://commondatastorage.googleapis.com/{object_path}')
49 changes: 30 additions & 19 deletions src/clusterfuzz/_internal/build_management/build_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,27 +425,38 @@ def _unpack_build(self,
_handle_unrecoverable_error_on_windows()
return False

# Download build archive locally.
build_local_archive = os.path.join(build_dir, os.path.basename(build_url))

# Make the disk space necessary for the archive available.
archive_size = storage.get_object_size(build_url)
if archive_size is not None and not _make_space(archive_size,
base_build_dir):
shell.clear_data_directories()
logs.log_fatal_and_exit(
'Failed to make space for download. '
'Cleared all data directories to free up space, exiting.')

logs.info(f'Downloading build from {build_url}.')
try:
storage.copy_file_from(build_url, build_local_archive)
except Exception as e:
logs.error(f'Unable to download build from {build_url}: {e}')
return False
should_download = not utils.is_chromium(
) or not build_archive.unzip_over_http_compatible(build_url)

if should_download:
# Download build archive locally.
build_local_archive = os.path.join(build_dir, os.path.basename(build_url))

# Make the disk space necessary for the archive available.
archive_size = storage.get_object_size(build_url)
if archive_size is not None and not _make_space(archive_size,
base_build_dir):
shell.clear_data_directories()
logs.log_fatal_and_exit(
'Failed to make space for download. '
'Cleared all data directories to free up space, exiting.')

logs.info(f'Downloading build from {build_url}.')
try:
storage.copy_file_from(build_url, build_local_archive)
except Exception as e:
logs.error(f'Unable to download build from {build_url}: {e}')
return False

opener = build_archive.open
file_arg = build_local_archive
else:
logs.info("Using an archive over HTTP, skipping archive download.")
opener = build_archive.open_uri
file_arg = build_url

try:
with build_archive.open(build_local_archive) as build:
with opener(file_arg) as build:
unpack_everything = environment.get_value(
'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')
if not unpack_everything:
Expand Down
100 changes: 100 additions & 0 deletions src/clusterfuzz/_internal/system/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
from typing import Callable
from typing import List
from typing import Optional
from typing import Sequence
from typing import Union
from urllib import request
import zipfile

from clusterfuzz._internal.metrics import logs
Expand All @@ -39,6 +41,8 @@
ARCHIVE_FILE_EXTENSIONS = (
ZIP_FILE_EXTENSIONS + TAR_FILE_EXTENSIONS + LZMA_FILE_EXTENSIONS)

REMOTE_HTTP_MIN_READ_SIZE = 50 * 1024 * 1024 # 50 MB

StrBytesPathLike = Union[str, bytes, os.PathLike]
MatchCallback = Callable[[str], bool]

Expand Down Expand Up @@ -407,6 +411,102 @@ class ArchiveType:
TAR_LZMA = 3


@dataclasses.dataclass
class CacheBlock:
  """Represents a cache entry for the HttpZipFile.

  A block holds the bytes of one contiguous range of the remote file.
  """
  # Offset, in the remote file, of the first cached byte.
  start: int
  # Offset, in the remote file, of the last cached byte (inclusive).
  end: int
  # The cached payload, i.e. the bytes of the range [start, end].
  content: bytes


class HttpZipFile:
  """This class is a very simple file-object representation of a file over HTTP.
  It uses the 'Accept-Ranges' feature of HTTP to fetch parts (or all) of the
  file.

  Only the read-only, seekable subset of the file-object protocol is
  implemented (`seekable`, `seek`, `tell` and `read`). Reads no larger than
  REMOTE_HTTP_MIN_READ_SIZE go through a single read-ahead cache block; larger
  reads are fetched directly and are not cached.
  """

  @staticmethod
  def _supports_byte_ranges(resp) -> bool:
    """Returns whether |resp| advertises support for byte range requests."""
    accept_ranges = resp.getheader('Accept-Ranges')
    if accept_ranges is None:
      return False
    # The header is a comma-separated list of range units. 'none' explicitly
    # means that ranges are unsupported, so its mere presence is not enough.
    units = [unit.strip().lower() for unit in accept_ranges.split(',')]
    return 'bytes' in units

  @staticmethod
  def is_uri_compatible(uri: str) -> bool:
    """Returns whether |uri| can be read through byte range requests.

    This is a best-effort probe: any failure to reach or parse |uri| is
    reported as "not compatible" rather than raised.
    """
    try:
      with request.urlopen(request.Request(uri, method="HEAD")) as res:
        return HttpZipFile._supports_byte_ranges(res)
    except Exception:  # pylint: disable=broad-except
      # Deliberately broad (but no longer a bare `except:`, which would also
      # swallow KeyboardInterrupt/SystemExit): callers fall back to another
      # code path when this returns False.
      return False

  def __init__(self, uri):
    """Initializes the file object by issuing a HEAD request to |uri|.

    Args:
      uri: the HTTP(S) URI of the remote file.

    Raises:
      ValueError: if the server does not advertise byte range support.
    """
    self.uri = uri
    with request.urlopen(request.Request(self.uri, method="HEAD")) as resp:
      self.file_size = int(resp.getheader('Content-Length', default=0))
      ranges_supported = self._supports_byte_ranges(resp)
    # An empty block is encoded with end < start, so that no offset (not even
    # offset 0) is wrongly considered to be cached before the first fetch.
    self._current_block = CacheBlock(0, -1, b'')
    self._pos = 0
    if not ranges_supported:
      # Explicit error rather than `assert`, which is stripped under -O.
      raise ValueError(f'{uri} does not support HTTP range requests.')

  def seekable(self) -> bool:
    """Whether this is seekable.
    """
    return True

  def seek(self, offset: int, from_what: int = 0) -> int:
    """Provides a seek implementation.

    The resulting position is clamped to the range [0, file size].

    Args:
      offset: the offset
      from_what: from where the offset should be computed. Defaults to 0.
        0 is the start of the file, 1 the current position and 2 the end.

    Returns:
      the new cursor position.

    Raises:
      ValueError: if |from_what| is not 0, 1 or 2.
    """
    if from_what == 0:
      new_pos = offset
    elif from_what == 1:
      new_pos = self._pos + offset
    elif from_what == 2:
      new_pos = self.file_size + offset
    else:
      raise ValueError(f'Invalid whence value: {from_what}')
    self._pos = max(min(new_pos, self.file_size), 0)
    return self._pos

  def tell(self) -> int:
    """Provides a tell implementation. Returns the current cursor position.

    Returns:
      the current cursor position.
    """
    return self._pos

  def _fetch_from_http(self, start, end) -> bytes:
    """Fetches the inclusive byte range [start, end] from the remote file."""
    req = request.Request(
        self.uri, method="GET", headers={'Range': f'bytes={start}-{end}'})
    # Close the response as soon as the payload has been read.
    with request.urlopen(req) as resp:
      return resp.read()

  def _fetch_from_cache(self, start, end) -> bytes:
    """Returns the inclusive byte range [start, end], refilling the cache block
    (with read-ahead) when the range is not entirely cached."""
    block = self._current_block
    if block.start > start or block.end < end:
      read_ahead_end = min(self.file_size - 1, end + REMOTE_HTTP_MIN_READ_SIZE)
      content = self._fetch_from_http(start, read_ahead_end)
      # Record what was actually received, so that a short response is never
      # mistaken for cached data later on.
      block = CacheBlock(start, start + len(content) - 1, content)
      self._current_block = block
    inner_start = start - block.start
    inner_end = end - block.start
    return block.content[inner_start:inner_end + 1]

  def read(self, size=-1) -> bytes:
    """Read into this file-object.

    Args:
      size: the size of the read. If not specified (or None, or negative),
        reads everything up to the end of the file.

    Returns:
      the bytes read, which is empty when the cursor is at the end of the file
      or when |size| is 0.
    """
    remaining = self.file_size - self._pos
    if size is None or size < 0:
      size = remaining
    read_size = min(remaining, size)
    if read_size <= 0:
      # Nothing to read. Issuing a request here would ask the server for an
      # empty or out-of-bounds range.
      return b''
    end_range = self._pos + read_size - 1
    if read_size > REMOTE_HTTP_MIN_READ_SIZE:
      # Large reads bypass the cache.
      content = self._fetch_from_http(self._pos, end_range)
    else:
      content = self._fetch_from_cache(self._pos, end_range)
    # Advance by what was actually returned to stay consistent on short reads.
    self._pos += len(content)
    return content


def get_archive_type(archive_path: str) -> ArchiveType:
"""Get the type of the archive.
Expand Down
4 changes: 2 additions & 2 deletions src/clusterfuzz/fuzz/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ def get_engine(name):
return engine_impl


def is_fuzz_target(file_path, file_handle=None):
def is_fuzz_target(file_path, file_opener=None):
"""Returns whether |file_path| is a fuzz target."""
return utils.is_fuzz_target_local(file_path, file_handle)
return utils.is_fuzz_target_local(file_path, file_opener)


def get_fuzz_targets(directory):
Expand Down

0 comments on commit cb662bd

Please sign in to comment.