[build_manager] add support for remote zip

This adds support for remote ZIP archives. As of now, performance is quite good locally, and the read-ahead mechanism should keep performance reasonable. Also, given that the ClusterFuzz bots use HDDs, the numbers might be even better there, as we only write to disk when unpacking the build. The memory consumption of this new feature is constant: it uses at most (and most of the time) 50 MB of RAM.
google · Sep 20, 2024 · cb662bd · cb662bd
1 parent 7684841
commit cb662bd
Show file tree

Hide file tree

Showing 5 changed files with 180 additions and 33 deletions.
diff --git a/src/clusterfuzz/_internal/bot/fuzzers/utils.py b/src/clusterfuzz/_internal/bot/fuzzers/utils.py
@@ -30,7 +30,7 @@
 EXTRA_BUILD_DIR = '__extra_build'
 
 
-def is_fuzz_target_local(file_path, file_handle=None):
+def is_fuzz_target_local(file_path, file_opener=None):
   """Returns whether |file_path| is a fuzz target binary (local path)."""
   if '@' in file_path:
     # GFT targets often have periods in the name that get misinterpreted as an
@@ -53,7 +53,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
     # Ignore files with disallowed extensions (to prevent opening e.g. .zips).
     return False
 
-  if not file_handle and not os.path.exists(file_path):
+  if not file_opener and not os.path.exists(file_path):
     # Ignore non-existent files for cases when we don't have a file handle.
     return False
 
@@ -72,8 +72,13 @@ def is_fuzz_target_local(file_path, file_handle=None):
     logs.warning('Tried to read from non-regular file: %s.' % file_path)
     return False
 
-  # Use already provided file handle or open the file.
-  local_file_handle = file_handle or open(file_path, 'rb')
+  # Either use the file opener or open the file ourselves.
+  if file_opener:
+    local_file_handle = file_opener.try_open(file_path)
+    if not local_file_handle:
+      return False
+  else:
+    local_file_handle = open(file_path, 'rb')
 
   result = False
   for pattern in FUZZ_TARGET_SEARCH_BYTES:
@@ -84,10 +89,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
     if result:
       break
 
-  if not file_handle:
-    # If this local file handle is owned by our function, close it now.
-    # Otherwise, it is caller's responsibility.
-    local_file_handle.close()
+  local_file_handle.close()
 
   return result
 

diff --git a/src/clusterfuzz/_internal/build_management/build_archive.py b/src/clusterfuzz/_internal/build_management/build_archive.py
@@ -15,6 +15,7 @@
 
 import abc
 import os
+import re
 from typing import BinaryIO
 from typing import Callable
 from typing import List
@@ -51,6 +52,8 @@
     'src_root',
 ]
 
+CHROMIUM_GS_MATCHER = re.compile(r'gs://(chromium-.*\.zip)')
+
 
 class BuildArchive(archive.ArchiveReader):
   """Abstract class for representing a build archive. This is mostly an
@@ -190,13 +193,13 @@ def list_fuzz_targets(self) -> List[str]:
     from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils
 
     for archive_file in self.list_members():
-      file_content = self.try_open(archive_file.name)
-      if fuzzer_utils.is_fuzz_target_local(archive_file.name, file_content):
+      # file_content = self.try_open(archive_file.name)
+      if fuzzer_utils.is_fuzz_target_local(archive_file.name, self):
         fuzz_target = fuzzer_utils.normalize_target_name(archive_file.name)
         self._fuzz_targets[fuzz_target] = archive_file.name
 
-      if file_content:
-        file_content.close()
+      # if file_content:
+      #   file_content.close()
 
     return list(self._fuzz_targets.keys())
 
@@ -328,3 +331,34 @@ def open(archive_path: str) -> BuildArchive:
   if reader.file_exists(args_gn_path):
     return ChromeBuildArchive(reader)
   return DefaultBuildArchive(reader)
+
+
+def open_uri(uri: str) -> BuildArchive:
+  """Opens a build archive over HTTP. This is only compatible with chromium as
+  of now.
+
+  Args:
+      uri: the gs:// storage URI.
+
+  Returns:
+      the build archive.
+  """
+  assert unzip_over_http_compatible(uri)
+  match = CHROMIUM_GS_MATCHER.match(uri)
+  uri = f'https://commondatastorage.googleapis.com/{match.group(1)}'
+  reader = archive.ZipArchiveReader(archive.HttpZipFile(uri))
+  args_gn_path = os.path.join(reader.root_dir(), 'args.gn')
+  if reader.file_exists(args_gn_path):
+    return ChromeBuildArchive(reader)
+  return DefaultBuildArchive(reader)
+
+
+def unzip_over_http_compatible(build_url: str) -> bool:
+  """Whether the build URL is compatible with unzipping over HTTP.
+  As for now, we're only checking for chromium compatible URLs.
+  """
+  match = CHROMIUM_GS_MATCHER.match(build_url)
+  if not match:
+    return False
+  uri = f'https://commondatastorage.googleapis.com/{match.group(1)}'
+  return archive.HttpZipFile.is_uri_compatible(uri)
diff --git a/src/clusterfuzz/_internal/build_management/build_manager.py b/src/clusterfuzz/_internal/build_management/build_manager.py
@@ -425,27 +425,38 @@ def _unpack_build(self,
       _handle_unrecoverable_error_on_windows()
       return False
 
-    # Download build archive locally.
-    build_local_archive = os.path.join(build_dir, os.path.basename(build_url))
-
-    # Make the disk space necessary for the archive available.
-    archive_size = storage.get_object_size(build_url)
-    if archive_size is not None and not _make_space(archive_size,
-                                                    base_build_dir):
-      shell.clear_data_directories()
-      logs.log_fatal_and_exit(
-          'Failed to make space for download. '
-          'Cleared all data directories to free up space, exiting.')
-
-    logs.info(f'Downloading build from {build_url}.')
-    try:
-      storage.copy_file_from(build_url, build_local_archive)
-    except Exception as e:
-      logs.error(f'Unable to download build from {build_url}: {e}')
-      return False
+    should_download = not utils.is_chromium(
+    ) or not build_archive.unzip_over_http_compatible(build_url)
+
+    if should_download:
+      # Download build archive locally.
+      build_local_archive = os.path.join(build_dir, os.path.basename(build_url))
+
+      # Make the disk space necessary for the archive available.
+      archive_size = storage.get_object_size(build_url)
+      if archive_size is not None and not _make_space(archive_size,
+                                                      base_build_dir):
+        shell.clear_data_directories()
+        logs.log_fatal_and_exit(
+            'Failed to make space for download. '
+            'Cleared all data directories to free up space, exiting.')
+
+      logs.info(f'Downloading build from {build_url}.')
+      try:
+        storage.copy_file_from(build_url, build_local_archive)
+      except Exception as e:
+        logs.error(f'Unable to download build from {build_url}: {e}')
+        return False
+
+      opener = build_archive.open
+      file_arg = build_local_archive
+    else:
+      logs.info("Using an archive over HTTP, skipping archive download.")
+      opener = build_archive.open_uri
+      file_arg = build_url
 
     try:
-      with build_archive.open(build_local_archive) as build:
+      with opener(file_arg) as build:
         unpack_everything = environment.get_value(
             'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')
         if not unpack_everything:

diff --git a/src/clusterfuzz/_internal/system/archive.py b/src/clusterfuzz/_internal/system/archive.py
@@ -21,7 +21,9 @@
 from typing import Callable
 from typing import List
 from typing import Optional
+from typing import Sequence
 from typing import Union
+from urllib import request
 import zipfile
 
 from clusterfuzz._internal.metrics import logs
@@ -39,6 +41,8 @@
 ARCHIVE_FILE_EXTENSIONS = (
     ZIP_FILE_EXTENSIONS + TAR_FILE_EXTENSIONS + LZMA_FILE_EXTENSIONS)
 
+REMOTE_HTTP_MIN_READ_SIZE = 50 * 1024 * 1024  # 50 MB
+
 StrBytesPathLike = Union[str, bytes, os.PathLike]
 MatchCallback = Callable[[str], bool]
 
@@ -407,6 +411,102 @@ class ArchiveType:
   TAR_LZMA = 3
 
 
+@dataclasses.dataclass
+class CacheBlock:
+  """Represents a cache entry for the HttpZipFile."""
+  start: int
+  end: int
+  content: Sequence[bytes]
+
+
+class HttpZipFile:
+  """This class is a very simple file-object representation of a file over HTTP.
+  It uses the 'Accept-Ranges' feature of HTTP to fetch parts (or all) of the
+  file.
+  """
+
+  @staticmethod
+  def is_uri_compatible(uri: str) -> bool:
+    try:
+      res = request.urlopen(request.Request(uri, method="HEAD"))
+      return res.getheader('Accept-Ranges') is not None
+    except:
+      return False
+
+  def __init__(self, uri):
+    self.uri = uri
+    resp = request.urlopen(request.Request(self.uri, method="HEAD"))
+    self.file_size = int(resp.getheader('Content-Length', default=0))
+    self._current_block = CacheBlock(0, 0, [])
+    self._pos = 0
+    assert resp.getheader('Accept-Ranges') is not None
+
+  def seekable(self) -> bool:
+    """Whether this is seekable.
+    """
+    return True
+
+  def seek(self, offset: int, from_what: int = 0):
+    """Provides a seek implementation.
+
+    Args:
+        offset: the offset
+        from_what: from where the offset should be computed. Defaults to 0.
+    """
+    if from_what == 0:
+      self._pos = offset
+    elif from_what == 1:
+      self._pos = self._pos + offset
+    else:
+      self._pos = self.file_size + offset
+    if self._pos > self.file_size:
+      self._pos = self.file_size
+    self._pos = max(self._pos, 0)
+
+  def tell(self) -> int:
+    """Provides a tell implementation. Returns the current cursor position.
+
+    Returns:
+        the current cursor position.
+    """
+    return self._pos
+
+  def _fetch_from_http(self, start, end) -> Sequence[bytes]:
+    req = request.Request(
+        self.uri, method="GET", headers={'Range': f'bytes={start}-{end}'})
+    resp = request.urlopen(req)
+    return resp.read()
+
+  def _fetch_from_cache(self, start, end) -> Sequence[bytes]:
+    if self._current_block.start > start or self._current_block.end < end:
+      read_ahead_end = min(self.file_size - 1, end + REMOTE_HTTP_MIN_READ_SIZE)
+      self._current_block = CacheBlock(
+          start, read_ahead_end, self._fetch_from_http(start, read_ahead_end))
+    inner_start = start - self._current_block.start
+    inner_end = end - self._current_block.start
+    return self._current_block.content[inner_start:inner_end + 1]
+
+  def read(self, size=-1) -> Sequence[bytes]:
+    """Read into this file-object.
+
+    Args:
+        size: the size of the read. If not specified, reads all.
+
+    Returns:
+        a sequence of bytes.
+    """
+    if size == -1:
+      size = self.file_size - self._pos
+    read_size = min(self.file_size - self._pos, size)
+    end_range = self._pos + read_size - 1
+    if read_size > REMOTE_HTTP_MIN_READ_SIZE:
+      content = self._fetch_from_http(self._pos, end_range)
+    else:
+      content = self._fetch_from_cache(self._pos, end_range)
+    self._pos += read_size
+    return content
+
+
 def get_archive_type(archive_path: str) -> ArchiveType:
   """Get the type of the archive.
 

diff --git a/src/clusterfuzz/fuzz/__init__.py b/src/clusterfuzz/fuzz/__init__.py
@@ -42,9 +42,9 @@ def get_engine(name):
   return engine_impl
 
 
-def is_fuzz_target(file_path, file_handle=None):
+def is_fuzz_target(file_path, file_opener=None):
   """Returns whether |file_path| is a fuzz target."""
-  return utils.is_fuzz_target_local(file_path, file_handle)
+  return utils.is_fuzz_target_local(file_path, file_opener)
 
 
 def get_fuzz_targets(directory):