Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[build_manager] add support for remote zip #4263

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 27 additions & 21 deletions src/clusterfuzz/_internal/bot/fuzzers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
# limitations under the License.
"""Fuzzer utils."""

import functools
import os
import re
import stat
import tempfile
from typing import Callable
from typing import Optional

from clusterfuzz._internal.base import utils
from clusterfuzz._internal.metrics import logs
Expand All @@ -30,7 +33,7 @@
EXTRA_BUILD_DIR = '__extra_build'


def is_fuzz_target_local(file_path, file_handle=None):
def is_fuzz_target(file_path, file_opener: Optional[Callable] = None):
"""Returns whether |file_path| is a fuzz target binary (local path)."""
if '@' in file_path:
# GFT targets often have periods in the name that get misinterpreted as an
Expand All @@ -53,7 +56,7 @@ def is_fuzz_target_local(file_path, file_handle=None):
# Ignore files with disallowed extensions (to prevent opening e.g. .zips).
return False

if not file_handle and not os.path.exists(file_path):
if not file_opener and not os.path.exists(file_path):
# Ignore non-existent files for cases when we don't have a file handle.
return False

Expand All @@ -72,24 +75,27 @@ def is_fuzz_target_local(file_path, file_handle=None):
logs.warning('Tried to read from non-regular file: %s.' % file_path)
return False

# Use already provided file handle or open the file.
local_file_handle = file_handle or open(file_path, 'rb')

result = False
for pattern in FUZZ_TARGET_SEARCH_BYTES:
# TODO(metzman): Bound this call so we don't read forever if something went
# wrong.
local_file_handle.seek(0)
result = utils.search_bytes_in_file(pattern, local_file_handle)
if result:
break

if not file_handle:
# If this local file handle is owned by our function, close it now.
# Otherwise, it is caller's responsibility.
local_file_handle.close()

return result
# Either use the file opener or open the file ourselves.
if not file_opener:
file_opener = functools.partial(open, mode='rb')
try:
with file_opener(file_path) as file_handle:
result = False
for pattern in FUZZ_TARGET_SEARCH_BYTES:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note for future work: If searching these bytes takes a long time, we might want to search for them "in parallel" by pushing the pattern list into search_bytes_in_file, which can scan each chunk of the file for all patterns (avoiding the need to re-read the file X times) or even reach for something like Aho-Corasick: https://pyahocorasick.readthedocs.io/en/latest/

# TODO(metzman): Bound this call so we don't read forever if something
# went wrong.
file_handle.seek(0)
result = utils.search_bytes_in_file(pattern, file_handle)
if result:
break

file_handle.close()

return result
except Exception as e:
# In case we could not open the file, we consider it's not a fuzzer.
logs.warning(f'Could not open {file_path}: {e}')
return False
paulsemel marked this conversation as resolved.
Show resolved Hide resolved


def get_fuzz_targets_local(path):
Expand All @@ -103,7 +109,7 @@ def get_fuzz_targets_local(path):
continue

file_path = os.path.join(root, filename)
if is_fuzz_target_local(file_path):
if is_fuzz_target(file_path):
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
fuzz_target_paths.append(file_path)

return fuzz_target_paths
Expand Down
60 changes: 45 additions & 15 deletions src/clusterfuzz/_internal/build_management/build_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,14 +190,10 @@ def list_fuzz_targets(self) -> List[str]:
from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils

for archive_file in self.list_members():
file_content = self.try_open(archive_file.name)
if fuzzer_utils.is_fuzz_target_local(archive_file.name, file_content):
if fuzzer_utils.is_fuzz_target(archive_file.name, self.open):
fuzz_target = fuzzer_utils.normalize_target_name(archive_file.name)
self._fuzz_targets[fuzz_target] = archive_file.name

if file_content:
file_content.close()

return list(self._fuzz_targets.keys())

def unpacked_size(self, fuzz_target: Optional[str] = None) -> int:
Expand Down Expand Up @@ -299,23 +295,19 @@ def get_target_dependencies(
return res


# pylint: disable=redefined-builtin
def open(archive_path: str) -> BuildArchive:
"""Opens the archive and gets the appropriate build archive based on the
`archive_path`. The resulting object is usable as a normal archive reader,
but provides additional feature related to build handling.
def open_with_reader(reader: archive.ArchiveReader) -> BuildArchive:
"""Opens the archive and gets the appropriate build archive based on the
provided archive information.

Args:
archive_path: the path to the archive.
reader: the archive reader.

Raises:
If the file could not be opened or if the archive type cannot be handled.
If the archive reader cannot be handled.

Returns:
the build archive.
The build archive.
"""
reader = archive.open(archive_path)

# Unfortunately, there is no good heuristic for determining which build
# archive implementation to use.
# Hopefully, we can search in the archive whether some files are present and
Expand All @@ -328,3 +320,41 @@ def open(archive_path: str) -> BuildArchive:
if reader.file_exists(args_gn_path):
return ChromeBuildArchive(reader)
return DefaultBuildArchive(reader)


def open(archive_path: str) -> BuildArchive:  # pylint: disable=redefined-builtin
  """Opens a build archive from a local path.

  Convenience wrapper that first opens a generic archive reader for
  `archive_path`, then wraps it in the appropriate `BuildArchive`
  implementation. The result behaves like a normal archive reader while
  adding build-specific helpers.

  Args:
    archive_path: the path to the archive.

  Raises:
    If the file could not be opened or if the archive type cannot be handled.

  Returns:
    The build archive.
  """
  return open_with_reader(archive.open(archive_path))


def open_uri(uri: str) -> BuildArchive:
  """Opens a build archive served over HTTP. This is only compatible with
  chromium as of now.

  The remote zip is accessed through `archive.HttpZipFile`, so the archive
  contents are read on demand instead of being downloaded up front.

  Args:
    uri: the URI pointing to the zip file.

  Returns:
    The build archive.
  """
  http_file = archive.HttpZipFile(uri)
  return open_with_reader(archive.ZipArchiveReader(http_file))


def unzip_over_http_compatible(build_url: str) -> bool:
  """Returns whether the build URL is compatible with unzipping over HTTP.

  Args:
    build_url: the URL of the build archive.

  Returns:
    True if the archive at `build_url` can be read remotely over HTTP.
  """
  compatible = archive.HttpZipFile.is_uri_compatible(build_url)
  return compatible
128 changes: 102 additions & 26 deletions src/clusterfuzz/_internal/build_management/build_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
"""Build manager."""

from collections import namedtuple
import contextlib
import os
import re
import shutil
import subprocess
import time
from typing import Optional

from clusterfuzz._internal.base import errors
from clusterfuzz._internal.base import utils
Expand Down Expand Up @@ -402,23 +404,19 @@ def _post_setup_success(self, update_revision=True):
if instrumented_library_paths:
self._patch_rpaths(instrumented_library_paths)

def _unpack_build(self, base_build_dir, build_dir, build_url):
"""Unpacks a build from a build url into the build directory."""
# Track time taken to unpack builds so that it doesn't silently regress.
start_time = time.time()

logs.info(f'Unpacking build from {build_url} into {build_dir}.')
@contextlib.contextmanager
def _download_and_open_build_archive(self, base_build_dir: str,
build_dir: str, build_url: str):
"""Downloads the build archive at `build_url` and opens it.

# Free up memory.
utils.python_gc()

# Remove the current build.
logs.info(f'Removing build directory {build_dir}.')
if not shell.remove_directory(build_dir, recreate=True):
logs.error(f'Unable to clear build directory {build_dir}.')
_handle_unrecoverable_error_on_windows()
return False
Args:
base_build_dir: the base build directory
build_dir: the current build directory
build_url: the build URL

Yields:
the build archive
"""
# Download build archive locally.
build_local_archive = os.path.join(build_dir, os.path.basename(build_url))

Expand All @@ -431,15 +429,83 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
'Failed to make space for download. '
'Cleared all data directories to free up space, exiting.')

logs.info(f'Downloading build from {build_url}.')
logs.info(f'Downloading build from {build_url} to {build_local_archive}.')
try:
storage.copy_file_from(build_url, build_local_archive)
except Exception as e:
logs.error(f'Unable to download build from {build_url}: {e}')
return False
raise

try:
with build_archive.open(build_local_archive) as build:
yield build
finally:
shell.remove_file(build_local_archive)

def _open_build_archive(self, base_build_dir: str, build_dir: str,
                        build_url: str, http_build_url: Optional[str],
                        unpack_everything: Optional[bool]):
  """Gets a handle on a build archive for the current build.

  Depending on the provided parameters, this either uses the remote HTTP
  archive directly (no download) or downloads the build archive into the
  build directory first.

  Args:
    base_build_dir: the base build directory.
    build_dir: the current build directory.
    build_url: the build URL.
    http_build_url: the HTTP build URL.
    unpack_everything: whether we should unpack the whole archive or try
      selective unpacking.

  Raises:
    if an error occurred while accessing the file over HTTP or while
    downloading the file on disk.

  Returns:
    the build archive.
  """
  # Remote unzipping is only worthwhile when we unpack selectively; it must
  # also be explicitly enabled and the HTTP URL must support it.
  remote_allowed = environment.get_value(
      'ALLOW_UNPACK_OVER_HTTP', default_value=False)
  use_remote_zip = (
      remote_allowed and not unpack_everything and http_build_url and
      build_archive.unzip_over_http_compatible(http_build_url))

  if use_remote_zip:
    logs.info("Opening an archive over HTTP, skipping archive download.")
    assert http_build_url
    return build_archive.open_uri(http_build_url)

  return self._download_and_open_build_archive(base_build_dir, build_dir,
                                               build_url)

def _unpack_build(self,
base_build_dir,
build_dir,
build_url,
http_build_url=None):
"""Unpacks a build from a build url into the build directory."""
# Track time taken to unpack builds so that it doesn't silently regress.
start_time = time.time()

unpack_everything = environment.get_value(
'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')

logs.info(f'Unpacking build from {build_url} into {build_dir}.')

# Free up memory.
utils.python_gc()

# Remove the current build.
logs.info(f'Removing build directory {build_dir}.')
if not shell.remove_directory(build_dir, recreate=True):
logs.error(f'Unable to clear build directory {build_dir}.')
_handle_unrecoverable_error_on_windows()
return False

try:
with self._open_build_archive(base_build_dir, build_dir, build_url,
http_build_url, unpack_everything) as build:
unpack_everything = environment.get_value(
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
'UNPACK_ALL_FUZZ_TARGETS_AND_FILES')

Expand All @@ -463,8 +529,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
'Cleared all data directories to free up space, exiting.')

# Unpack the local build archive.
logs.info(
f'Unpacking build archive {build_local_archive} to {build_dir}.')
logs.info(f'Unpacking build archive {build_url} to {build_dir}.')
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
trusted = not utils.is_oss_fuzz()

build.unpack(
Expand All @@ -473,7 +538,7 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
trusted=trusted)

except Exception as e:
logs.error(f'Unable to unpack build archive {build_local_archive}: {e}')
logs.error(f'Unable to unpack build archive {build_url}: {e}')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this pass linting? I had the linter recently tell me to use lazy string interpolation for logging calls:

logs.error('Unable to unpack build archive %s: %s', build_url, e)

Maybe because this uses the logs module you're safe.

return False

if unpack_everything:
Expand All @@ -484,9 +549,6 @@ def _unpack_build(self, base_build_dir, build_dir, build_url):
partial_build_file_path = os.path.join(build_dir, PARTIAL_BUILD_FILE)
utils.write_data_to_file('', partial_build_file_path)

# No point in keeping the archive around.
shell.remove_file(build_local_archive)

elapsed_time = time.time() - start_time
elapsed_mins = elapsed_time / 60.
log_func = logs.warning if elapsed_time > UNPACK_TIME_LIMIT else logs.info
Expand Down Expand Up @@ -605,10 +667,20 @@ def __init__(self,
revision,
build_url,
build_prefix='',
fuzz_target=None):
fuzz_target=None,
http_build_url=None):
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
"""RegularBuild constructor. See Build constructor for other parameters.

Args:
http_build_url: the http build URL. E.g.
http://storage.com/foo/bar.zip. Defaults to None.
build_url: the GCS bucket URL where the build is stored. E.g.
gs://foo/bar.zip.
"""
super().__init__(
base_build_dir, revision, build_prefix, fuzz_target=fuzz_target)
self.build_url = build_url
self.http_build_url = http_build_url

if build_prefix:
self.build_dir_name = build_prefix.lower()
Expand All @@ -630,7 +702,7 @@ def setup(self):
build_update = not self.exists()
if build_update:
if not self._unpack_build(self.base_build_dir, self.build_dir,
self.build_url):
self.build_url, self.http_build_url):
return False

logs.info('Retrieved build r%d.' % self.revision)
Expand Down Expand Up @@ -1116,6 +1188,9 @@ def setup_regular_build(revision,

return None

# build_url points to a GCP bucket, and we're only converting it to its HTTP
# endpoint so that we can use remote unzipping.
http_build_url = build_url.replace('gs://', 'https://storage.googleapis.com/')
paulsemel marked this conversation as resolved.
Show resolved Hide resolved
base_build_dir = _base_build_dir(bucket_path)

build_class = RegularBuild
Expand All @@ -1133,7 +1208,8 @@ def setup_regular_build(revision,
revision,
build_url,
build_prefix=build_prefix,
fuzz_target=fuzz_target)
fuzz_target=fuzz_target,
http_build_url=http_build_url)
if build.setup():
result = build
else:
Expand Down
Loading
Loading