mandiant · williballenthin · Apr 3, 2023 · Apr 3, 2023 · Apr 3, 2023 · Apr 3, 2023
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: true
       matrix:
         include:
-          - os: ubuntu-18.04
+          - os: ubuntu-20.04
             # use old linux so that the shared library versioning is more portable
             artifact_name: capa
             asset_name: linux
@@ -36,7 +36,7 @@ jobs:
         uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
         with:
           python-version: 3.8
-      - if: matrix.os == 'ubuntu-18.04'
+      - if: matrix.os == 'ubuntu-20.04'
         run: sudo apt-get install -y libyaml-dev
       - name: Upgrade pip, setuptools
         run: python -m pip install --upgrade pip setuptools
@@ -65,10 +65,7 @@ jobs:
       matrix:
         include:
           # OSs not already tested above
-          - os: ubuntu-18.04
-            artifact_name: capa
-            asset_name: linux
-          - os: ubuntu-20.04
+          - os: ubuntu-22.04
             artifact_name: capa
             asset_name: linux
           - os: windows-2022

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -74,6 +74,8 @@ jobs:
             python-version: "3.8"
           - os: ubuntu-20.04
             python-version: "3.9"
+          - os: ubuntu-20.04
+            python-version: "3.10"
     steps:
     - name: Checkout capa with submodules
       uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0

diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py
@@ -503,6 +503,23 @@ def needed(self) -> Iterator[str]:
 
             yield read_cstr(strtab, d_val)
 
+    @property
+    def symtab(self) -> Optional[Tuple[Shdr, Shdr]]:
+        """
+        fetch the Shdr of the first SHT_SYMTAB section and the Shdr of its linked strtab, or None.
+        """
+        SHT_SYMTAB = 0x2
+        for shdr in self.section_headers:
+            if shdr.type != SHT_SYMTAB:
+                continue
+
+            # the linked section contains strings referenced by the symtab structures.
+            strtab_shdr = self.parse_section_header(shdr.link)
+
+            return shdr, strtab_shdr
+
+        return None  # no section header of type SHT_SYMTAB was found
+
 
 @dataclass
 class ABITag:
@@ -604,40 +621,63 @@ def abi_tag(self) -> Optional[ABITag]:
         return ABITag(os, kmajor, kminor, kpatch)
 
 
+@dataclass
+class Symbol:  # one entry parsed from the ELF symbol table
+    name_offset: int  # st_name: offset of the symbol's name within the linked strtab
+    value: int  # st_value
+    size: int  # st_size
+    info: int  # st_info
+    other: int  # st_other
+    shndx: int  # st_shndx
+
 class SymTab:
-    def __init__(self, endian: str, bitness: int, symtab_buf: bytes, symtab_entsize: int, symtab_sz: int, strtab_buf: bytes, strtab_sz: int) -> None:
-        self.symbols = []
-        self.symnum = int(symtab_sz / symtab_entsize)
-        self.entsize = symtab_entsize
-
-        self.strings = strtab_buf
-        self.strings_sz = strtab_sz
+    def __init__(
+        self,
+        endian: str,
+        bitness: int,
+        symtab: Shdr,
+        strtab: Shdr,
+    ) -> None:
+        self.symbols: List[Symbol] = []
+
+        self.symtab = symtab
+        self.strtab = strtab
 
-        self._parse(endian, bitness, symtab_buf)
+        self._parse(endian, bitness, symtab.buf)
 
     def _parse(self, endian: str, bitness: int, symtab_buf: bytes) -> None:
         """
-        return the symbol's information in 
+        parse each symbol table entry into a Symbol, with fields in
         the order specified by sys/elf32.h
         """
-        for i in range(self.symnum):
+        for i in range(len(self.symtab.buf) // self.symtab.entsize):
             if bitness == 32:
-                name, value, size, info, other, shndx = struct.unpack_from(endian+"IIIBBH", symtab_buf, i*self.entsize)
+                name_offset, value, size, info, other, shndx = struct.unpack_from(
+                    endian + "IIIBBH", symtab_buf, i * self.symtab.entsize
+                )
             elif bitness == 64:
-                name, info, other, shndx, value, size = struct.unpack_from(endian+"IBBBQQ", symtab_buf, i*self.entsize)
+                name_offset, info, other, shndx, value, size = struct.unpack_from(
+                    endian + "IBBHQQ", symtab_buf, i * self.symtab.entsize  # st_shndx is 16 bits wide
+                )
 
-            self.symbols.append((name, value, size, info, other, shndx))
+            self.symbols.append(Symbol(name_offset, value, size, info, other, shndx))
 
-    def fetch_str(self, offset) -> str:
+    def get_name(self, symbol: Symbol) -> str:
         """
-        fetch a symbol's name from symtab's
-        associated strings' section (SHT_STRTAB)
+        fetch a symbol's name from the symtab's linked string table (SHT_STRTAB).
+        raises ValueError when the strtab is missing or the name is not NUL-terminated.
         """
-        for i in range(offset, self.strings_sz):
-            if self.strings[i] == 0:
-                return self.strings[offset:i].decode()
+        if not self.strtab:
+            raise ValueError("no strings found")
 
-    def get_symbols(self) -> Iterator[Tuple[int, int, int, int, int, int]]:
+        end = self.strtab.buf.find(b"\x00", symbol.name_offset, self.strtab.size)
+        if end != -1:  # bounded search: never indexes past the end of a truncated buffer
+            return self.strtab.buf[symbol.name_offset : end].decode("utf-8")
+
+        raise ValueError("symbol name not found")
+
+    def get_symbols(self) -> Iterator[Symbol]:
         """
         return a tuple: (name, value, size, info, other, shndx)
         for each symbol contained in the symbol table
@@ -646,11 +686,11 @@ def get_symbols(self) -> Iterator[Tuple[int, int, int, int, int, int]]:
             yield symbol
 
 
-def guess_os_from_osabi(elf) -> Optional[OS]:
+def guess_os_from_osabi(elf: ELF) -> Optional[OS]:
     return elf.ei_osabi
 
 
-def guess_os_from_ph_notes(elf) -> Optional[OS]:
+def guess_os_from_ph_notes(elf: ELF) -> Optional[OS]:
     # search for PT_NOTE sections that specify an OS
     # for example, on Linux there is a GNU section with minimum kernel version
     PT_NOTE = 0x4
@@ -689,7 +729,7 @@ def guess_os_from_ph_notes(elf) -> Optional[OS]:
     return None
 
 
-def guess_os_from_sh_notes(elf) -> Optional[OS]:
+def guess_os_from_sh_notes(elf: ELF) -> Optional[OS]:
     # search for notes stored in sections that aren't visible in program headers.
     # e.g. .note.Linux in Linux kernel modules.
     SHT_NOTE = 0x7
@@ -722,7 +762,7 @@ def guess_os_from_sh_notes(elf) -> Optional[OS]:
     return None
 
 
-def guess_os_from_linker(elf) -> Optional[OS]:
+def guess_os_from_linker(elf: ELF) -> Optional[OS]:
     # search for recognizable dynamic linkers (interpreters)
     # for example, on linux, we see file paths like: /lib64/ld-linux-x86-64.so.2
     linker = elf.linker
@@ -732,7 +772,7 @@ def guess_os_from_linker(elf) -> Optional[OS]:
     return None
 
 
-def guess_os_from_abi_versions_needed(elf) -> Optional[OS]:
+def guess_os_from_abi_versions_needed(elf: ELF) -> Optional[OS]:
     # then lets look for GLIBC symbol versioning requirements.
     # this will let us guess about linux/hurd in some cases.
 
@@ -763,7 +803,7 @@ def guess_os_from_abi_versions_needed(elf) -> Optional[OS]:
     return None
 
 
-def guess_os_from_needed_dependencies(elf) -> Optional[OS]:
+def guess_os_from_needed_dependencies(elf: ELF) -> Optional[OS]:
     for needed in elf.needed:
         if needed.startswith("libmachuser.so"):
             return OS.HURD
@@ -773,38 +813,30 @@ def guess_os_from_needed_dependencies(elf) -> Optional[OS]:
     return None
 
 
-def guess_os_from_symtab(elf) -> Optional[OS]:
-    SHT_SYMTAB = 0x2
-    SHT_STRTAB = 0x3
-    strtab_buf = symtab_buf = None
-
-    for shdr in elf.section_headers:
-        if shdr.type == SHT_STRTAB:
-            strtab_buf, strtab_sz= shdr.buf, shdr.size
-
-        elif shdr.type == SHT_SYMTAB:
-            symtab_buf, symtab_entsize, symtab_sz = shdr.buf, shdr.entsize, shdr.size
-
-    if None in (strtab_buf, symtab_buf):
+def guess_os_from_symtab(elf: ELF) -> Optional[OS]:
+    shdrs = elf.symtab
+    if not shdrs:
         # executable does not contain a symbol table
         # or the symbol's names are stripped
         return None
-
-    symtab = SymTab(
-        elf.endian, elf.bitness, symtab_buf, symtab_entsize, symtab_sz, strtab_buf, strtab_sz
-    )
+
+    symtab_shdr, strtab_shdr = shdrs
+    symtab = SymTab(elf.endian, elf.bitness, symtab_shdr, strtab_shdr)
 
     keywords = {
-        OS.LINUX: ['linux', '/linux/',],
+        OS.LINUX: [
+            "linux",
+            "/linux/",
+        ],
     }
-    
-    for name, *_ in symtab.get_symbols():
-        sym_name = symtab.fetch_str(name)
+
+    for symbol in symtab.get_symbols():
+        sym_name = symtab.get_name(symbol)
 
         for os, hints in keywords.items():
             if any(map(lambda x: x in sym_name, hints)):
                 return os
-    
+
     return None
 
 
@@ -832,7 +864,7 @@ def detect_elf_os(f) -> str:
     needed_dependencies_guess = guess_os_from_needed_dependencies(elf)
     logger.debug("guess: needed dependencies: %s", needed_dependencies_guess)
 
-    symtab_guess = guess_os_from_symtab(elf)    
+    symtab_guess = guess_os_from_symtab(elf)
     logger.debug("guess: pertinent symbol name: %s", symtab_guess)
 
     ret = None

diff --git a/capa/features/extractors/ida/helpers.py b/capa/features/extractors/ida/helpers.py
@@ -92,7 +92,7 @@ def get_file_imports() -> Dict[int, Tuple[str, str, int]]:
 
         # IDA uses section names for the library of ELF imports, like ".dynsym".
         # These are not useful to us, we may need to expand this list over time
-        # TODO: exhaust this list, see #1419 
+        # TODO: exhaust this list, see #1419
         if library == ".dynsym":
             library = ""
 

diff --git a/capa/main.py b/capa/main.py
@@ -1181,46 +1181,62 @@ def main(argv=None):
             if not (args.verbose or args.vverbose or args.json):
                 logger.debug("file limitation short circuit, won't analyze fully.")
                 return E_FILE_LIMITATION
+
+    # TODO: #1411 use a real type, not a dict here.
+    meta: Dict[str, Any]
+    capabilities: MatchResults
+    counts: Dict[str, Any]
+
     if format_ == FORMAT_RESULT:
+        # result document directly parses into meta, capabilities
         result_doc = capa.render.result_document.ResultDocument.parse_file(args.sample)
         meta, capabilities = result_doc.to_capa()
-    elif format_ == FORMAT_FREEZE:
-        with open(args.sample, "rb") as f:
-            extractor = capa.features.freeze.load(f.read())
+
     else:
-        try:
-            if format_ == FORMAT_PE:
-                sig_paths = get_signatures(args.signatures)
-            else:
-                sig_paths = []
-                logger.debug("skipping library code matching: only have native PE signatures")
-        except IOError as e:
-            logger.error("%s", str(e))
-            return E_INVALID_SIG
+        # all other formats we must create an extractor
+        # and use that to extract meta and capabilities
 
-        should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None)
+        if format_ == FORMAT_FREEZE:
+            # freeze format deserializes directly into an extractor
+            with open(args.sample, "rb") as f:
+                extractor = capa.features.freeze.load(f.read())
+        else:
+            # all other formats we must create an extractor,
+            # such as viv, binary ninja, etc. workspaces
+            # and use those for extracting.
+
+            try:
+                if format_ == FORMAT_PE:
+                    sig_paths = get_signatures(args.signatures)
+                else:
+                    sig_paths = []
+                    logger.debug("skipping library code matching: only have native PE signatures")
+            except IOError as e:
+                logger.error("%s", str(e))
+                return E_INVALID_SIG
+
+            should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None)
+
+            try:
+                extractor = get_extractor(
+                    args.sample,
+                    format_,
+                    args.os,
+                    args.backend,
+                    sig_paths,
+                    should_save_workspace,
+                    disable_progress=args.quiet,
+                )
+            except UnsupportedFormatError:
+                log_unsupported_format_error()
+                return E_INVALID_FILE_TYPE
+            except UnsupportedArchError:
+                log_unsupported_arch_error()
+                return E_INVALID_FILE_ARCH
+            except UnsupportedOSError:
+                log_unsupported_os_error()
+                return E_INVALID_FILE_OS
 
-        try:
-            extractor = get_extractor(
-                args.sample,
-                format_,
-                args.os,
-                args.backend,
-                sig_paths,
-                should_save_workspace,
-                disable_progress=args.quiet,
-            )
-        except UnsupportedFormatError:
-            log_unsupported_format_error()
-            return E_INVALID_FILE_TYPE
-        except UnsupportedArchError:
-            log_unsupported_arch_error()
-            return E_INVALID_FILE_ARCH
-        except UnsupportedOSError:
-            log_unsupported_os_error()
-            return E_INVALID_FILE_OS
-
-    if format_ != FORMAT_RESULT:
         meta = collect_metadata(argv, args.sample, args.format, args.os, args.rules, extractor)
 
         capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet)