Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM: add support for arm architecture #1796

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/pyinstaller/hooks/hook-vivisect.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
"vivisect.analysis.amd64",
"vivisect.analysis.amd64.emulation",
"vivisect.analysis.amd64.golang",
"vivisect.analysis.arm",
"vivisect.analysis.arm.emulation",
"vivisect.analysis.arm.renaming",
"vivisect.analysis.arm.thunk_reg",
"vivisect.analysis.crypto",
"vivisect.analysis.crypto.constants",
"vivisect.analysis.elf",
Expand Down Expand Up @@ -76,6 +80,7 @@
"vivisect.analysis.ms.vftables",
"vivisect.analysis.pe",
"vivisect.impapi.posix.amd64",
"vivisect.impapi.posix.arm",
"vivisect.impapi.posix.i386",
"vivisect.impapi.windows",
"vivisect.impapi.windows.advapi_32",
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
- ghidra: add Ghidra feature extractor and supporting code #1770 @colton-gabertan
- ghidra: add entry script helping users run capa against a loaded Ghidra database #1767 @mike-hunhoff
- binja: add support for symtab names #1504 @xusheng6
- arm: add support for the ARM architecture
- ELF: improve the ELF stripper
- ELF: improve analysis of statically linked ELF files

### Breaking Changes

Expand Down
3 changes: 2 additions & 1 deletion capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,10 @@ def get_value_str(self):
# other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types
ARCH_I386 = "i386"
ARCH_AMD64 = "amd64"
ARCH_ARM = "ARM"
# dotnet
ARCH_ANY = "any"
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY)
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ARM, ARCH_ANY)


class Arch(Feature):
Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/elf.py
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,7 @@ def detect_elf_os(f) -> str:
elif symtab_guess:
ret = symtab_guess

return ret.value if ret is not None else "unknown"
return ret.value if ret is not None else "linux"


def detect_elf_arch(f: BinaryIO) -> str:
Expand Down
2 changes: 2 additions & 0 deletions capa/features/extractors/elffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ def extract_file_arch(elf: ELFFile, **kwargs):
yield Arch("i386"), NO_ADDRESS
elif arch == "x64":
yield Arch("amd64"), NO_ADDRESS
elif arch == "ARM":
yield Arch("ARM"), NO_ADDRESS
else:
logger.warning("unsupported architecture: %s", arch)

Expand Down
20 changes: 13 additions & 7 deletions capa/features/extractors/viv/basicblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

import string
import struct
from typing import Tuple, Iterator
from typing import Tuple, Union, Iterator

import envi
import envi.archs.arm.disasm
import envi.archs.i386.disasm

from capa.features.common import Feature, Characteristic
Expand Down Expand Up @@ -76,7 +77,7 @@ def extract_stackstring(f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Featu
yield Characteristic("stack string"), bb.address


def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool:
def is_mov_imm_to_stack(instr: Union[envi.archs.i386.disasm.i386Opcode, envi.archs.arm.disasm.ArmOpcode]) -> bool:
"""
Return if instruction moves immediate onto stack
"""
Expand All @@ -92,22 +93,27 @@ def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool:
if not src.isImmed():
return False

if not isinstance(dst, envi.archs.i386.disasm.i386SibOper) and not isinstance(
dst, envi.archs.i386.disasm.i386RegMemOper
if (
not isinstance(dst, envi.archs.i386.disasm.i386SibOper)
and not isinstance(dst, envi.archs.i386.disasm.i386RegMemOper)
and not isinstance(dst, envi.archs.arm.disasm.ArmRegOper)
):
return False

if not dst.reg:
return False

rname = dst._dis_regctx.getRegisterName(dst.reg)
if rname not in ["ebp", "rbp", "esp", "rsp"]:
if isinstance(dst, (envi.archs.i386.disasm.i386SibOper, envi.archs.i386.disasm.i386RegMemOper)):
rname = dst._dis_regctx.getRegisterName(dst.reg)
else:
rname = dst.reg
if rname not in ["ebp", "rbp", "esp", "rsp", envi.archs.arm.disasm.REG_SP, envi.archs.arm.disasm.REG_BP]:
return False

return True


def get_printable_len(oper: envi.archs.i386.disasm.i386ImmOper) -> int:
def get_printable_len(oper: Union[envi.archs.i386.disasm.i386ImmOper, envi.archs.arm.disasm.ArmImmOper]) -> int:
"""
Return string length if all operand bytes are ascii or utf16-le printable
"""
Expand Down
9 changes: 7 additions & 2 deletions capa/features/extractors/viv/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import capa.features.extractors.viv.insn
import capa.features.extractors.viv.global_
import capa.features.extractors.viv.function
import capa.features.extractors.viv.insn_arm
import capa.features.extractors.viv.basicblock
from capa.features.common import Feature
from capa.features.address import Address, AbsoluteVirtualAddress
Expand All @@ -26,10 +27,11 @@


class VivisectFeatureExtractor(FeatureExtractor):
def __init__(self, vw, path: Path, os):
def __init__(self, vw, path: Path, os, arm=False):
super().__init__()
self.vw = vw
self.path = path
self.arm = arm
self.buf = path.read_bytes()

# pre-compute these because we'll yield them at *every* scope.
Expand Down Expand Up @@ -74,7 +76,10 @@ def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHa
def extract_insn_features(
self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih)
if self.arm:
yield from capa.features.extractors.viv.insn_arm.extract_features(fh, bbh, ih)
else:
yield from capa.features.extractors.viv.insn.extract_features(fh, bbh, ih)

def is_library_function(self, addr):
return viv_utils.flirt.is_library_function(self.vw, addr)
Expand Down
2 changes: 1 addition & 1 deletion capa/features/extractors/viv/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def extract_function_loop(fhandle: FunctionHandle) -> Iterator[Tuple[Feature, Ad
bflags & envi.BR_COND
or bflags & envi.BR_FALL
or bflags & envi.BR_TABLE
or bb.instructions[-1].mnem == "jmp"
or bb.instructions[-1].mnem in ["jmp", "b", "bx"]
):
edges.append((bb.va, bva))

Expand Down
5 changes: 4 additions & 1 deletion capa/features/extractors/viv/global_.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import logging
from typing import Tuple, Iterator

from capa.features.common import ARCH_I386, ARCH_AMD64, Arch, Feature
from capa.features.common import ARCH_ARM, ARCH_I386, ARCH_AMD64, Arch, Feature
from capa.features.address import NO_ADDRESS, Address

logger = logging.getLogger(__name__)
Expand All @@ -22,6 +22,9 @@ def extract_arch(vw) -> Iterator[Tuple[Feature, Address]]:
elif arch == "i386":
yield Arch(ARCH_I386), NO_ADDRESS

elif arch == "ARM":
yield Arch(ARCH_ARM), NO_ADDRESS

else:
# we likely end up here:
# 1. handling a new architecture (e.g. aarch64)
Expand Down
17 changes: 17 additions & 0 deletions capa/features/extractors/viv/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# See the License for the specific language governing permissions and limitations under the License.
from typing import Optional

import envi
from vivisect import VivWorkspace
from vivisect.const import XR_TO, REF_CODE

Expand All @@ -21,3 +22,19 @@ def get_coderef_from(vw: VivWorkspace, va: int) -> Optional[int]:
return xrefs[0][XR_TO]
else:
return None


def read_memory(vw, va: int, size: int) -> bytes:
    """
    read up to `size` bytes starting at `va` from the workspace memory maps, ignoring permissions.

    as documented in #176, vivisect will not readMemory() when the section is not marked readable.
    but here, we don't care about permissions.
    so, copy the viv implementation of readMemory and remove the permissions check.

    this is derived from:
      https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462

    args:
      vw: object exposing `_map_defs`, iterated as (start va, end va, map descriptor, bytes) entries.
      va: virtual address at which to start reading.
      size: maximum number of bytes to return.

    returns:
      the slice of the containing map's buffer beginning at `va`;
      slicing may yield fewer than `size` bytes if the buffer ends first.

    raises:
      envi.exc.SegmentationViolation: if no map entry contains `va`.
    """
    for map_start, map_end, descriptor, map_bytes in vw._map_defs:
        if map_start <= va < map_end:
            # the offset is computed against the base address stored in the map descriptor
            # (its first field), exactly as the upstream implementation does.
            offset = va - descriptor[0]
            return map_bytes[offset : offset + size]
    raise envi.exc.SegmentationViolation(va)
58 changes: 53 additions & 5 deletions capa/features/extractors/viv/indirect_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import envi
import vivisect.const
import envi.archs.arm.disasm
import envi.archs.i386.disasm
import envi.archs.amd64.disasm
from vivisect import VivWorkspace
Expand All @@ -20,12 +21,15 @@
i386ImmOper = envi.archs.i386.disasm.i386ImmOper
i386ImmMemOper = envi.archs.i386.disasm.i386ImmMemOper
Amd64RipRelOper = envi.archs.amd64.disasm.Amd64RipRelOper
ARMRegOper = envi.archs.arm.disasm.ArmRegOper
ARMImmOper = envi.archs.arm.disasm.ArmImmOper
ARMScaledOffsetOper = envi.archs.arm.disasm.ArmScaledOffsetOper
LOC_OP = vivisect.const.LOC_OP
IF_NOFALL = envi.IF_NOFALL
REF_CODE = vivisect.const.REF_CODE
FAR_BRANCH_MASK = envi.BR_PROC | envi.BR_DEREF | envi.BR_ARCH

DESTRUCTIVE_MNEMONICS = ("mov", "lea", "pop", "xor")
DESTRUCTIVE_MNEMONICS = ("mov", "lea", "ldr", "pop", "xor", "eor")


def get_previous_instructions(vw: VivWorkspace, va: int) -> List[int]:
Expand Down Expand Up @@ -71,6 +75,38 @@ class NotFoundError(Exception):
pass


def find_value(vw: VivWorkspace, va: int, reg: int, q):
    """
    scan backwards from the given address for a `mov <reg>, #imm` that sets the given ARM register.

    instructions are visited by draining `q`, which is first extended with the
    instructions preceding `va`. each address is handled at most once.
    instructions whose first operand is not the requested register are skipped,
    and the scan continues with their predecessors.

    when the first operand is the requested register:
      - `add`/`sub` with an immediate second operand adjusts a running delta and the scan continues.
      - `mov` with an immediate second operand ends the scan: the immediate plus the delta is returned.
      - anything else ends that path (its predecessors are not queued).

    args:
      vw: the vivisect workspace used to parse opcodes.
      va: the address to start scanning backwards from.
      reg: the register number to resolve.
      q: work queue supporting `extend` and `popleft` (presumably a collections.deque - confirm
         against the caller). it is mutated in place, and entries already present are visited too.

    returns:
      the resolved integer value, or None if the queue drains without finding a `mov` of an immediate.

    NOTE(review): the delta is shared across every path explored, so add/sub instructions
    found on different predecessor branches all contribute to it - confirm this is intended.
    """
    visited = set()  # type: Set[int]
    delta = 0

    q.extend(get_previous_instructions(vw, va))
    while q:
        addr = q.popleft()
        if addr in visited:
            continue
        visited.add(addr)

        insn = vw.parseOpcode(addr)
        operands = insn.opers

        # only instructions whose first operand is the requested register are of interest;
        # for everything else (including operand-less instructions), keep walking backwards.
        targets_reg = bool(operands) and isinstance(operands[0], ARMRegOper) and operands[0].reg == reg
        if not targets_reg:
            q.extend(get_previous_instructions(vw, addr))
            continue

        mnem = insn.mnem
        # the second operand is only inspected for these mnemonics.
        if mnem in ("sub", "add", "mov") and isinstance(operands[1], ARMImmOper):
            imm = operands[1].val
            if mnem == "mov":
                return imm + delta
            delta += imm if mnem == "add" else -imm
            q.extend(get_previous_instructions(vw, addr))

        # any other write to the register: this path is abandoned without queueing predecessors.

    return None


def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[int]]:
"""
scan backwards from the given address looking for assignments to the given register.
Expand Down Expand Up @@ -106,7 +142,9 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[
continue

opnd0 = insn.opers[0]
if not (isinstance(opnd0, i386RegOper) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS):
if not (
isinstance(opnd0, (i386RegOper, ARMRegOper)) and opnd0.reg == reg and insn.mnem in DESTRUCTIVE_MNEMONICS
):
q.extend(get_previous_instructions(vw, cur))
continue

Expand All @@ -115,16 +153,24 @@ def find_definition(vw: VivWorkspace, va: int, reg: int) -> Tuple[int, Optional[
# we currently only support extracting the constant from something like: `mov $reg, IAT`
# so, any other pattern results in an unknown value, represented by None.
# this is a good place to extend in the future, if we need more robust support.
if insn.mnem != "mov":
if insn.mnem not in ("mov", "ldr"):
return (cur, None)
else:
opnd1 = insn.opers[1]
if isinstance(opnd1, i386ImmOper):
if isinstance(opnd1, (i386ImmOper, ARMImmOper)):
return (cur, opnd1.getOperValue(opnd1))
elif isinstance(opnd1, i386ImmMemOper):
return (cur, opnd1.getOperAddr(opnd1))
elif isinstance(opnd1, Amd64RipRelOper):
return (cur, opnd1.getOperAddr(insn))
elif isinstance(opnd1, ARMScaledOffsetOper):
base_reg = find_value(vw, cur, opnd1.base_reg, q)
if base_reg is None:
return (cur, None)
offset_reg = find_value(vw, cur, opnd1.offset_reg, q)
if offset_reg is None:
return (cur, None)
return (cur, base_reg + offset_reg)
else:
# might be something like: `mov $reg, dword_401000[eax]`
return (cur, None)
Expand All @@ -136,7 +182,9 @@ def is_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> bool:
if insn is None:
insn = vw.parseOpcode(va)

return insn.mnem in ("call", "jmp") and isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper)
return insn.mnem in ("call", "jmp", "bl", "blx", "b", "bx") and isinstance(
insn.opers[0], (envi.archs.i386.disasm.i386RegOper, envi.archs.arm.disasm.ArmRegOper)
)


def resolve_indirect_call(vw: VivWorkspace, va: int, insn: envi.Opcode) -> Tuple[int, Optional[int]]:
Expand Down
27 changes: 11 additions & 16 deletions capa/features/extractors/viv/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.elf import SymTab
from capa.features.extractors.viv.helpers import read_memory
from capa.features.extractors.viv.syscall import get_library_function_name
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_indirect_call

Expand Down Expand Up @@ -81,6 +83,15 @@ def extract_insn_api_features(fh: FunctionHandle, bb, ih: InsnHandle) -> Iterato
if f.vw.getFunctionMeta(f.va, "Thunk"):
return

# Added a case for catching basic blocks that contain direct calls to system functions.
if insn.mnem in ("int", "syscall"):
if insn.mnem != "int" or insn.opers[0].imm == 128:
name = get_library_function_name(f.vw, bb)
if name is None:
return
yield API(name), ih.address
return

# traditional call via IAT
if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper):
oper = insn.opers[0]
Expand Down Expand Up @@ -222,22 +233,6 @@ def derefs(vw, p):
p = next


def read_memory(vw, va: int, size: int) -> bytes:
    """
    read up to `size` bytes starting at `va` from the workspace memory maps, ignoring permissions.

    as documented in #176, vivisect will not readMemory() when the section is not marked readable.
    but here, we don't care about permissions.
    so, copy the viv implementation of readMemory and remove the permissions check.

    this is derived from:
      https://github.com/vivisect/vivisect/blob/5eb4d237bddd4069449a6bc094d332ceed6f9a96/envi/memory.py#L453-L462

    args:
      vw: object exposing `_map_defs`, iterated as (start va, end va, map descriptor, bytes) entries.
      va: virtual address at which to start reading.
      size: maximum number of bytes to return.

    returns:
      the slice of the containing map's buffer beginning at `va`;
      slicing may yield fewer than `size` bytes if the buffer ends first.

    raises:
      envi.exc.SegmentationViolation: if no map entry contains `va`.
    """
    for map_start, map_end, descriptor, map_bytes in vw._map_defs:
        if map_start <= va < map_end:
            # the offset is computed against the base address stored in the map descriptor
            # (its first field), exactly as the upstream implementation does.
            offset = va - descriptor[0]
            return map_bytes[offset : offset + size]
    raise envi.exc.SegmentationViolation(va)


def read_bytes(vw, va: int) -> bytes:
"""
read up to MAX_BYTES_FEATURE_SIZE from the given address.
Expand Down
Loading
Loading