Skip to content

Commit

Permalink
Support ARM assembly.
Browse files Browse the repository at this point in the history
  • Loading branch information
yugr committed Mar 19, 2022
1 parent 9c59f8e commit 39d5845
Show file tree
Hide file tree
Showing 8 changed files with 273 additions and 58 deletions.
51 changes: 51 additions & 0 deletions examples/arm.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
00000000 <check_one_fd>:
0: e92d40f0 push {r4, r5, r6, r7, lr}
4: e1a05001 mov r5, r1
8: e24dd06c sub sp, sp, #108 ; 0x6c
c: e3a01001 mov r1, #1
10: e1a04000 mov r4, r0
14: ebfffffe bl 0 <__fcntl64_nocancel>
18: e3700001 cmn r0, #1
1c: 0a000001 beq 28 <check_one_fd+0x28>
20: e28dd06c add sp, sp, #108 ; 0x6c
24: e8bd80f0 pop {r4, r5, r6, r7, pc}
28: ebfffffe bl 0 <__aeabi_read_tp>
2c: e59f307c ldr r3, [pc, #124] ; b0 <check_one_fd+0xb0>
30: e79f3003 ldr r3, [pc, r3]
34: e7903003 ldr r3, [r0, r3]
38: e3530009 cmp r3, #9
3c: 1afffff7 bne 20 <check_one_fd+0x20>
40: e3550902 cmp r5, #32768 ; 0x8000
44: 059f0068 ldreq r0, [pc, #104] ; b4 <check_one_fd+0xb4>
48: 159f0068 ldrne r0, [pc, #104] ; b8 <check_one_fd+0xb8>
4c: e1a01005 mov r1, r5
50: e3a02000 mov r2, #0
54: 03a07000 moveq r7, #0
58: 13a07000 movne r7, #0
5c: 059f6058 ldreq r6, [pc, #88] ; bc <check_one_fd+0xbc>
60: 159f6058 ldrne r6, [pc, #88] ; c0 <check_one_fd+0xc0>
64: ebfffffe bl 0 <__open_nocancel>
68: e1540000 cmp r4, r0
6c: 1a00000e bne ac <check_one_fd+0xac>
70: e1a01004 mov r1, r4
74: e1a0200d mov r2, sp
78: e3a00003 mov r0, #3
7c: ebfffffe bl 0 <__fxstat64>
80: e3500000 cmp r0, #0
84: 1a000008 bne ac <check_one_fd+0xac>
88: e59d3010 ldr r3, [sp, #16]
8c: e2033a0f and r3, r3, #61440 ; 0xf000
90: e3530a02 cmp r3, #8192 ; 0x2000
94: 1a000004 bne ac <check_one_fd+0xac>
98: e28d3020 add r3, sp, #32
9c: e893000c ldm r3, {r2, r3}
a0: e1530007 cmp r3, r7
a4: 01520006 cmpeq r2, r6
a8: 0affffdc beq 20 <check_one_fd+0x20>
ac: e7f000f0 udf #0
b0: 00000078 .word 0x00000078
b4: 0000000c .word 0x0000000c
b8: 00000000 .word 0x00000000
bc: 00000103 .word 0x00000103
c0: 00000107 .word 0x00000107

Binary file added examples/arm.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,4 @@ expected-line-ending-format = LF
include-naming-hint = yes
ignored-modules = gdb # Ignore because of the GDB integration
notes= # disable warnings for TODO, FIXME etc.
disable=missing-function-docstring
disable=missing-function-docstring,no-self-use,too-many-instance-attributes,too-many-arguments,too-many-locals,too-many-branches,too-many-statements
159 changes: 130 additions & 29 deletions src/asm2cfg/asm2cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def escape(instruction):
instruction = instruction.replace('|', r'\|')
instruction = instruction.replace('{', r'\{')
instruction = instruction.replace('}', r'\}')
instruction = instruction.replace(' ', ' ')
return instruction


Expand Down Expand Up @@ -135,7 +136,7 @@ def parse_function_header(line):
if function_name is not None:
return InputFormat.GDB, function_name[1]

memory_range_pattern = re.compile(fr'from ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$')
memory_range_pattern = re.compile(fr'(?:Address range|from) ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$')
memory_range = memory_range_pattern.search(line)
if memory_range is not None:
return InputFormat.GDB, f'{memory_range[1]}-{memory_range[2]}'
Expand Down Expand Up @@ -191,42 +192,109 @@ def __str__(self):
return ' '.join(map(lambda b: f'{b:#x}', self.bites))


class X86TargetInfo:
    """
    Contains instruction info for X86-compatible targets.

    Every predicate takes an instruction object and only reads its
    ``opcode`` attribute. A missing opcode (``None`` or empty string, as
    used by placeholder instructions) is never classified as a
    call/jump/sink instead of raising.
    """

    @staticmethod
    def _opcode(instruction):
        """
        Returns instruction opcode as a (possibly empty) string
        """
        return instruction.opcode or ''

    def comment(self):
        """
        Returns the symbol which starts a trailing comment in disassembly
        """
        return '#'

    def is_call(self, instruction):
        # Various flavors of call:
        #   call   *0x26a16(%rip)
        #   call   0x555555555542
        #   addr32 call 0x55555558add0
        # Substring match is used because opcode may carry a prefix.
        return 'call' in self._opcode(instruction)

    def is_jump(self, instruction):
        # startswith() instead of opcode[0] so that empty opcode is safe
        return self._opcode(instruction).startswith('j')

    def is_unconditional_jump(self, instruction):
        return self._opcode(instruction).startswith('jmp')

    def is_sink(self, instruction):
        """
        Is this an instruction which terminates function execution e.g. return?
        """
        return self._opcode(instruction).startswith('ret')


class ARMTargetInfo:
    """
    Contains instruction info for ARM-compatible targets
    (32-bit ARM and Thumb mnemonics as printed by objdump/GDB).

    Predicates read only the ``opcode``, ``ops`` and ``body`` attributes
    of the instruction. A missing opcode (``None`` or empty string) is
    never classified as a call/jump/sink.
    """

    # Condition code suffixes which may follow a branch mnemonic
    _COND = r'(?:eq|ne|cs|hs|cc|lo|mi|pl|vs|vc|hi|ls|ge|lt|gt|le|al)'
    # bl, blx, blxns and their conditional forms (bleq, blxne, ...).
    # Note that 'blt', 'ble', 'bls', 'blo' do NOT match: what follows
    # 'bl' there is not a complete condition code.
    _CALL_RE = re.compile(fr'^bl(?:x(?:ns)?)?{_COND}?$')
    # b, bx, bxj, bxns and their conditional forms (beq, blt, bxle, ...)
    _JUMP_RE = re.compile(fr'^b(?:x(?:j|ns)?)?{_COND}?$')
    _POP_PC_RE = re.compile(r'\bpop\b.*\bpc\b')

    @staticmethod
    def _mnemonic(instruction):
        """
        Returns opcode without Thumb width suffix ('.n' or '.w'),
        or empty string if instruction has no opcode
        """
        opcode = instruction.opcode or ''
        for suffix in ('.n', '.w'):
            if opcode.endswith(suffix):
                return opcode[:-len(suffix)]
        return opcode

    def comment(self):
        """
        Returns the symbol which starts a trailing comment in disassembly
        """
        return ';'

    def is_call(self, instruction):
        # Various flavors of call:
        #   bl 0x19d90 <_IO_vtable_check>
        # Note that we should be careful to not mix it with conditional
        # branches like 'ble' (or 'ble.n' in Thumb code).
        return self._CALL_RE.match(self._mnemonic(instruction)) is not None

    def is_jump(self, instruction):
        # Matching the exact branch mnemonics (rather than any opcode
        # which starts with 'b') keeps data-processing instructions
        # like 'bic' or 'bfi' from being treated as jumps.
        return self._JUMP_RE.match(self._mnemonic(instruction)) is not None

    def is_unconditional_jump(self, instruction):
        return self._mnemonic(instruction) == 'b'

    def is_sink(self, instruction):
        """
        Is this an instruction which terminates function execution e.g. return?
        Detect various flavors of return like
          bx lr
          pop {r2-r6,pc}
        and the permanently undefined 'udf'.
        Note that we do not consider conditional branches (e.g. 'bxle') to sink.
        Always returns a bool.
        """
        mnemonic = self._mnemonic(instruction)
        if mnemonic == 'udf':
            return True
        if mnemonic == 'bx':
            ops = instruction.ops
            # Guard against missing operands instead of raising IndexError
            return bool(ops) and ops[0] == 'lr'
        return self._POP_PC_RE.search(instruction.body or '') is not None


class Instruction:
"""
Represents a single assembly instruction with it operands, location and
optional branch target
"""
def __init__(self, body, text, lineno, address, opcode, ops, target, imm): # pylint: disable=too-many-arguments
def __init__(self, body, text, lineno, address, opcode, ops, target, imm, target_info): # noqa
self.body = body
self.text = text
self.lineno = lineno
self.address = address
self.opcode = opcode
self.ops = ops
self.target = target
self.info = target_info
if imm is not None and (self.is_jump() or self.is_call()):
if self.target is None:
self.target = imm
else:
self.target.merge(imm)

def is_call(self):
# Various flavors of call:
# call *0x26a16(%rip)
# call 0x555555555542
# addr32 call 0x55555558add0
# TODO: here and elsewhere support other target platforms
return 'call' in self.opcode
return self.info.is_call(self)

def is_jump(self):
return self.opcode[0] == 'j'
return self.info.is_jump(self)

def is_direct_jump(self):
return self.is_jump() and re.match(fr'{HEX_LONG_PATTERN}', self.ops[0])

def is_sink(self):
return self.opcode.startswith('ret')
return self.info.is_sink(self)

# TODO: handle sink instructions like retq
def is_unconditional_jump(self):
return self.opcode.startswith('jmp')
return self.info.is_unconditional_jump(self)

def __str__(self):
result = f'{self.address}: {self.opcode}'
Expand All @@ -246,27 +314,41 @@ def parse_address(line):
return address, address_match[3]


def split_nth(string, count):
    """
    Splits string to chunks of count characters each, in order.
    The last chunk is shorter if the length of string is not
    a multiple of count; an empty string yields an empty list.

    Raises ValueError if count is not positive (a negative count would
    otherwise silently produce an empty list and drop the input).
    """
    if count < 1:
        raise ValueError(f'chunk size must be positive, got {count!r}')
    return [string[start:start + count] for start in range(0, len(string), count)]


def parse_encoding(line):
    """
    Parses byte encoding of instruction for objdump disassemblies
    e.g. the '31 c0' in
      '16bd3: 31 c0 xor %eax,%eax'
    In addition to X86 supports ARM encoding styles:
      '4: e1a01000 mov r1, r0'
      '50: f7ff fffe bl 0 <__aeabi_dadd>'
      '54: 0002 movs r2, r0'

    Returns a pair of Encoding instance and the remainder of the line,
    or (None, line) with the line untouched if it does not start
    with an encoding.
    """
    # Encoding is separated from assembly mnemonic via tab
    # so only spaces are accepted as separators between chunks
    # to avoid accidentally matching the mnemonic.
    # Each chunk must consist of whole bytes (1 to 4 pairs of hex digits)
    # so that odd-length hex-looking words (e.g. 'add') are not
    # mistaken for encoding.
    enc_match = re.match(r'^\s*((?:(?:[0-9a-f]{2}){1,4} +)+)(.*)', line)
    if enc_match is None:
        return None, line
    bites = []
    for chunk in enc_match[1].split():
        bites.extend(int(byte, 16) for byte in split_nth(chunk, 2))
    return Encoding(bites), enc_match[2]


def parse_body(line):
def parse_body(line, target_info):
"""
Parses instruction body (opcode and operands)
"""
body_match = re.match(r'^\s*([^#<]+)(.*)', line)
comment_symbol = target_info.comment()
body_match = re.match(fr'^\s*([^{comment_symbol}<]+)(.*)', line)
if body_match is None:
return None, None, None, line
body = body_match[1].strip()
Expand All @@ -291,13 +373,21 @@ def parse_target(line):
return address, target_match[3]


def parse_imm(line):
def parse_comment(line, target_info):
"""
Parses optional instruction imm hint
Parses optional instruction comment
"""
imm_match = re.match(fr'^\s*#\s*(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)', line)
if imm_match is None:
comment_symbol = target_info.comment()
comment_match = re.match(fr'^\s*{comment_symbol}\s*(.*)', line)
if comment_match is None:
return None, line
comment = comment_match[1]
imm_match = re.match(fr'^(?:0x)?({HEX_PATTERN})\s*(<.*>)?(.*)', comment)
if imm_match is None:
# If no imm was found, ignore the comment.
# In particular this takes care of useless ARM comments like
# '82: 46c0 nop ; (mov r8, r8)'
return None, ''
abs_addr = int(imm_match[1], 16)
if imm_match[2]:
target, _ = parse_target(imm_match[2])
Expand All @@ -307,7 +397,7 @@ def parse_imm(line):
return target, imm_match[3]


def parse_line(line, lineno, function_name, fmt):
def parse_line(line, lineno, function_name, fmt, target_info):
"""
Parses a single line of assembly to create Instruction instance
"""
Expand All @@ -328,13 +418,13 @@ def parse_line(line, lineno, function_name, fmt):
return encoding

original_line = line
body, opcode, ops, line = parse_body(line)
body, opcode, ops, line = parse_body(line, target_info)
if opcode is None:
return None

target, line = parse_target(line)

imm, line = parse_imm(line)
imm, line = parse_comment(line, target_info)
if line:
# Expecting complete parse
return None
Expand All @@ -345,7 +435,7 @@ def parse_line(line, lineno, function_name, fmt):
if target is not None and target.base is None:
target.base = function_name

return Instruction(body, original_line, lineno, address, opcode, ops, target, imm)
return Instruction(body, original_line, lineno, address, opcode, ops, target, imm, target_info)


class JumpTable:
Expand All @@ -365,7 +455,7 @@ def __init__(self, instructions):

# Iterate over the lines and collect jump targets and branching points.
for inst in instructions:
if inst is None or not inst.is_jump():
if inst is None or not inst.is_direct_jump():
continue

self.abs_sources[inst.address.abs] = inst.target
Expand All @@ -389,7 +479,15 @@ def get_target(self, address):
return None


def parse_lines(lines, skip_calls): # noqa pylint: disable=too-many-locals,too-many-branches,too-many-statements,unused-argument
def parse_lines(lines, skip_calls, target_name): # noqa pylint: disable=unused-argument
if target_name == 'x86':
target_info = X86TargetInfo()
elif target_name == 'arm':
target_info = ARMTargetInfo()
else:
print(f'Unsupported platform {target_name}')
sys.exit(1)

instructions = []
current_function_name = current_format = None
for num, line in enumerate(lines, 1):
Expand All @@ -402,7 +500,7 @@ def parse_lines(lines, skip_calls): # noqa pylint: disable=too-many-locals,too-
current_format = fmt
continue

instruction_or_encoding = parse_line(line, num, current_function_name, current_format)
instruction_or_encoding = parse_line(line, num, current_function_name, current_format, target_info)
if isinstance(instruction_or_encoding, Encoding):
# Partial encoding for previous instruction, skip it
continue
Expand All @@ -413,13 +511,16 @@ def parse_lines(lines, skip_calls): # noqa pylint: disable=too-many-locals,too-
if line.startswith('End of assembler dump') or not line:
continue

if line.strip() == '':
continue

print(f'Unexpected assembly at line {num}:\n {line}')
sys.exit(1)

# Infer target address for jump instructions
for instruction in instructions:
if (instruction.target is None or instruction.target.abs is None) \
and instruction.is_jump():
and instruction.is_direct_jump():
if instruction.target is None:
instruction.target = Address(0)
instruction.target.abs = int(instruction.ops[0], 16)
Expand Down Expand Up @@ -497,7 +598,7 @@ def parse_lines(lines, skip_calls): # noqa pylint: disable=too-many-locals,too-
# If last instruction of the function is jump/call, then add dummy
# block to designate end of the function.
end_block = BasicBlock('end_of_function')
dummy_instruction = Instruction('', 'end of function', 0, None, None, [], None, None)
dummy_instruction = Instruction('', 'end of function', 0, None, None, [], None, None, target_info)
end_block.add_instruction(dummy_instruction)
previous_jump_block.add_no_jump_edge(end_block.key)
basic_blocks[end_block.key] = end_block
Expand Down
4 changes: 3 additions & 1 deletion src/asm2cfg/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@ def main():
help='File to contain one function assembly dump')
parser.add_argument('-c', '--skip-calls', action='store_true',
help='Skip function calls from dividing code to blocks')
parser.add_argument('--target', choices=['x86', 'arm'], default='x86',
help='Specify target platform for assembly')
parser.add_argument('-v', '--view', action='store_true',
help='View as a dot graph instead of saving to a file')
args = parser.parse_args()
print('If function CFG rendering takes too long, try to skip function calls with -c flag')
lines = asm2cfg.read_lines(args.assembly_file)
function_name, basic_blocks = asm2cfg.parse_lines(lines, args.skip_calls)
function_name, basic_blocks = asm2cfg.parse_lines(lines, args.skip_calls, args.target)
asm2cfg.draw_cfg(function_name, basic_blocks, args.view)
Loading

0 comments on commit 39d5845

Please sign in to comment.