Skip to content

Commit

Permalink
Merge pull request #23 from yugr/objdump/2
Browse files Browse the repository at this point in the history
Added support for objdump.
  • Loading branch information
Kazhuu authored Mar 13, 2022
2 parents 7e0c176 + 710de16 commit 9c59f8e
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 48 deletions.
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ time. To have normal behavior again run `set skipcalls off`.
Pip will install `asm2cfg` command-line tool that can be used as a standalone
program with the same functionality as from GDB but with external files.

To use as standalone script you first need to dump assembly from GDB to the file
which is explained below.
To use it as a standalone script, you first need to dump the assembly from GDB
or objdump to a file, as explained below.

### Knowing Function Name

Expand Down Expand Up @@ -158,14 +158,22 @@ gdb -batch -ex 'set breakpoints pending on' -ex 'b test_function' -ex r -ex 'pip
```

(the `set breakpoint pending on` command enables pending breakpoints and
could be added to your `.gdbinit` instead).
could be added to your `.gdbinit` instead)

For functions from main executable it's enough to do

```
gdb -batch -ex 'pipe disassemble test_function | tee test_function.asm' ./test_executable
```

You can also extract a function's disassembly from `objdump` output:

```
objdump -d ./test_executable | sed -ne '/<test_function/,/^$/p' > test_executable.asm
```

(this may be useful for specific non-native targets which lack gdb support).

### Draw CFG

Now you have the assembly file. Time to turn that to CFG. Do that by giving it
Expand Down Expand Up @@ -202,6 +210,9 @@ File `huge.asm` is a large stripped
assembly function and its corresponding output `main.pdf`. This can be used to
test processing time of big functions.

Files `objdump.asm` and `stripped_objdump.asm` are the regular and stripped
objdump-based disassemblies of short functions.

## Development

You want to contribute? You're very welcome to do so! This section will give you
Expand Down
Binary file added examples/_obstack_allocated_p@@Base.pdf
Binary file not shown.
22 changes: 22 additions & 0 deletions examples/objdump.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
0000000000016bb0 <_obstack_allocated_p@@Base>:
16bb0: f3 0f 1e fa endbr64
16bb4: 48 8b 47 08 mov 0x8(%rdi),%rax
16bb8: 48 85 c0 test %rax,%rax
16bbb: 74 29 je 16be6 <_obstack_allocated_p@@Base+0x36>
16bbd: 0f 1f 00 nopl (%rax)
16bc0: 48 39 c6 cmp %rax,%rsi
16bc3: 76 05 jbe 16bca <_obstack_allocated_p@@Base+0x1a>
16bc5: 48 39 30 cmp %rsi,(%rax)
16bc8: 73 16 jae 16be0 <_obstack_allocated_p@@Base+0x30>
16bca: 48 8b 40 08 mov 0x8(%rax),%rax
16bce: 48 85 c0 test %rax,%rax
16bd1: 75 ed jne 16bc0 <_obstack_allocated_p@@Base+0x10>
16bd3: 31 c0 xor %eax,%eax
16bd5: c3 retq
16bd6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
16bdd: 00 00 00
16be0: b8 01 00 00 00 mov $0x1,%eax
16be5: c3 retq
16be6: c3 retq
16be7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
16bee: 00 00
14 changes: 14 additions & 0 deletions examples/stripped_objdump.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
0000000000001000 <.text>:
1000: f3 0f 1e fa endbr64
1004: 55 push %rbp
1005: 48 89 e5 mov %rsp,%rbp
1008: 89 7d fc mov %edi,-0x4(%rbp)
100b: 83 7d fc 00 cmpl $0x0,-0x4(%rbp)
100f: 7e 08 jle 0x1019
1011: 8b 45 fc mov -0x4(%rbp),%eax
1014: 83 c0 64 add $0x64,%eax
1017: eb 06 jmp 0x101f
1019: 8b 45 fc mov -0x4(%rbp),%eax
101c: 0f af c0 imul %eax,%eax
101f: 5d pop %rbp
1020: c3 retq
Binary file added examples/stripped_objdump.pdf
Binary file not shown.
101 changes: 86 additions & 15 deletions src/asm2cfg/asm2cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from graphviz import Digraph


# TODO: make this a command-line flag
VERBOSE = 0


Expand Down Expand Up @@ -104,26 +105,42 @@ def read_lines(file_path):
HEX_LONG_PATTERN = r'(?:0x0*)?' + HEX_PATTERN


class InputFormat: # pylint: disable=too-few-public-methods
    """
    An enum which represents various supported input formats

    Implemented as a plain class with string constants; the values are
    compared directly and printed in verbose output.
    """
    # Disassembly dumped from GDB ('Dump of assembler code ...' header)
    GDB = 'GDB'
    # Disassembly taken from objdump output ('<address> <name>:' header)
    OBJDUMP = 'OBJDUMP'


def parse_function_header(line):
    """
    Detect a function header line and return its input format and name.

    Matches lines for non-stripped binaries:
    'Dump of assembler code for function test_function:'
    lines for stripped binaries:
    'Dump of assembler code from 0x555555555faf to 0x555555557008:'
    and lines for objdump disassembly:
    '0000000000016bb0 <_obstack_allocated_p@@Base>:'

    Returns a tuple (format, name):
    - format is InputFormat.GDB or InputFormat.OBJDUMP
    - name is the function name, or a 'start-end' memory range string
      for GDB dumps of stripped binaries
    If the line is not a function header, (None, None) is returned so that
    callers can always unpack the result.
    """

    # objdump header is checked first: '<hex address> <symbol>:'
    objdump_name_pattern = re.compile(fr'{HEX_PATTERN} <([a-zA-Z_0-9@.]+)>:')
    function_name = objdump_name_pattern.search(line)
    if function_name is not None:
        return InputFormat.OBJDUMP, function_name[1]

    # GDB header for a binary with symbols
    function_name_pattern = re.compile(r'function (\w+):$')
    function_name = function_name_pattern.search(line)
    if function_name is not None:
        return InputFormat.GDB, function_name[1]

    # GDB header for a stripped binary: only a memory range is known
    memory_range_pattern = re.compile(fr'from ({HEX_LONG_PATTERN}) to ({HEX_LONG_PATTERN}):$')
    memory_range = memory_range_pattern.search(line)
    if memory_range is not None:
        return InputFormat.GDB, f'{memory_range[1]}-{memory_range[2]}'

    return None, None


class Address:
Expand Down Expand Up @@ -158,6 +175,22 @@ def merge(self, other):
self.offset = other.offset


class Encoding:
    """
    Represents a sequence of bytes used for instruction encoding
    e.g. the '31 c0' in
    '16bd3: 31 c0 xor %eax,%eax'
    """
    def __init__(self, bites):
        """
        Store the encoding bytes.

        bites is any iterable of integers. It is copied into a list so
        that size() works for one-shot iterables (e.g. generators) and so
        that later changes to the caller's sequence do not affect this
        object.
        """
        self.bites = list(bites)

    def size(self):
        """
        Return the number of bytes in the encoding
        """
        return len(self.bites)

    def __str__(self):
        # Each byte is printed in '0x..' form, separated by single spaces
        return ' '.join(f'{b:#x}' for b in self.bites)

    def __repr__(self):
        return f'{type(self).__name__}({self.bites!r})'


class Instruction:
"""
Represents a single assembly instruction with it operands, location and
Expand All @@ -182,11 +215,16 @@ def is_call(self):
# call *0x26a16(%rip)
# call 0x555555555542
# addr32 call 0x55555558add0
# TODO: here and elsewhere support other target platforms
return 'call' in self.opcode

def is_jump(self):
    """
    Return True if the opcode starts with 'j' (e.g. 'jmp', 'jne').

    Uses startswith() rather than indexing so that an empty opcode
    yields False instead of raising IndexError.
    """
    return self.opcode.startswith('j')

def is_sink(self):
    """
    Return True if the opcode starts with 'ret' (e.g. 'retq').
    """
    return self.opcode.startswith('ret')

# TODO: handle sink instructions like retq
def is_unconditional_jump(self):
    """
    Return True if the opcode starts with 'jmp'.
    """
    return self.opcode.startswith('jmp')

Expand All @@ -208,6 +246,22 @@ def parse_address(line):
return address, address_match[3]


def parse_encoding(line):
    """
    Parses byte encoding of instruction for objdump disassemblies
    e.g. the '31 c0' in
    '16bd3: 31 c0 xor %eax,%eax'

    Returns a tuple (encoding, rest) where encoding is an Encoding instance
    and rest is the unparsed remainder of the line. If the line does not
    start with encoding bytes, (None, line) is returned.
    """
    # Each byte is two lowercase hex digits followed either by spaces or by
    # the end of the line. Only spaces (not tabs) are accepted as separators
    # because objdump puts a tab before the assembly mnemonic; this keeps
    # the mnemonic out of the match. Accepting end-of-line after the last
    # byte handles continuation lines (bytes only, no mnemonic) whose
    # trailing whitespace has been stripped.
    enc_match = re.match(r'^\s*((?:[0-9a-f]{2}(?: +|$))+)(.*)', line)
    if enc_match is None:
        return None, line
    # split() without arguments tolerates runs of several spaces
    bites = [int(byte, 16) for byte in enc_match[1].split()]
    return Encoding(bites), enc_match[2]


def parse_body(line):
"""
Parses instruction body (opcode and operands)
Expand All @@ -229,11 +283,11 @@ def parse_target(line):
"""
Parses optional instruction branch target hint
"""
target_match = re.match(r'\s*<([a-zA-Z_@0-9]+)([+-][0-9]+)?>(.*)', line)
target_match = re.match(r'\s*<([a-zA-Z_@0-9]+)([+-]0x[0-9a-f]+|[+-][0-9]+)?>(.*)', line)
if target_match is None:
return None, line
offset = target_match[2] or '+0'
address = Address(None, target_match[1], int(offset))
address = Address(None, target_match[1], int(offset, 0))
return address, target_match[3]


Expand All @@ -253,24 +307,33 @@ def parse_imm(line):
return target, imm_match[3]


def parse_line(line, lineno, function_name):
def parse_line(line, lineno, function_name, fmt):
"""
Parses a single line of assembly to create Instruction instance
"""

# Strip GDB prefix and leading whites
if line.startswith('=> '):
# Strip GDB marker
line = line[3:]
line = line.strip()
line = line.lstrip()

address, line = parse_address(line)
if address is None:
return None

if fmt == InputFormat.OBJDUMP:
encoding, line = parse_encoding(line)
if not line:
return encoding

original_line = line
body, opcode, ops, line = parse_body(line)
if opcode is None:
return None

target, line = parse_target(line)

imm, line = parse_imm(line)
if line:
# Expecting complete parse
Expand Down Expand Up @@ -328,17 +391,23 @@ def get_target(self, address):

def parse_lines(lines, skip_calls): # noqa pylint: disable=too-many-locals,too-many-branches,too-many-statements,unused-argument
instructions = []
current_function_name = None
current_function_name = current_format = None
for num, line in enumerate(lines, 1):
function_name = parse_function_header(line)
fmt, function_name = parse_function_header(line)
if function_name is not None:
assert current_function_name is None, 'we handle only one function for now'
if VERBOSE:
print(f'New function {function_name} (format {fmt})')
current_function_name = function_name
current_format = fmt
continue

instruction = parse_line(line, num, current_function_name)
if instruction is not None:
instructions.append(instruction)
instruction_or_encoding = parse_line(line, num, current_function_name, current_format)
if isinstance(instruction_or_encoding, Encoding):
# Partial encoding for previous instruction, skip it
continue
if instruction_or_encoding is not None:
instructions.append(instruction_or_encoding)
continue

if line.startswith('End of assembler dump') or not line:
Expand Down Expand Up @@ -382,7 +451,7 @@ def parse_lines(lines, skip_calls): # noqa pylint: disable=too-many-locals,too-
print(f' {dst}')
print('Absolute branches:')
for src, dst in jump_table.abs_sources.items():
print(f' {src} -> {dst}')
print(f' {src:#x} -> {dst}')
print('Relative branches:')
for src, dst in jump_table.rel_sources.items():
print(f' {src} -> {dst}')
Expand Down Expand Up @@ -421,6 +490,8 @@ def parse_lines(lines, skip_calls): # noqa pylint: disable=too-many-locals,too-
current_basic_block.add_jump_edge(jump_point.abs)
previous_jump_block = None if is_unconditional else current_basic_block
current_basic_block = None
elif instruction.is_sink():
previous_jump_block = current_basic_block = None

if previous_jump_block is not None:
# If last instruction of the function is jump/call, then add dummy
Expand Down
Loading

0 comments on commit 9c59f8e

Please sign in to comment.