Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add name section and object symbol table support to emsymbolizer #21367

Merged
merged 9 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 33 additions & 20 deletions emsymbolizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
# line/column number, potentially including inlining.
# If the wasm has separate DWARF info, do the above with the side file
# If there is a source map, we can parse it to get file and line number.
# If there is an emscripten symbol map, we can parse that to get the symbol name
# If there is a name section or symbol table, llvm-nm can show the symbol name.
# If there is an emscripten symbol map, we can use that to get the symbol name
# If there is a name section or symbol table, llvm-symbolizer can show the
# symbol name.
# Separate DWARF and emscripten symbol maps are not supported yet.

import argparse
import json
Expand Down Expand Up @@ -50,21 +52,30 @@ def get_codesec_offset(module):


def has_debug_line_section(module):
for sec in module.sections():
if sec.name == ".debug_line":
return True
return False
return module.get_custom_section('.debug_line') is not None


def has_name_section(module):
  """Report whether `module` has a 'name' custom section.

  The answer is True when `module.get_custom_section('name')` returns anything
  other than None.
  """
  name_section = module.get_custom_section('name')
  return name_section is not None


def has_linking_section(module):
  """Report whether `module` has a 'linking' custom section.

  The answer is True when `module.get_custom_section('linking')` returns
  anything other than None.
  """
  linking_section = module.get_custom_section('linking')
  return linking_section is not None

def symbolize_address_dwarf(module, address):
vma_adjust = get_codesec_offset(module)

def symbolize_address_symbolizer(module, address, is_dwarf=False):
dschuff marked this conversation as resolved.
Show resolved Hide resolved
if is_dwarf:
vma_adjust = get_codesec_offset(module)
else:
vma_adjust = 0
cmd = [LLVM_SYMBOLIZER, '-e', module.filename, f'--adjust-vma={vma_adjust}',
str(address)]
out = shared.run_process(cmd, stdout=subprocess.PIPE).stdout.strip()
out_lines = out.splitlines()

# Source location regex, e.g., /abc/def.c:3:5
SOURCE_LOC_RE = re.compile(r'(.+):(\d+):(\d+)$')
# llvm-dwarfdump prints two lines per location. The first line contains a
# llvm-symbolizer prints two lines per location. The first line contains a
# function name, and the second contains a source location like
# '/abc/def.c:3:5'. If the function or source info is not available, it will
# be printed as '??', in which case we store None. If the line and column info
Expand Down Expand Up @@ -210,30 +221,32 @@ def main(args):
with webassembly.Module(args.wasm_file) as module:
base = 16 if args.address.lower().startswith('0x') else 10
address = int(args.address, base)
symbolized = 0

if args.addrtype == 'code':
address += get_codesec_offset(module)

if ((has_debug_line_section(module) and not args.source) or
'dwarf' in args.source):
symbolize_address_dwarf(module, address)
symbolized += 1

if ((get_sourceMappingURL_section(module) and not args.source) or
'sourcemap' in args.source):
'dwarf' in args.source):
symbolize_address_symbolizer(module, address, is_dwarf=True)
elif ((get_sourceMappingURL_section(module) and not args.source) or
'sourcemap' in args.source):
symbolize_address_sourcemap(module, address, args.file)
symbolized += 1

if not symbolized:
elif ((has_name_section(module) and not args.source) or
'names' in args.source):
symbolize_address_symbolizer(module, address, is_dwarf=False)
elif ((has_linking_section(module) and not args.source) or
'symtab' in args.source):
symbolize_address_symbolizer(module, address, is_dwarf=False)
else:
raise Error('No .debug_line or sourceMappingURL section found in '
f'{module.filename}.'
" I don't know how to symbolize this file yet")


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap'],
parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap',
'names', 'symtab'],
help='Force debug info source type', default=())
parser.add_argument('-f', '--file', action='store',
help='Force debug info source file')
Expand Down
82 changes: 52 additions & 30 deletions test/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -9736,7 +9736,33 @@ def test(dump_file):
test('foo.wasm.dump')
test('bar.wasm.dump')

def test_emsymbolizer(self):
def get_instr_addr(self, text, filename):
  '''
  Runs `llvm-objdump -d` on `filename` and returns the address of the first
  output line that contains `text`. llvm-objdump's output format example is
  as follows:
    ...
    00000004 <foo>:
    ...
    6: 41 00 i32.const 0
    ...
  The addresses here are offsets from the start of the file. Returns the
  address string in hexadecimal, prefixed with '0x'. Fails the test if no
  line of the disassembly contains `text`.
  '''
  out = self.run_process([common.LLVM_OBJDUMP, '-d', filename],
                         stdout=PIPE).stdout.strip()
  for line in out.splitlines():
    if text in line:
      # The address is whatever precedes the first ':' on the matching line.
      return '0x' + line.strip().split(':')[0]
  # Report what was being searched for; a bare assert gives no diagnostic
  # and is stripped entirely under `python -O`.
  self.fail(f'{text!r} not found in llvm-objdump output for {filename}')

def test_emsymbolizer_srcloc(self):
'Test emsymbolizer use cases that provide src location granularity info'
def check_dwarf_loc_info(address, funcs, locs):
out = self.run_process(
[emsymbolizer, '-s', 'dwarf', 'test_dwarf.wasm', address],
Expand All @@ -9748,45 +9774,19 @@ def check_dwarf_loc_info(address, funcs, locs):

def check_source_map_loc_info(address, loc):
out = self.run_process(
[emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm',
address],
[emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address],
stdout=PIPE).stdout
self.assertIn(loc, out)

# Runs llvm-objdump to get the address of the first occurrence of the
# specified line within the given function. llvm-objdump's output format
# example is as follows:
# ...
# 00000004 <foo>:
# ...
# 6: 41 00 i32.const 0
# ...
# The addresses here are the offsets to the start of the file. Returns
# the address string in hexadecimal.
def get_addr(text):
out = self.run_process([common.LLVM_OBJDUMP, '-d', 'test_dwarf.wasm'],
stdout=PIPE).stdout.strip()
out_lines = out.splitlines()
found = False
for line in out_lines:
if text in line:
offset = line.strip().split(':')[0]
found = True
break
assert found
return '0x' + offset

# We test two locations within test_dwarf.c:
# out_to_js(0); // line 6
# __builtin_trap(); // line 13

# 1. Test DWARF + source map together
self.run_process([EMCC, test_file('core/test_dwarf.c'),
'-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js'])
# Address of out_to_js(0) within foo(), uninlined
out_to_js_call_addr = get_addr('call\t0')
out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm')
# Address of __builtin_trap() within bar(), inlined into main()
unreachable_addr = get_addr('unreachable')
unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm')

# Function name of out_to_js(0) within foo(), uninlined
out_to_js_call_func = ['foo']
Expand All @@ -9800,6 +9800,7 @@ def get_addr(text):
# The first one corresponds to the innermost inlined location.
unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3']

# 1. Test DWARF + source map together
# For DWARF, we check for the full inlined info for both function names and
# source locations. Source maps provide neither function names nor inlined
# info. So we only check for the source location of the outermost function.
Expand All @@ -9825,6 +9826,27 @@ def get_addr(text):
out_to_js_call_loc)
check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc)

def test_emsymbolizer_functions(self):
'Test emsymbolizer use cases that only provide function-granularity info'
def check_func_info(filename, address, func):
out = self.run_process(
[emsymbolizer, filename, address], stdout=PIPE).stdout
self.assertIn(func, out)

# 1. Test name section only
self.run_process([EMCC, test_file('core/test_dwarf.c'),
'--profiling-funcs', '-O1', '-o', 'test_dwarf.js'])
with webassembly.Module('test_dwarf.wasm') as wasm:
self.assertTrue(wasm.has_name_section())
self.assertIsNone(wasm.get_custom_section('.debug_info'))
# Address of out_to_js(0) within foo(), uninlined
out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm')
# Address of __builtin_trap() within bar(), inlined into main()
unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm')
check_func_info('test_dwarf.wasm', out_to_js_call_addr, 'foo')
# The name section will not reflect bar being inlined into main
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# The name section will not reflect bar being inlined into main
# The name section will reflect bar being inlined into main

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this was intended to mean that DWARF has the inlining info and shows bar, but the name section doesn't. I made it clearer.

check_func_info('test_dwarf.wasm', unreachable_addr, '__original_main')

def test_separate_dwarf(self):
self.run_process([EMCC, test_file('hello_world.c'), '-g'])
self.assertExists('a.out.wasm')
Expand Down
Loading