Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add name section and object symbol table support to emsymbolizer #21367

Merged
merged 9 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 33 additions & 20 deletions emsymbolizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
# line/column number, potentially including inlining.
# If the wasm has separate DWARF info, do the above with the side file
# If there is a source map, we can parse it to get file and line number.
# If there is an emscripten symbol map, we can parse that to get the symbol name
# If there is a name section or symbol table, llvm-nm can show the symbol name.
# If there is an emscripten symbol map, we can use that to get the symbol name
# If there is a name section or symbol table, llvm-symbolizer can show the
# symbol name.
# Separate DWARF and emscripten symbol maps are not supported yet.

import argparse
import json
Expand Down Expand Up @@ -50,21 +52,30 @@ def get_codesec_offset(module):


def has_debug_line_section(module):
  """Return True if `module` has a custom section named '.debug_line'.

  `module` is expected to expose `get_custom_section(name)`; a result of
  None is treated as "section not present", matching the other has_*_section
  helpers in this file.
  """
  # A single lookup replaces the manual scan over module.sections(); keeping
  # both left the lookup unreachable after the loop's final `return False`.
  return module.get_custom_section('.debug_line') is not None


def has_name_section(module):
  """Return True if `module` has a custom section named 'name'.

  `module` is expected to expose `get_custom_section(name)`; a result of
  None is treated as "section not present".
  """
  return module.get_custom_section('name') is not None


def has_linking_section(module):
  """Return True if `module` has a custom section named 'linking'.

  `module` is expected to expose `get_custom_section(name)`; a result of
  None is treated as "section not present".
  """
  return module.get_custom_section('linking') is not None

def symbolize_address_dwarf(module, address):
vma_adjust = get_codesec_offset(module)

def symbolize_address_symbolizer(module, address, is_dwarf=False):
dschuff marked this conversation as resolved.
Show resolved Hide resolved
if is_dwarf:
vma_adjust = 0
else:
vma_adjust = get_codesec_offset(module)
cmd = [LLVM_SYMBOLIZER, '-e', module.filename, f'--adjust-vma={vma_adjust}',
str(address)]
out = shared.run_process(cmd, stdout=subprocess.PIPE).stdout.strip()
out_lines = out.splitlines()

# Source location regex, e.g., /abc/def.c:3:5
SOURCE_LOC_RE = re.compile(r'(.+):(\d+):(\d+)$')
# llvm-dwarfdump prints two lines per location. The first line contains a
# llvm-symbolizer prints two lines per location. The first line contains a
# function name, and the second contains a source location like
# '/abc/def.c:3:5'. If the function or source info is not available, it will
# be printed as '??', in which case we store None. If the line and column info
Expand Down Expand Up @@ -210,30 +221,32 @@ def main(args):
with webassembly.Module(args.wasm_file) as module:
base = 16 if args.address.lower().startswith('0x') else 10
address = int(args.address, base)
symbolized = 0

if args.addrtype == 'code':
address += get_codesec_offset(module)

if ((has_debug_line_section(module) and not args.source) or
'dwarf' in args.source):
symbolize_address_dwarf(module, address)
symbolized += 1

if ((get_sourceMappingURL_section(module) and not args.source) or
'sourcemap' in args.source):
'dwarf' in args.source):
symbolize_address_symbolizer(module, address, is_dwarf=True)
elif ((get_sourceMappingURL_section(module) and not args.source) or
'sourcemap' in args.source):
symbolize_address_sourcemap(module, address, args.file)
symbolized += 1

if not symbolized:
elif ((has_name_section(module) and not args.source) or
'names' in args.source):
symbolize_address_symbolizer(module, address, is_dwarf=False)
elif ((has_linking_section(module) and not args.source) or
'symtab' in args.source):
symbolize_address_symbolizer(module, address, is_dwarf=False)
else:
raise Error('No .debug_line or sourceMappingURL section found in '
f'{module.filename}.'
" I don't know how to symbolize this file yet")


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap'],
parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap',
'names', 'symtab'],
help='Force debug info source type', default=())
parser.add_argument('-f', '--file', action='store',
help='Force debug info source file')
Expand Down
24 changes: 24 additions & 0 deletions test/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -9753,6 +9753,12 @@ def check_source_map_loc_info(address, loc):
stdout=PIPE).stdout
self.assertIn(loc, out)

def check_func_info(address, func):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there some way you could split this test up or @parameterize it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the tests except the last one I added in this PR share the results of the get_addr calls, which means that if we did that, we'd run objdump more times and do more work overall. Would that be worth it?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have any great ideas about how to simplify, but this test is IMHO way too long already and can fail in way too many ways.

I just tried to read and understand what was going on in this test and found it pretty hard to figure out what was even happening. Perhaps we can find some way to refactor in the future?

I guess in a normal Python unittest setup you would make the emsymbolizer test into its own class with a setUpClass method, but that doesn't really work with the way we run our tests.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually realized that llvm-symbolizer doesn't work correctly for object file symbol tables, so I removed the test for that (that was 5). I split the tests using exact source locations apart from the ones using functions only (currently it's just the name section one, but if we fix object files, it would go there too). There's a little duplication, but tests are a little simpler.

out = self.run_process(
[emsymbolizer, 'test_dwarf.wasm', address], stdout=PIPE).stdout
print(out)
self.assertIn(func, out)

# Runs llvm-objdump to get the address of the first occurrence of the
# specified line within the given function. llvm-objdump's output format
# example is as follows:
Expand Down Expand Up @@ -9825,6 +9831,24 @@ def get_addr(text):
out_to_js_call_loc)
check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc)

# 4. Test name section only
self.run_process([emstrip, '--strip-debug', 'test_dwarf.wasm'])
with webassembly.Module('test_dwarf.wasm') as wasm:
self.assertTrue(wasm.has_name_section())
self.assertIsNone(wasm.get_custom_section('.debug_info'))
check_func_info(out_to_js_call_addr, out_to_js_call_func[0])
# The name section will not reflect bar being inlined into main
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# The name section will not reflect bar being inlined into main
# The name section will reflect bar being inlined into main

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this was intended to mean that DWARF has the inlining info and shows bar, but the name section doesn't. I made it clearer.

check_func_info(unreachable_addr, '__original_main')

# 5. Test an object file with a symbol table
self.run_process([EMCC, test_file('core/test_dwarf.c'),
'-O1', '-c', '-o', 'test_dwarf.o'])
# The code addresses will be different in the object file (section offsets)
out_to_js_call_addr_obj = get_addr('call\t0')
dschuff marked this conversation as resolved.
Show resolved Hide resolved
unreachable_addr_obj = get_addr('unreachable')
check_func_info(out_to_js_call_addr_obj, out_to_js_call_func[0])
check_func_info(unreachable_addr_obj, '__original_main')

def test_separate_dwarf(self):
self.run_process([EMCC, test_file('hello_world.c'), '-g'])
self.assertExists('a.out.wasm')
Expand Down
Loading