From 7fd161492411aaea2174322fc4b237384daf2378 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Fri, 16 Feb 2024 16:52:27 -0800 Subject: [PATCH 1/7] Add name section and object symbol table support to emsymbolizer With https://github.com/llvm/llvm-project/pull/82083, llvm-symbolizer works correctly with name sections, so emsymbolizer can use it. Also do the same for object files with symbol tables. --- emsymbolizer.py | 50 +++++++++++++++++++++++++++------------------- test/test_other.py | 24 ++++++++++++++++++++++ 2 files changed, 54 insertions(+), 20 deletions(-) diff --git a/emsymbolizer.py b/emsymbolizer.py index 398d36b6bbd8..264658b2c3e3 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -8,8 +8,10 @@ # line/column number, potentially including inlining. # If the wasm has separate DWARF info, do the above with the side file # If there is a source map, we can parse it to get file and line number. -# If there is an emscripten symbol map, we can parse that to get the symbol name -# If there is a name section or symbol table, llvm-nm can show the symbol name. +# If there is an emscripten symbol map, we can use that to get the symbol name +# If there is a name section or symbol table, llvm-symbolizer can show the +# symbol name. +# Separate DWARF and emscripten symbol maps are not supported yet. import argparse import json @@ -50,21 +52,27 @@ def get_codesec_offset(module): def has_debug_line_section(module): - for sec in module.sections(): - if sec.name == ".debug_line": - return True - return False + return module.get_custom_section('.debug_line') is not None + + +def has_name_section(module): + return module.get_custom_section('name') is not None + +def has_linking_section(module): + return module.get_custom_section('linking') is not None -def symbolize_address_dwarf(module, address): - vma_adjust = get_codesec_offset(module) + +def symbolize_address_symbolizer(module, address, is_dwarf=False): + vma_adjust = get_codesec_offset(module) if is_dwarf else 0 cmd = [LLVM_SYMBOLIZER, '-e', module.filename, f'--adjust-vma={vma_adjust}', str(address)] out = shared.run_process(cmd, stdout=subprocess.PIPE).stdout.strip() out_lines = out.splitlines() + # Source location regex, e.g., /abc/def.c:3:5 SOURCE_LOC_RE = re.compile(r'(.+):(\d+):(\d+)$') - # llvm-dwarfdump prints two lines per location. The first line contains a + # llvm-symbolizer prints two lines per location. The first line contains a # function name, and the second contains a source location like # '/abc/def.c:3:5'. If the function or source info is not available, it will # be printed as '??', in which case we store None. If the line and column info @@ -210,22 +218,23 @@ def main(args): with webassembly.Module(args.wasm_file) as module: base = 16 if args.address.lower().startswith('0x') else 10 address = int(args.address, base) - symbolized = 0 if args.addrtype == 'code': address += get_codesec_offset(module) if ((has_debug_line_section(module) and not args.source) or - 'dwarf' in args.source): - symbolize_address_dwarf(module, address) - symbolized += 1 - - if ((get_sourceMappingURL_section(module) and not args.source) or - 'sourcemap' in args.source): + 'dwarf' in args.source): + symbolize_address_symbolizer(module, address, is_dwarf=True) + elif ((get_sourceMappingURL_section(module) and not args.source) or + 'sourcemap' in args.source): symbolize_address_sourcemap(module, address, args.file) - symbolized += 1 - - if not symbolized: + elif ((has_name_section(module) and not args.source) or + 'names' in args.source): + symbolize_address_symbolizer(module, address, is_dwarf=False) + elif ((has_linking_section(module) and not args.source) or + 'symtab' in args.source): + symbolize_address_symbolizer(module, address, is_dwarf=False) + else: raise Error('No .debug_line or sourceMappingURL section found in ' f'{module.filename}.' " I don't know how to symbolize this file yet") @@ -233,7 +242,8 @@ def main(args): def get_args(): parser = argparse.ArgumentParser() - parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap'], + parser.add_argument('-s', '--source', choices=['dwarf', 'sourcemap', + 'names', 'symtab'], help='Force debug info source type', default=()) parser.add_argument('-f', '--file', action='store', help='Force debug info source file') diff --git a/test/test_other.py b/test/test_other.py index 6977575295fd..7b29ee15e656 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9753,6 +9753,12 @@ def check_source_map_loc_info(address, loc): stdout=PIPE).stdout self.assertIn(loc, out) + def check_func_info(address, func): + out = self.run_process( + [emsymbolizer, 'test_dwarf.wasm', address], stdout=PIPE).stdout + print(out) + self.assertIn(func, out) + # Runs llvm-objdump to get the address of the first occurrence of the # specified line within the given function. llvm-objdump's output format # example is as follows: @@ -9825,6 +9831,24 @@ def get_addr(text): out_to_js_call_loc) check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) + # 4. Test name section only + self.run_process([emstrip, '--strip-debug', 'test_dwarf.wasm']) + with webassembly.Module('test_dwarf.wasm') as wasm: + self.assertTrue(wasm.has_name_section()) + self.assertIsNone(wasm.get_custom_section('.debug_info')) + check_func_info(out_to_js_call_addr, out_to_js_call_func[0]) + # The name section will not reflect bar being inlined into main + check_func_info(unreachable_addr, '__original_main') + + # 5. Test an object file with a symbol table + self.run_process([EMCC, test_file('core/test_dwarf.c'), + '-O1', '-c', '-o', 'test_dwarf.o']) + # The code addresses will be different in the object file (section offsets) + out_to_js_call_addr_obj = get_addr('call\t0') + unreachable_addr_obj = get_addr('unreachable') + check_func_info(out_to_js_call_addr_obj, out_to_js_call_func[0]) + check_func_info(unreachable_addr_obj, '__original_main') + def test_separate_dwarf(self): self.run_process([EMCC, test_file('hello_world.c'), '-g']) self.assertExists('a.out.wasm') From 95c8b8fd500a81e9cf06a319d62ad91213a14d0c Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Tue, 20 Feb 2024 14:30:08 -0800 Subject: [PATCH 2/7] review comment --- emsymbolizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/emsymbolizer.py b/emsymbolizer.py index 264658b2c3e3..7974667178a0 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -64,7 +64,10 @@ def has_linking_section(module): def symbolize_address_symbolizer(module, address, is_dwarf=False): - vma_adjust = get_codesec_offset(module) if is_dwarf else 0 + if is_dwarf: + vma_adjust = 0 + else: + vma_adjust = get_codesec_offset(module) cmd = [LLVM_SYMBOLIZER, '-e', module.filename, f'--adjust-vma={vma_adjust}', str(address)] out = shared.run_process(cmd, stdout=subprocess.PIPE).stdout.strip() From 53571cb5e57dacee49c37ae252dc5b490102aa27 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 22 Feb 2024 13:02:59 -0800 Subject: [PATCH 3/7] remove objfile test --- emsymbolizer.py | 4 +- test/test_other.py | 94 ++++++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/emsymbolizer.py b/emsymbolizer.py index 7974667178a0..75c1e892cd0e 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -65,9 +65,9 @@ def has_linking_section(module): def symbolize_address_symbolizer(module, address, is_dwarf=False): if is_dwarf: - vma_adjust = 0 - else: vma_adjust = get_codesec_offset(module) + else: + vma_adjust = 0 cmd = [LLVM_SYMBOLIZER, '-e', module.filename, f'--adjust-vma={vma_adjust}', str(address)] out = shared.run_process(cmd, stdout=subprocess.PIPE).stdout.strip() diff --git a/test/test_other.py b/test/test_other.py index 7b29ee15e656..24ad7b9f9b66 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9736,7 +9736,31 @@ def test(dump_file): test('foo.wasm.dump') test('bar.wasm.dump') - def test_emsymbolizer(self): + # Runs llvm-objdump to get the address of the first occurrence of the + # specified line within the given function. llvm-objdump's output format + # example is as follows: + # ... + # 00000004 : + # ... + # 6: 41 00 i32.const 0 + # ... + # The addresses here are the offsets to the start of the file. Returns + # the address string in hexadecimal. + def get_instr_addr(self, text, filename): + out = self.run_process([common.LLVM_OBJDUMP, '-d', filename], + stdout=PIPE).stdout.strip() + out_lines = out.splitlines() + found = False + for line in out_lines: + if text in line: + offset = line.strip().split(':')[0] + found = True + break + assert found + return '0x' + offset + + def test_emsymbolizer_srcloc(self): + # Test emsymbolizer use cases that provide src location granularity info def check_dwarf_loc_info(address, funcs, locs): out = self.run_process( [emsymbolizer, '-s', 'dwarf', 'test_dwarf.wasm', address], @@ -9748,51 +9772,19 @@ def check_dwarf_loc_info(address, funcs, locs): def check_source_map_loc_info(address, loc): out = self.run_process( - [emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', - address], + [emsymbolizer, '-s', 'sourcemap', 'test_dwarf.wasm', address], stdout=PIPE).stdout self.assertIn(loc, out) - def check_func_info(address, func): - out = self.run_process( - [emsymbolizer, 'test_dwarf.wasm', address], stdout=PIPE).stdout - print(out) - self.assertIn(func, out) - - # Runs llvm-objdump to get the address of the first occurrence of the - # specified line within the given function. llvm-objdump's output format - # example is as follows: - # ... - # 00000004 : - # ... - # 6: 41 00 i32.const 0 - # ... - # The addresses here are the offsets to the start of the file. Returns - # the address string in hexadecimal. - def get_addr(text): - out = self.run_process([common.LLVM_OBJDUMP, '-d', 'test_dwarf.wasm'], - stdout=PIPE).stdout.strip() - out_lines = out.splitlines() - found = False - for line in out_lines: - if text in line: - offset = line.strip().split(':')[0] - found = True - break - assert found - return '0x' + offset - # We test two locations within test_dwarf.c: # out_to_js(0); // line 6 # __builtin_trap(); // line 13 - - # 1. Test DWARF + source map together self.run_process([EMCC, test_file('core/test_dwarf.c'), '-g', '-gsource-map', '-O1', '-o', 'test_dwarf.js']) # Address of out_to_js(0) within foo(), uninlined - out_to_js_call_addr = get_addr('call\t0') + out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') # Address of __builtin_trap() within bar(), inlined into main() - unreachable_addr = get_addr('unreachable') + unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm') # Function name of out_to_js(0) within foo(), uninlined out_to_js_call_func = ['foo'] @@ -9806,6 +9798,7 @@ def get_addr(text): # The first one corresponds to the innermost inlined location. unreachable_loc = ['test_dwarf.c:13:3', 'test_dwarf.c:18:3'] + # 1. Test DWARF + source map together # For DWARF, we check for the full inlined info for both function names and # source locations. Source maps provide neither function names nor inlined # info. So we only check for the source location of the outermost function. @@ -9831,23 +9824,26 @@ def get_addr(text): out_to_js_call_loc) check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) - # 4. Test name section only - self.run_process([emstrip, '--strip-debug', 'test_dwarf.wasm']) + def test_emsymbolizer_functions(self): + # Test emsymbolizer use cases that only provide function-granularity info + def check_func_info(filename, address, func): + out = self.run_process( + [emsymbolizer, filename, address], stdout=PIPE).stdout + self.assertIn(func, out) + + # 1. Test name section only + self.run_process([EMCC, test_file('core/test_dwarf.c'), + '--profiling-funcs', '-O1', '-o', 'test_dwarf.js']) with webassembly.Module('test_dwarf.wasm') as wasm: self.assertTrue(wasm.has_name_section()) self.assertIsNone(wasm.get_custom_section('.debug_info')) - check_func_info(out_to_js_call_addr, out_to_js_call_func[0]) + # Address of out_to_js(0) within foo(), uninlined + out_to_js_call_addr = self.get_instr_addr('call\t0', 'test_dwarf.wasm') + # Address of __builtin_trap() within bar(), inlined into main() + unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm') + check_func_info('test_dwarf.wasm', out_to_js_call_addr, 'foo') # The name section will not reflect bar being inlined into main - check_func_info(unreachable_addr, '__original_main') - - # 5. Test an object file with a symbol table - self.run_process([EMCC, test_file('core/test_dwarf.c'), - '-O1', '-c', '-o', 'test_dwarf.o']) - # The code addresses will be different in the object file (section offsets) - out_to_js_call_addr_obj = get_addr('call\t0') - unreachable_addr_obj = get_addr('unreachable') - check_func_info(out_to_js_call_addr_obj, out_to_js_call_func[0]) - check_func_info(unreachable_addr_obj, '__original_main') + check_func_info('test_dwarf.wasm', unreachable_addr, '__original_main') def test_separate_dwarf(self): self.run_process([EMCC, test_file('hello_world.c'), '-g']) From 3158e6069f79a1914bc427b616635d5244f45d67 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 22 Feb 2024 13:24:21 -0800 Subject: [PATCH 4/7] Use docstring --- test/test_other.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/test/test_other.py b/test/test_other.py index 24ad7b9f9b66..52872a41d931 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9736,17 +9736,19 @@ def test(dump_file): test('foo.wasm.dump') test('bar.wasm.dump') - # Runs llvm-objdump to get the address of the first occurrence of the - # specified line within the given function. llvm-objdump's output format - # example is as follows: - # ... - # 00000004 : - # ... - # 6: 41 00 i32.const 0 - # ... - # The addresses here are the offsets to the start of the file. Returns - # the address string in hexadecimal. def get_instr_addr(self, text, filename): + ''' + Runs llvm-objdump to get the address of the first occurrence of the + specified line within the given function. llvm-objdump's output format + example is as follows: + ... + 00000004 : + ... + 6: 41 00 i32.const 0 + ... + The addresses here are the offsets to the start of the file. Returns + the address string in hexadecimal. + ''' out = self.run_process([common.LLVM_OBJDUMP, '-d', filename], stdout=PIPE).stdout.strip() out_lines = out.splitlines() @@ -9760,7 +9762,7 @@ def get_instr_addr(self, text, filename): return '0x' + offset def test_emsymbolizer_srcloc(self): - # Test emsymbolizer use cases that provide src location granularity info + 'Test emsymbolizer use cases that provide src location granularity info' def check_dwarf_loc_info(address, funcs, locs): out = self.run_process( [emsymbolizer, '-s', 'dwarf', 'test_dwarf.wasm', address], @@ -9825,7 +9827,7 @@ def check_source_map_loc_info(address, loc): check_dwarf_loc_info(unreachable_addr, unreachable_func, unreachable_loc) def test_emsymbolizer_functions(self): - # Test emsymbolizer use cases that only provide function-granularity info + 'Test emsymbolizer use cases that only provide function-granularity info' def check_func_info(filename, address, func): out = self.run_process( [emsymbolizer, filename, address], stdout=PIPE).stdout From 420f5a4bb500410e708443d7f2e1126ca0483c05 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 22 Feb 2024 14:25:49 -0800 Subject: [PATCH 5/7] suggestions --- emsymbolizer.py | 2 +- test/test_other.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/emsymbolizer.py b/emsymbolizer.py index 75c1e892cd0e..b70a02149cd8 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -63,7 +63,7 @@ def has_linking_section(module): return module.get_custom_section('linking') is not None -def symbolize_address_symbolizer(module, address, is_dwarf=False): +def symbolize_address_symbolizer(module, address, is_dwarf): if is_dwarf: vma_adjust = get_codesec_offset(module) else: diff --git a/test/test_other.py b/test/test_other.py index 52872a41d931..e82e9e411a85 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9844,7 +9844,7 @@ def check_func_info(filename, address, func): # Address of __builtin_trap() within bar(), inlined into main() unreachable_addr = self.get_instr_addr('unreachable', 'test_dwarf.wasm') check_func_info('test_dwarf.wasm', out_to_js_call_addr, 'foo') - # The name section will not reflect bar being inlined into main + # The name section will not show bar, as it's inlined into main check_func_info('test_dwarf.wasm', unreachable_addr, '__original_main') def test_separate_dwarf(self): From 3293dc5b05f82be94e1fa1bd71dd1297e40eed35 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Mon, 26 Feb 2024 14:38:59 -0800 Subject: [PATCH 6/7] flake8 --- emsymbolizer.py | 2 +- test/test_other.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/emsymbolizer.py b/emsymbolizer.py index b70a02149cd8..53643d5776f8 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -226,7 +226,7 @@ def main(args): address += get_codesec_offset(module) if ((has_debug_line_section(module) and not args.source) or - 'dwarf' in args.source): + 'dwarf' in args.source): symbolize_address_symbolizer(module, address, is_dwarf=True) elif ((get_sourceMappingURL_section(module) and not args.source) or 'sourcemap' in args.source): diff --git a/test/test_other.py b/test/test_other.py index e82e9e411a85..ab15815d6c9d 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -9750,7 +9750,7 @@ def get_instr_addr(self, text, filename): the address string in hexadecimal. ''' out = self.run_process([common.LLVM_OBJDUMP, '-d', filename], - stdout=PIPE).stdout.strip() + stdout=PIPE).stdout.strip() out_lines = out.splitlines() found = False for line in out_lines: From 19127808633f75d454a01dea906c412b32c25013 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Tue, 27 Feb 2024 13:44:16 -0800 Subject: [PATCH 7/7] another attempt at flake8 --- emsymbolizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/emsymbolizer.py b/emsymbolizer.py index 53643d5776f8..1a9cf27f25d5 100755 --- a/emsymbolizer.py +++ b/emsymbolizer.py @@ -226,7 +226,7 @@ def main(args): address += get_codesec_offset(module) if ((has_debug_line_section(module) and not args.source) or - 'dwarf' in args.source): + 'dwarf' in args.source): symbolize_address_symbolizer(module, address, is_dwarf=True) elif ((get_sourceMappingURL_section(module) and not args.source) or 'sourcemap' in args.source):