Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve SPU LLVM Precompilation #14570

Merged
merged 4 commits into from
Aug 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 183 additions & 3 deletions rpcs3/Emu/Cell/PPUModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "Emu/VFS.h"

#include "Emu/Cell/PPUOpcodes.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/PPUAnalyser.h"

#include "Emu/Cell/lv2/sys_process.h"
Expand Down Expand Up @@ -1070,12 +1071,186 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment&
return;
}

const bool is_firmware = mod.path.starts_with(vfs::get("/dev_flash/"));

const std::string_view seg_view{ensure(mod.get_ptr<char>(seg.addr)), seg.size};

for (usz i = seg_view.find("\177ELF"); i < seg.size; i = seg_view.find("\177ELF", i + 4))
auto find_first_of_multiple = [](std::string_view data, std::initializer_list<std::string_view> values, usz index)
{
usz pos = umax;

for (std::string_view value : values)
{
if (usz pos0 = data.substr(index, pos - index).find(value); pos0 != umax && pos0 + index < pos)
{
pos = pos0 + index;
}
}

return pos;
};

extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size);

// Search for [stqd lr,0x10(sp)] instruction or ELF file signature, whichever comes first
const std::initializer_list<std::string_view> prefixes = {"\177ELF"sv, "\x24\0\x40\x80"sv};

usz prev_bound = 0;

for (usz i = find_first_of_multiple(seg_view, prefixes, 0); i < seg.size; i = find_first_of_multiple(seg_view, prefixes, utils::align<u32>(i + 1, 4)))
{
const auto elf_header = ensure(mod.get_ptr<u8>(seg.addr + i));

if (i % 4 == 0 && std::memcmp(elf_header, "\x24\0\x40\x80", 4) == 0)
{
bool next = true;
const u32 old_i = i;

for (u32 search = i & -128, tries = 10; tries && search >= prev_bound; tries--, search = utils::sub_saturate<u32>(search, 128))
{
if (seg_view[search] != 0x42 && seg_view[search] != 0x43)
{
continue;
}

const u32 inst1 = read_from_ptr<be_t<u32>>(seg_view, search);
const u32 inst2 = read_from_ptr<be_t<u32>>(seg_view, search + 4);
const u32 inst3 = read_from_ptr<be_t<u32>>(seg_view, search + 8);
const u32 inst4 = read_from_ptr<be_t<u32>>(seg_view, search + 12);

if ((inst1 & 0xfe'00'00'7f) != 0x42000002 || (inst2 & 0xfe'00'00'7f) != 0x42000002 || (inst3 & 0xfe'00'00'7f) != 0x42000002 || (inst4 & 0xfe'00'00'7f) != 0x42000002)
{
continue;
}

ppu_log.success("Found SPURS GUID Pattern at 0x%05x", search + seg.addr);
i = search;
next = false;
break;
}

if (next)
{
continue;
}

std::string_view ls_segment = seg_view.substr(i);

// Bound to a bit less than LS size
ls_segment = ls_segment.substr(0, 0x38000);

for (usz addr_last = 0, valid_count = 0, invalid_count = 0;;)
{
usz instruction = ls_segment.find("\x24\0\x40\x80"sv, addr_last);

if (instruction != umax)
{
if (instruction % 4 != i % 4)
{
// Unaligned, continue
addr_last = instruction + (i % 4 - instruction % 4) % 4;
continue;
}

// FIXME: This seems to terminate SPU code prematurely in some cases
// Likely due to absolute branches
if (spu_thread::is_exec_code(instruction, {reinterpret_cast<const u8*>(ls_segment.data()), ls_segment.size()}, 0))
{
addr_last = instruction + 4;
valid_count++;
invalid_count = 0;
continue;
}

if (invalid_count == 0)
{
// Allow a single case of invalid data
addr_last = instruction + 4;
invalid_count++;
continue;
}

addr_last = instruction;
}

if (addr_last >= 0x80 && valid_count >= 2)
{
const u32 begin = i & -128;
u32 end = std::min<u32>(seg.size, utils::align<u32>(i + addr_last + 256, 128));

u32 guessed_ls_addr = 0;

// Try to guess LS address by observing the pattern for disable/enable interrupts
// ILA R2, PC + 8
// BIE/BID R2

for (u32 found = 0, last_vaddr = 0, it = begin + 16; it < end - 16; it += 4)
{
const u32 inst1 = read_from_ptr<be_t<u32>>(seg_view, it);
const u32 inst2 = read_from_ptr<be_t<u32>>(seg_view, it + 4);
const u32 inst3 = read_from_ptr<be_t<u32>>(seg_view, it + 8);
const u32 inst4 = read_from_ptr<be_t<u32>>(seg_view, it + 12);

if ((inst1 & 0xfe'00'00'7f) == 0x42000002 && (inst2 & 0xfe'00'00'7f) == 0x42000002 && (inst3 & 0xfe'00'00'7f) == 0x42000002 && (inst4 & 0xfe'00'00'7f) == 0x42000002)
{
// SPURS GUID pattern
end = it;
ppu_log.success("Found SPURS GUID Pattern for terminagtor at 0x%05x", end + seg.addr);
break;
}

if ((inst1 >> 7) % 4 == 0 && (inst1 & 0xfe'00'00'7f) == 0x42000002 && (inst2 == 0x35040100 || inst2 == 0x35080100))
{
const u32 addr_inst = (inst1 >> 7) % 0x40000;

if (u32 addr_seg = addr_inst - std::min<u32>(it + 8 - begin, addr_inst))
{
if (last_vaddr != addr_seg)
{
guessed_ls_addr = 0;
found = 0;
}

found++;
last_vaddr = addr_seg;

if (found >= 2)
{
// Good segment address
guessed_ls_addr = last_vaddr;
ppu_log.notice("Found IENABLE/IDSIABLE Pattern at 0x%05x", it + seg.addr);
}
}
}
}

if (guessed_ls_addr)
{
end = begin + std::min<u32>(end - begin, SPU_LS_SIZE - guessed_ls_addr);
}

ppu_log.success("Found valid roaming SPU code at 0x%x..0x%x (guessed_ls_addr=0x%x)", seg.addr + begin, seg.addr + end, guessed_ls_addr);

if (!is_firmware)
{
// Signify that the base address is unknown by passing 0
// NOTE(review): the fallback actually passed below is 0x4000, not 0 - confirm which is intended
utilize_spu_data_segment(guessed_ls_addr ? guessed_ls_addr : 0x4000, seg_view.data() + begin, end - begin);
}

i = std::max<u32>(end, i + 4) - 4;
prev_bound = i + 4;
}
else
{
i = old_i;
}

break;
}

continue;
}

// Try to load SPU image
const spu_exec_object obj(fs::file(elf_header, seg.size - i));

Expand Down Expand Up @@ -1107,7 +1282,7 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment&

if (prog.p_type == 0x1u /* LOAD */ && prog.p_filesz > 0u)
{
if (prog.p_vaddr)
if (prog.p_vaddr && !is_firmware)
{
extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size);

Expand All @@ -1126,11 +1301,13 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment&

if (!name.empty())
{
fmt::append(dump, "\n\tSPUNAME: '%s' (image addr: 0x%x)", name, seg.addr + i);
fmt::append(dump, "\n\tSPUNAME: '%s'", name);
}
}
}

fmt::append(dump, " (image addr: 0x%x, size: 0x%x)", seg.addr + i, obj.highest_offset);

sha1_finish(&sha2, sha1_hash);

// Format patch name
Expand Down Expand Up @@ -1173,6 +1350,9 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu_segment&
{
ppu_loader.success("SPU executable hash: %s (<- %u)%s", hash, applied.size(), dump);
}

i += obj.highest_offset - 4;
prev_bound = i + 4;
}
}

Expand Down
9 changes: 8 additions & 1 deletion rpcs3/Emu/Cell/PPUThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2112,7 +2112,14 @@ void ppu_thread::cpu_task()
#endif
cmd_pop();

ppu_initialize(), spu_cache::initialize();
ppu_initialize();

if (Emu.IsStopped())
{
return;
}

spu_cache::initialize();

#ifdef __APPLE__
pthread_jit_write_protect_np(true);
Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/Cell/RawSPUThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ void spu_load_exec(const spu_exec_object& elf)
spu->status_npc = {SPU_STATUS_RUNNING, elf.header.e_entry};
atomic_storage<u32>::release(spu->pc, elf.header.e_entry);

const auto funcs = spu->discover_functions(spu->ls, umax);
const auto funcs = spu->discover_functions(0, { spu->ls , SPU_LS_SIZE }, true, umax);

for (u32 addr : funcs)
{
Expand Down
49 changes: 31 additions & 18 deletions rpcs3/Emu/Cell/SPURecompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -561,10 +561,7 @@ extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 s

spu_section_data::data_t obj{vaddr, std::move(data)};

std::vector<u8> ls_data(SPU_LS_SIZE);
std::memcpy(ls_data.data() + vaddr, ls_data_vaddr, size);

obj.funcs = spu_thread::discover_functions(ls_data.data(), umax);
obj.funcs = spu_thread::discover_functions(vaddr, { reinterpret_cast<const u8*>(ls_data_vaddr), size }, vaddr != 0, umax);

if (obj.funcs.empty())
{
Expand Down Expand Up @@ -661,7 +658,7 @@ void spu_cache::add(const spu_program& func)
m_file.write_gather(gather, 3);
}

void spu_cache::initialize()
void spu_cache::initialize(bool build_existing_cache)
{
spu_runtime::g_interpreter = spu_runtime::g_gateway;

Expand Down Expand Up @@ -699,15 +696,27 @@ void spu_cache::initialize()
auto data_list = std::move(g_fxo->get<spu_section_data>().data);
g_fxo->get<spu_section_data>().had_been_used = true;

u32 total_precompile = 0;

for (auto& sec : data_list)
{
total_precompile += sec.funcs.size();
}

const bool spu_precompilation_enabled = func_list.empty() && g_cfg.core.spu_cache && g_cfg.core.llvm_precompilation;

if (spu_precompilation_enabled)
{
// What compiles in this case goes straight to disk
g_fxo->get<spu_cache>() = std::move(cache);
}
else if (!build_existing_cache)
{
return;
}
else
{
total_precompile = 0;
data_list.clear();
}

Expand Down Expand Up @@ -752,17 +761,14 @@ void spu_cache::initialize()
thread_ctrl::wait_on(g_progr_ptotal, v);
}

u32 add_count = ::size32(func_list);
const u32 add_count = ::size32(func_list) + total_precompile;

for (auto& sec : data_list)
if (add_count)
{
add_count += sec.funcs.size();
g_progr_ptotal += add_count;
progr.emplace("Building SPU cache...");
}

g_progr_ptotal += add_count;

progr.emplace("Building SPU cache...");

worker_count = rpcs3::utils::get_max_threads();
}

Expand Down Expand Up @@ -954,12 +960,17 @@ void spu_cache::initialize()
return result;
});

u32 built_total = 0;

// Join (implicitly) and print individual results
for (u32 i = 0; i < workers.size(); i++)
{
spu_log.notice("SPU Runtime: Worker %u built %u programs.", i + 1, workers[i]);
built_total += workers[i];
}

spu_log.notice("SPU Runtime: Workers built %u programs.", built_total);

if (Emu.IsStopped())
{
spu_log.error("SPU Runtime: Cache building aborted.");
Expand Down Expand Up @@ -2103,22 +2114,24 @@ void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/
}
}

std::vector<u32> spu_thread::discover_functions(const void* ls_start, u32 /*entry*/)
std::vector<u32> spu_thread::discover_functions(u32 base_addr, std::span<const u8> ls, bool is_known_addr, u32 /*entry*/)
{
std::vector<u32> calls;
calls.reserve(100);

// Discover functions
// Use the simplest method: search for the instructions that call them,
// then filter out invalid cases (does not detect tail calls)
for (u32 i = 0x10; i < SPU_LS_SIZE; i += 0x10)
const v128 brasl_mask = is_known_addr ? v128::from32p(0x62) : v128::from32p(umax);

for (u32 i = utils::align<u32>(base_addr, 0x10); i < std::min<u32>(base_addr + ls.size(), 0x3FFF0); i += 0x10)
{
// Search for BRSL and BRASL
// TODO: BISL
const v128 inst = read_from_ptr<be_t<v128>>(static_cast<const u8*>(ls_start), i);
const v128 inst = read_from_ptr<be_t<v128>>(ls.data(), i - base_addr);
const v128 shifted = gv_shr32(inst, 23);
const v128 eq_brsl = gv_eq32(shifted, v128::from32p(0x66));
const v128 eq_brasl = gv_eq32(shifted, v128::from32p(0x62));
const v128 eq_brasl = gv_eq32(shifted, brasl_mask);
const v128 result = eq_brsl | eq_brasl;

if (!gv_testz(result))
Expand All @@ -2136,14 +2149,14 @@ std::vector<u32> spu_thread::discover_functions(const void* ls_start, u32 /*entr
calls.erase(std::remove_if(calls.begin(), calls.end(), [&](u32 caller)
{
// Check the validity of both the callee code and the following caller code
return !is_exec_code(caller, ls_start) || !is_exec_code(caller + 4, ls_start);
return !is_exec_code(caller, ls, base_addr) || !is_exec_code(caller + 4, ls, base_addr);
}), calls.end());

std::vector<u32> addrs;

for (u32 addr : calls)
{
const spu_opcode_t op{read_from_ptr<be_t<u32>>(static_cast<const u8*>(ls_start), addr)};
const spu_opcode_t op{read_from_ptr<be_t<u32>>(ls, addr - base_addr)};

const u32 func = op_branch_targets(addr, op)[0];

Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/Cell/SPURecompiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class spu_cache

void add(const struct spu_program& func);

static void initialize();
static void initialize(bool build_existing_cache = true);
};

struct spu_program
Expand Down
Loading