From 53119977532dc1344fc5909005aff13f37a68a59 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 16 Dec 2023 08:10:29 -0800 Subject: [PATCH 1/3] x86jit: Correct downcount on replacement in IR. --- Core/MIPS/x86/X64IRCompSystem.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Core/MIPS/x86/X64IRCompSystem.cpp b/Core/MIPS/x86/X64IRCompSystem.cpp index 9d1723aef552..9918bd467858 100644 --- a/Core/MIPS/x86/X64IRCompSystem.cpp +++ b/Core/MIPS/x86/X64IRCompSystem.cpp @@ -232,8 +232,10 @@ void X64JitBackend::CompIR_System(IRInst inst) { ABI_CallFunction(GetReplacementFunc(inst.constant)->replaceFunc); WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT); LoadStaticRegisters(); - //SUB(32, R(DOWNCOUNTREG), R(DOWNCOUNTREG), R(EAX)); - SUB(32, MDisp(CTXREG, downcountOffset), R(EAX)); + if (jo.downcountInRegister) + SUB(32, R(DOWNCOUNTREG), R(EAX)); + else + SUB(32, MDisp(CTXREG, downcountOffset), R(EAX)); break; case IROp::Break: From 053831bf4dc491fa32a25a0db60b9ece55b13541 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 16 Dec 2023 09:08:58 -0800 Subject: [PATCH 2/3] HLE: Add mechanics for sliced replacements. 
--- Core/HLE/ReplaceTables.cpp | 2 +- Core/HLE/ReplaceTables.h | 2 ++ Core/MIPS/ARM/ArmJit.cpp | 11 +++++++++++ Core/MIPS/ARM64/Arm64IRCompSystem.cpp | 11 ++++++++++- Core/MIPS/ARM64/Arm64Jit.cpp | 11 +++++++++++ Core/MIPS/IR/IRFrontend.cpp | 5 ++++- Core/MIPS/IR/IRInst.cpp | 2 +- Core/MIPS/IR/IRInterpreter.cpp | 3 ++- Core/MIPS/MIPSInt.cpp | 6 +++++- Core/MIPS/RiscV/RiscVCompSystem.cpp | 7 +++++++ Core/MIPS/x86/Jit.cpp | 10 ++++++++++ Core/MIPS/x86/X64IRCompSystem.cpp | 11 +++++++++++ 12 files changed, 75 insertions(+), 6 deletions(-) diff --git a/Core/HLE/ReplaceTables.cpp b/Core/HLE/ReplaceTables.cpp index 4695d13926aa..695c39c05dbe 100644 --- a/Core/HLE/ReplaceTables.cpp +++ b/Core/HLE/ReplaceTables.cpp @@ -1738,7 +1738,7 @@ bool CanReplaceJalTo(u32 dest, const ReplacementTableEntry **entry, u32 *funcSiz return false; } - if ((*entry)->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT | REPFLAG_DISABLED)) { + if ((*entry)->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT | REPFLAG_DISABLED | REPFLAG_SLICED)) { // If it's a hook, we can't replace the jal, we have to go inside the func. return false; } diff --git a/Core/HLE/ReplaceTables.h b/Core/HLE/ReplaceTables.h index 980f506b6af1..c8dd61f400d6 100644 --- a/Core/HLE/ReplaceTables.h +++ b/Core/HLE/ReplaceTables.h @@ -48,6 +48,8 @@ enum { REPFLAG_HOOKENTER = 0x04, // Only hooks jr ra, so only use on funcs that have that. REPFLAG_HOOKEXIT = 0x08, + // Function may take a lot of time and execute in slices (executed multiple times.) + REPFLAG_SLICED = 0x10, }; // Kind of similar to HLE functions but with different data. 
diff --git a/Core/MIPS/ARM/ArmJit.cpp b/Core/MIPS/ARM/ArmJit.cpp index 4cfbc2512449..e830a6b3108e 100644 --- a/Core/MIPS/ARM/ArmJit.cpp +++ b/Core/MIPS/ARM/ArmJit.cpp @@ -617,7 +617,18 @@ void ArmJit::Comp_ReplacementFunc(MIPSOpcode op) } else { ApplyRoundingMode(); RestoreDowncount(); + + CMPI2R(R0, 0, SCRATCHREG2); + FixupBranch positive = B_CC(CC_GE); + + RSB(R0, R0, Operand2(0)); + MovFromPC(R1); + FixupBranch done = B(); + + SetJumpTarget(positive); LDR(R1, CTXREG, MIPS_REG_RA * 4); + + SetJumpTarget(done); WriteDownCountR(R0); WriteExitDestInR(R1); js.compiling = false; diff --git a/Core/MIPS/ARM64/Arm64IRCompSystem.cpp b/Core/MIPS/ARM64/Arm64IRCompSystem.cpp index 8fba3c320525..91a63978f419 100644 --- a/Core/MIPS/ARM64/Arm64IRCompSystem.cpp +++ b/Core/MIPS/ARM64/Arm64IRCompSystem.cpp @@ -242,7 +242,16 @@ void Arm64JitBackend::CompIR_System(IRInst inst) { QuickCallFunction(SCRATCH2_64, GetReplacementFunc(inst.constant)->replaceFunc); WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT); LoadStaticRegisters(); - SUB(DOWNCOUNTREG, DOWNCOUNTREG, W0); + + // Absolute value the result and subtract. + CMP(W0, 0); + CSNEG(SCRATCH1, W0, W0, CC_PL); + SUB(DOWNCOUNTREG, DOWNCOUNTREG, SCRATCH1); + + // W0 might be the mapped reg, but there's only one. + // Set dest reg to the sign of the result. 
+ regs_.Map(inst); + ASR(regs_.R(inst.dest), W0, 31); break; case IROp::Break: diff --git a/Core/MIPS/ARM64/Arm64Jit.cpp b/Core/MIPS/ARM64/Arm64Jit.cpp index 9abb69920bfc..a82b791d13e1 100644 --- a/Core/MIPS/ARM64/Arm64Jit.cpp +++ b/Core/MIPS/ARM64/Arm64Jit.cpp @@ -614,7 +614,18 @@ void Arm64Jit::Comp_ReplacementFunc(MIPSOpcode op) } else { ApplyRoundingMode(); LoadStaticRegisters(); + + CMPI2R(W0, 0); + FixupBranch positive = B(CC_GE); + + NEG(W0, W0); + MovFromPC(W1); + FixupBranch done = B(); + + SetJumpTarget(positive); LDR(INDEX_UNSIGNED, W1, CTXREG, MIPS_REG_RA * 4); + + SetJumpTarget(done); WriteDownCountR(W0); WriteExitDestInR(W1); js.compiling = false; diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp index 5b2a3896018a..227048439127 100644 --- a/Core/MIPS/IR/IRFrontend.cpp +++ b/Core/MIPS/IR/IRFrontend.cpp @@ -164,7 +164,7 @@ void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) { FlushAll(); RestoreRoundingMode(); ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC())); - ir.Write(IROp::CallReplacement, 0, ir.AddConstant(index)); + ir.Write(IROp::CallReplacement, IRTEMP_0, ir.AddConstant(index)); if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) { // Compile the original instruction at this address. We ignore cycles for hooks. @@ -172,7 +172,10 @@ void IRFrontend::Comp_ReplacementFunc(MIPSOpcode op) { MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); } else { ApplyRoundingMode(); + // If IRTEMP_0 was set non-zero (-1), it means the replacement needs to run again (sliced.) + // This is necessary for replacements that take a lot of cycles.
ir.Write(IROp::Downcount, 0, ir.AddConstant(js.downcountAmount)); + ir.Write(IROp::ExitToConstIfNeq, ir.AddConstant(GetCompilerPC()), IRTEMP_0, MIPS_REG_ZERO); ir.Write(IROp::ExitToReg, 0, MIPS_REG_RA, 0); js.compiling = false; } diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp index 3db8c4e4e2df..64d1497d5ef4 100644 --- a/Core/MIPS/IR/IRInst.cpp +++ b/Core/MIPS/IR/IRInst.cpp @@ -165,7 +165,7 @@ static const IRMeta irMeta[] = { { IROp::Break, "Break", "", IRFLAG_EXIT }, { IROp::SetPC, "SetPC", "_G" }, { IROp::SetPCConst, "SetPC", "_C" }, - { IROp::CallReplacement, "CallRepl", "_C", IRFLAG_BARRIER }, + { IROp::CallReplacement, "CallRepl", "GC", IRFLAG_BARRIER }, { IROp::Breakpoint, "Breakpoint", "_C", IRFLAG_BARRIER }, { IROp::MemoryCheck, "MemoryCheck", "IGC", IRFLAG_BARRIER }, diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp index 8fa713d80929..9d83d5da685d 100644 --- a/Core/MIPS/IR/IRInterpreter.cpp +++ b/Core/MIPS/IR/IRInterpreter.cpp @@ -1089,7 +1089,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) { int funcIndex = inst->constant; const ReplacementTableEntry *f = GetReplacementFunc(funcIndex); int cycles = f->replaceFunc(); - mips->downcount -= cycles; + mips->r[inst->dest] = cycles < 0 ? -1 : 0; + mips->downcount -= cycles < 0 ? -cycles : cycles; break; } diff --git a/Core/MIPS/MIPSInt.cpp b/Core/MIPS/MIPSInt.cpp index 95c341c98803..93644706d672 100644 --- a/Core/MIPS/MIPSInt.cpp +++ b/Core/MIPS/MIPSInt.cpp @@ -1038,13 +1038,17 @@ namespace MIPSInt int index = op.encoding & 0xFFFFFF; const ReplacementTableEntry *entry = GetReplacementFunc(index); if (entry && entry->replaceFunc && (entry->flags & REPFLAG_DISABLED) == 0) { - entry->replaceFunc(); + int cycles = entry->replaceFunc(); if (entry->flags & (REPFLAG_HOOKENTER | REPFLAG_HOOKEXIT)) { // Interpret the original instruction under the hook. 
MIPSInterpret(Memory::Read_Instruction(PC, true)); + } else if (cycles < 0) { + // Leave PC unchanged, call the replacement again (assumes args are modified.) + currentMIPS->downcount += cycles; } else { PC = currentMIPS->r[MIPS_REG_RA]; + currentMIPS->downcount -= cycles; } } else { if (!entry || !entry->replaceFunc) { diff --git a/Core/MIPS/RiscV/RiscVCompSystem.cpp b/Core/MIPS/RiscV/RiscVCompSystem.cpp index 4605648ed8e2..a291d8593074 100644 --- a/Core/MIPS/RiscV/RiscVCompSystem.cpp +++ b/Core/MIPS/RiscV/RiscVCompSystem.cpp @@ -220,6 +220,13 @@ void RiscVJitBackend::CompIR_System(IRInst inst) { QuickCallFunction(GetReplacementFunc(inst.constant)->replaceFunc, SCRATCH2); WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT); LoadStaticRegisters(); + + regs_.Map(inst); + SRAIW(regs_.R(inst.dest), X10, 31); + + // Absolute value trick: if neg, abs(x) == (x ^ -1) + 1. + XOR(X10, X10, regs_.R(inst.dest)); + SUBW(X10, X10, regs_.R(inst.dest)); SUB(DOWNCOUNTREG, DOWNCOUNTREG, X10); break; diff --git a/Core/MIPS/x86/Jit.cpp b/Core/MIPS/x86/Jit.cpp index 2f561a2cd05a..b23b8385d682 100644 --- a/Core/MIPS/x86/Jit.cpp +++ b/Core/MIPS/x86/Jit.cpp @@ -658,8 +658,18 @@ void Jit::Comp_ReplacementFunc(MIPSOpcode op) { ApplyRoundingMode(); MIPSCompileOp(Memory::Read_Instruction(GetCompilerPC(), true), this); } else { + CMP(32, R(EAX), Imm32(0)); + FixupBranch positive = J_CC(CC_GE); + + MOV(32, R(ECX), MIPSSTATE_VAR(pc)); + ADD(32, MIPSSTATE_VAR(downcount), R(EAX)); + FixupBranch done = J(); + + SetJumpTarget(positive); MOV(32, R(ECX), MIPSSTATE_VAR(r[MIPS_REG_RA])); SUB(32, MIPSSTATE_VAR(downcount), R(EAX)); + + SetJumpTarget(done); ApplyRoundingMode(); // Need to set flags again, ApplyRoundingMode destroyed them (and EAX.) 
SUB(32, MIPSSTATE_VAR(downcount), Imm8(0)); diff --git a/Core/MIPS/x86/X64IRCompSystem.cpp b/Core/MIPS/x86/X64IRCompSystem.cpp index 9918bd467858..c3bbd46aee4a 100644 --- a/Core/MIPS/x86/X64IRCompSystem.cpp +++ b/Core/MIPS/x86/X64IRCompSystem.cpp @@ -232,6 +232,17 @@ void X64JitBackend::CompIR_System(IRInst inst) { ABI_CallFunction(GetReplacementFunc(inst.constant)->replaceFunc); WriteDebugProfilerStatus(IRProfilerStatus::IN_JIT); LoadStaticRegisters(); + + // Since we flushed above, and we're mapping write, EAX should be safe. + regs_.Map(inst); + MOV(32, regs_.R(inst.dest), R(EAX)); + NEG(32, R(EAX)); + // Set it back if the negate made it negative. That's the absolute value. + CMOVcc(32, EAX, regs_.R(inst.dest), CC_S); + + // Now set the dest to the sign bit status. + SAR(32, regs_.R(inst.dest), Imm8(31)); + if (jo.downcountInRegister) SUB(32, R(DOWNCOUNTREG), R(EAX)); else From e1eecb475a89d9b4a05bf71fcb56354018a6e4e7 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 16 Dec 2023 09:24:15 -0800 Subject: [PATCH 3/3] HLE: Slice the very slow memset/memcpy variants. When they take an especially long time, this allows thread switches meanwhile. Important for cases where they might consume more than a total frame worth of cycles in a background thread.
--- Core/HLE/ReplaceTables.cpp | 68 +++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 15 deletions(-) diff --git a/Core/HLE/ReplaceTables.cpp b/Core/HLE/ReplaceTables.cpp index 695c39c05dbe..ac9054cd35e1 100644 --- a/Core/HLE/ReplaceTables.cpp +++ b/Core/HLE/ReplaceTables.cpp @@ -182,23 +182,33 @@ static int Replace_memcpy_jak() { u32 destPtr = PARAM(0); u32 srcPtr = PARAM(1); u32 bytes = PARAM(2); - bool skip = false; + if (bytes == 0) { RETURN(destPtr); return 5; } + + bool skip = false; + bool sliced = false; + static constexpr uint32_t SLICE_SIZE = 32768; + currentMIPS->InvalidateICache(srcPtr, bytes); if ((skipGPUReplacements & (int)GPUReplacementSkip::MEMCPY) == 0) { if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) { skip = gpu->PerformMemoryCopy(destPtr, srcPtr, bytes); } } + if (!skip && bytes > SLICE_SIZE && bytes != 512 * 272 * 4) { + // This is a very slow func. To avoid thread blocking, do a slice at a time. + // Avoiding exactly 512 * 272 * 4 to detect videos, though. + bytes = SLICE_SIZE; + sliced = true; + } if (!skip && bytes != 0) { u8 *dst = Memory::GetPointerWriteRange(destPtr, bytes); const u8 *src = Memory::GetPointerRange(srcPtr, bytes); - if (!dst || !src) { - } else { + if (dst && src) { // Jak style overlap. for (u32 i = 0; i < bytes; i++) { dst[i] = src[i]; @@ -206,13 +216,20 @@ static int Replace_memcpy_jak() { } } - // Jak relies on more registers coming out right than the ABI specifies. - // See the disassembly of the function for the explanations for these... - currentMIPS->r[MIPS_REG_T0] = 0; - currentMIPS->r[MIPS_REG_A0] = -1; - currentMIPS->r[MIPS_REG_A2] = 0; - currentMIPS->r[MIPS_REG_A3] = destPtr + bytes; - RETURN(destPtr); + if (sliced) { + currentMIPS->r[MIPS_REG_A0] += SLICE_SIZE; + currentMIPS->r[MIPS_REG_A1] += SLICE_SIZE; + currentMIPS->r[MIPS_REG_A2] -= SLICE_SIZE; + } else { + // Jak relies on more registers coming out right than the ABI specifies. 
+ // See the disassembly of the function for the explanations for these... + currentMIPS->r[MIPS_REG_T0] = 0; + currentMIPS->r[MIPS_REG_A0] = -1; + currentMIPS->r[MIPS_REG_A2] = 0; + // Even after slicing, this ends up correct. + currentMIPS->r[MIPS_REG_A3] = destPtr + bytes; + RETURN(destPtr); + } if (MemBlockInfoDetailed(bytes)) { // It's pretty common that games will copy video data. @@ -231,6 +248,10 @@ static int Replace_memcpy_jak() { } } + if (sliced) { + // Negative causes the function to be run again for the next slice. + return 5 + bytes * -8 + 2; + } return 5 + bytes * 8 + 2; // approximation. This is a slow memcpy - a byte copy loop.. } @@ -364,9 +385,16 @@ static int Replace_memset_jak() { } bool skip = false; + bool sliced = false; + static constexpr uint32_t SLICE_SIZE = 32768; if (Memory::IsVRAMAddress(destPtr) && (skipGPUReplacements & (int)GPUReplacementSkip::MEMSET) == 0) { skip = gpu->PerformMemorySet(destPtr, value, bytes); } + if (!skip && bytes > SLICE_SIZE) { + // This is a very slow func. To avoid thread blocking, do a slice at a time. + bytes = SLICE_SIZE; + sliced = true; + } if (!skip && bytes != 0) { u8 *dst = Memory::GetPointerWriteRange(destPtr, bytes); if (dst) { @@ -374,14 +402,24 @@ static int Replace_memset_jak() { } } + NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, "ReplaceMemset"); + + if (sliced) { + currentMIPS->r[MIPS_REG_A0] += SLICE_SIZE; + currentMIPS->r[MIPS_REG_A2] -= SLICE_SIZE; + + // This is approximate, and must be a negative value. + // Negative causes the function to be run again for the next slice. + return 5 + (int)SLICE_SIZE * -6 + 2; + } + + // Even after slicing, this ends up correct. 
currentMIPS->r[MIPS_REG_T0] = destPtr + bytes; currentMIPS->r[MIPS_REG_A2] = -1; currentMIPS->r[MIPS_REG_A3] = -1; RETURN(destPtr); - NotifyMemInfo(MemBlockFlags::WRITE, destPtr, bytes, "ReplaceMemset"); - - return 5 + bytes * 6 + 2; // approximation (hm, inspecting the disasm this should be 5 + 6 * bytes + 2, but this is what works..) + return 5 + bytes * 6 + 2; // approximation } static uint32_t SafeStringLen(const uint32_t ptr, uint32_t maxLen = 0x07FFFFFF) { @@ -1449,12 +1487,12 @@ static const ReplacementTableEntry entries[] = { { "ceilf", &Replace_ceilf, 0, REPFLAG_DISABLED }, { "memcpy", &Replace_memcpy, 0, 0 }, - { "memcpy_jak", &Replace_memcpy_jak, 0, 0 }, + { "memcpy_jak", &Replace_memcpy_jak, 0, REPFLAG_SLICED }, { "memcpy16", &Replace_memcpy16, 0, 0 }, { "memcpy_swizzled", &Replace_memcpy_swizzled, 0, 0 }, { "memmove", &Replace_memmove, 0, 0 }, { "memset", &Replace_memset, 0, 0 }, - { "memset_jak", &Replace_memset_jak, 0, 0 }, + { "memset_jak", &Replace_memset_jak, 0, REPFLAG_SLICED }, { "strlen", &Replace_strlen, 0, REPFLAG_DISABLED }, { "strcpy", &Replace_strcpy, 0, REPFLAG_DISABLED }, { "strncpy", &Replace_strncpy, 0, REPFLAG_DISABLED },