From 55ec3552f9fe357af2fb12c393e5cb956b3b7c61 Mon Sep 17 00:00:00 2001 From: zhaixiang Date: Mon, 9 Dec 2024 14:51:13 +0800 Subject: [PATCH 1/3] [LA64_DYNAREC] Added more 660F opcodes --- src/dynarec/la64/dynarec_la64_660f.c | 243 ++++++++++++++++++++++++- src/dynarec/la64/dynarec_la64_helper.h | 2 + src/dynarec/la64/la64_emitter.h | 4 + 3 files changed, 248 insertions(+), 1 deletion(-) diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c index 4daba8bf96..e9738608f3 100644 --- a/src/dynarec/la64/dynarec_la64_660f.c +++ b/src/dynarec/la64/dynarec_la64_660f.c @@ -36,7 +36,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int uint8_t eb1, eb2; int64_t j64; uint64_t tmp64u, tmp64u2; - int v0, v1; + int v0, v1, v2; int q0, q1; int d0, d1, d2; int64_t fixedaddress, gdoffset; @@ -316,6 +316,81 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(q1, 0, 0); VSIGNCOV_W(q0, q1, q0); break; + case 0x0B: + INST_NAME("PMULHRSW Gx,Ex"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 0); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + VMULWEV_W_H(v0, q0, q1); + VMULWOD_W_H(v1, q0, q1); + VSRAI_W(v0, v0, 14); + VSRAI_W(v1, v1, 14); + VADDI_WU(v0, v0, 1); + VADDI_WU(v1, v1, 1); + VSRANI_H_W(v1, v0, 1); + VSHUF4I_W(v1, v1, 0xd8); + VSHUF4I_H(q0, v1, 0xd8); + break; + case 0x1C: + INST_NAME("PABSB Gx,Ex"); + nextop = F8; + GETEX(q1, 0, 0); + GETGX_empty(q0); + v0 = fpu_get_scratch(dyn); + VREPLGR2VR_D(v0, xZR); + VABSD_B(q0, q1, v0); + break; + case 0x1D: + INST_NAME("PABSW Gx,Ex"); + nextop = F8; + GETEX(q1, 0, 0); + GETGX_empty(q0); + v0 = fpu_get_scratch(dyn); + VREPLGR2VR_D(v0, xZR); + VABSD_H(q0, q1, v0); + break; + case 0x2B: + INST_NAME("PACKUSDW Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETEX(q1, 0, 0); + GETGX(q0, 1); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + VSLTI_W(v0, q0, 0); + VANDN_V(q0, v0, q0); + VSSRANI_HU_W(q0, q0, 0); + if (q0 == q1) { + VEXTRINS_D(q0, q0, VEXTRINS_IMM_4_0(1, 0)); + } else { + VSLTI_W(v1, q1, 0); + VANDN_V(v1, v1, q1); + VSSRANI_HU_W(v1, v1, 0); + VEXTRINS_D(q0, v1, VEXTRINS_IMM_4_0(1, 0)); + } + break; + case 0x3A: + INST_NAME("PMINUW Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETEX(q1, 0, 0); + GETGX(q0, 1); + VMIN_HU(q0, q0, q1); + break; + case 0x3D: + INST_NAME("PMAXSD Gx, Ex"); // SSE4 opcode! + nextop = F8; + GETEX(q1, 0, 0); + GETGX(q0, 1); + VMAX_W(q0, q0, q1); + break; + case 0x40: + INST_NAME("PMULLD Gx, Ex"); // SSE4 opcode! 
+ nextop = F8; + GETEX(q1, 0, 0); + GETGX(q0, 1); + VMUL_W(q0, q0, q1); + break; case 0xDB: INST_NAME("AESIMC Gx, Ex"); // AES-NI nextop = F8; @@ -418,6 +493,63 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int VOR_V(q0, q0, d0); } break; + case 0x0E: + INST_NAME("PBLENDW Gx, Ex, Ib"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 1); + u8 = F8; + i32 = 0; + if (q0 != q1) { + if (u8 == 0xff) { + VAND_V(q0, q1, q1); + } else { + /* 64bits */ + if ((u8 & 0xf) == 0xf) { + VEXTRINS_D(q0, q1, VEXTRINS_IMM_4_0(0, 0)); + u8 &= ~0xf; + } + if ((u8 & 0xf0) == 0xf0) { + VEXTRINS_D(q0, q1, VEXTRINS_IMM_4_0(1, 1)); + u8 &= ~0xf0; + } + /* 32bits */ + if ((u8 & 0x3) == 0x3) { + VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(0, 0)); + u8 &= ~0x3; + } + if ((u8 & 0xc) == 0xc) { + VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(1, 1)); + u8 &= ~0xc; + } + if ((u8 & 0x30) == 0x30) { + VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(2, 2)); + u8 &= ~0x30; + } + if ((u8 & 0xc0) == 0xc0) { + VEXTRINS_W(q0, q1, VEXTRINS_IMM_4_0(3, 3)); + u8 &= ~0xc0; + } + /* 16bits */ + if (u8 & 0x1) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(0, 0)); + if (u8 & 0x2) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(1, 1)); + if (u8 & 0x4) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(2, 2)); + if (u8 & 0x8) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(3, 3)); + if (u8 & 0x10) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(4, 4)); + if (u8 & 0x20) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(5, 5)); + if (u8 & 0x40) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(6, 6)); + if (u8 & 0x80) + VEXTRINS_H(q0, q1, VEXTRINS_IMM_4_0(7, 7)); + } + } + break; case 0x16: if (rex.w) { INST_NAME("PEXTRQ Ed, Gx, Ib"); @@ -1166,6 +1298,19 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int u8 = F8; VSHUF4I_D(v0, v1, 0x8 | (u8 & 1) | ((u8 & 2) << 1)); break; + case 0xD1: + INST_NAME("PSRLW Gx,Ex"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 0); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + VREPLVEI_D(v0, q1, 0); + VSLEI_DU(v1, v0, 15); + VREPLVEI_H(v0, q1, 0); + VSRL_H(q0, q0, v0); + VAND_V(q0, q0, v1); + break; case 0xD2: INST_NAME("PSRLD Gx, Ex"); nextop = F8; @@ -1242,6 +1387,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int MOVFR2GR_D(x1, v0); BSTRPICK_D(gd, x1, 15, 0); break; + case 0xD8: + INST_NAME("PSUBUSB Gx, Ex"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 0); + VSSUB_BU(q0, q0, q1); + break; case 0xD9: INST_NAME("PSUBUSW Gx, Ex"); nextop = F8; @@ -1249,6 +1401,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(q1, 0, 0); VSSUB_HU(q0, q0, q1); break; + case 0xDA: + INST_NAME("PMINUB Gx, Ex"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 0); + VMIN_BU(q0, q0, q1); + break; case 0xDB: INST_NAME("PAND Gx,Ex"); nextop = F8; @@ -1263,6 +1422,20 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(q1, 0, 0); VSADD_BU(q0, q0, q1); break; + case 0xDD: + INST_NAME("PADDUSW Gx,Ex"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 0); + VSADD_HU(q0, q0, q1); + break; + case 0xDE: + INST_NAME("PMAXUB Gx, Ex"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 0); + VMAX_BU(q0, q0, q1); + break; case 0xDF: INST_NAME("PANDN Gx,Ex"); nextop = F8; @@ -1287,6 +1460,21 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int VREPLVEI_H(v0, v0, 0); VSRA_H(q0, q0, v0); break; + case 0xE2: + INST_NAME("PSRAD Gx,Ex"); + nextop = F8; + GETGX(q0, 1); + GETEX(q1, 0, 0); + v0 = fpu_get_scratch(dyn); + v1 = fpu_get_scratch(dyn); + v2 = 
fpu_get_scratch(dyn); + VREPLVEI_D(v0, q1, 0); + VSLEI_DU(v1, v0, 31); + VREPLVEI_W(v0, q1, 0); + VSRAI_W(v2, q0, 31); + VSRA_W(q0, q0, v0); + VBITSEL_V(q0, v2, q0, v1); + break; case 0xE3: INST_NAME("PAVGW Gx,Ex"); nextop = F8; @@ -1328,6 +1516,27 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int VST(v0, ed, fixedaddress); } break; + case 0xE8: + INST_NAME("PSUBSB Gx,Ex"); + nextop = F8; + GETGX(v0, 1); + GETEX(q0, 0, 0); + VSSUB_B(v0, v0, q0); + break; + case 0xE9: + INST_NAME("PSUBSW Gx,Ex"); + nextop = F8; + GETGX(v0, 1); + GETEX(q0, 0, 0); + VSSUB_H(v0, v0, q0); + break; + case 0xEA: + INST_NAME("PMINSW Gx,Ex"); + nextop = F8; + GETGX(v0, 1); + GETEX(q0, 0, 0); + VMIN_H(v0, v0, q0); + break; case 0xEB: INST_NAME("POR Gx,Ex"); nextop = F8; @@ -1335,6 +1544,27 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(q0, 0, 0); VOR_V(v0, v0, q0); break; + case 0xEC: + INST_NAME("PADDSB Gx,Ex"); + nextop = F8; + GETGX(v0, 1); + GETEX(q0, 0, 0); + VSADD_B(v0, v0, q0); + break; + case 0xED: + INST_NAME("PADDSW Gx,Ex"); + nextop = F8; + GETGX(v0, 1); + GETEX(q0, 0, 0); + VSADD_H(v0, v0, q0); + break; + case 0xEE: + INST_NAME("PMAXSW Gx,Ex"); + nextop = F8; + GETGX(v0, 1); + GETEX(q0, 0, 0); + VMAX_H(v0, v0, q0); + break; case 0xEF: INST_NAME("PXOR Gx,Ex"); nextop = F8; @@ -1356,6 +1586,17 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(v1, 0, 0); VMULWEV_D_WU(v0, v0, v1); break; + case 0xF5: + INST_NAME("PMADDWD Gx, Ex"); + nextop = F8; + GETGX(v0, 1); + GETEX(v1, 0, 0); + q0 = fpu_get_scratch(dyn); + q1 = fpu_get_scratch(dyn); + VMULWEV_W_H(q0, v0, v1); + VMULWOD_W_H(q1, v0, v1); + VADD_W(v0, q0, q1); + break; case 0xF6: INST_NAME("PSADBW Gx, Ex"); nextop = F8; diff --git a/src/dynarec/la64/dynarec_la64_helper.h b/src/dynarec/la64/dynarec_la64_helper.h index cbfdc4ef52..b4b33443d2 100644 --- a/src/dynarec/la64/dynarec_la64_helper.h +++ b/src/dynarec/la64/dynarec_la64_helper.h @@ -314,6 +314,8 @@ ed = i; \ } +#define VEXTRINS_IMM_4_0(n, m) ((n & 0xf) << 4 | (m & 0xf)) + // Get GX as a quad (might use x1) #define GETGX(a, w) \ gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ diff --git a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h index eeaab03ad5..21f3a6b7b4 100644 --- a/src/dynarec/la64/la64_emitter.h +++ b/src/dynarec/la64/la64_emitter.h @@ -1287,6 +1287,7 @@ LSX instruction starts with V, LASX instruction starts with XV. #define VBITSET_H(vd, vj, vk) EMIT(type_3R(0b01110001000011101, vk, vj, vd)) #define VBITSET_W(vd, vj, vk) EMIT(type_3R(0b01110001000011110, vk, vj, vd)) #define VBITSET_D(vd, vj, vk) EMIT(type_3R(0b01110001000011111, vk, vj, vd)) +#define VBITSEL_V(vd, vj, vk, va) EMIT(type_4R(0b000011010001, va, vk, vj, vd)) #define VBITREV_B(vd, vj, vk) EMIT(type_3R(0b01110001000100000, vk, vj, vd)) #define VBITREV_H(vd, vj, vk) EMIT(type_3R(0b01110001000100001, vk, vj, vd)) #define VBITREV_W(vd, vj, vk) EMIT(type_3R(0b01110001000100010, vk, vj, vd)) @@ -1369,9 +1370,11 @@ LSX instruction starts with V, LASX instruction starts with XV. 
#define VSLE_HU(vd, vj, vk) EMIT(type_3R(0b01110000000001001, vk, vj, vd)) #define VSLE_WU(vd, vj, vk) EMIT(type_3R(0b01110000000001010, vk, vj, vd)) #define VSLE_DU(vd, vj, vk) EMIT(type_3R(0b01110000000001011, vk, vj, vd)) +#define VSLEI_DU(vd, vj, imm5) EMIT(type_2RI5(0b01110010100001011, imm5, vj, vd)) #define VSLT_B(vd, vj, vk) EMIT(type_3R(0b01110000000001100, vk, vj, vd)) #define VSLT_H(vd, vj, vk) EMIT(type_3R(0b01110000000001101, vk, vj, vd)) #define VSLT_W(vd, vj, vk) EMIT(type_3R(0b01110000000001110, vk, vj, vd)) +#define VSLTI_W(vd, vj, imm5) EMIT(type_2RI5(0b01110010100001110, imm5, vj, vd)) #define VSLT_D(vd, vj, vk) EMIT(type_3R(0b01110000000001111, vk, vj, vd)) #define VSLT_BU(vd, vj, vk) EMIT(type_3R(0b01110000000010000, vk, vj, vd)) #define VSLT_HU(vd, vj, vk) EMIT(type_3R(0b01110000000010001, vk, vj, vd)) @@ -1818,6 +1821,7 @@ LSX instruction starts with V, LASX instruction starts with XV. #define VEXT2XV_WU_HU(vd, vj) EMIT(type_2R(0b0111011010011111001101, vj, vd)) #define VEXT2XV_DU_HU(vd, vj) EMIT(type_2R(0b0111011010011111001110, vj, vd)) #define VEXT2XV_DU_WU(vd, vj) EMIT(type_2R(0b0111011010011111001111, vj, vd)) +#define VREPLGR2VR_D(vd, rj) EMIT(type_2R(0b0111001010011111000011, rj, vd)) //////////////////////////////////////////////////////////////////////////////// // (undocumented) LBT extension instructions From ad59514c2c0cab37c637c710ffbbabac596448b2 Mon Sep 17 00:00:00 2001 From: zhaixiang Date: Tue, 10 Dec 2024 14:00:28 +0800 Subject: [PATCH 2/3] [LA64_DYNAREC] Change VREPLGR2VR_D to VXOR_V --- src/dynarec/la64/dynarec_la64_660f.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c index e9738608f3..d01421439f 100644 --- a/src/dynarec/la64/dynarec_la64_660f.c +++ b/src/dynarec/la64/dynarec_la64_660f.c @@ -339,7 +339,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(q1, 0, 0); GETGX_empty(q0); v0 = fpu_get_scratch(dyn); - VREPLGR2VR_D(v0, xZR); + VXOR_V(v0, v0, v0); VABSD_B(q0, q1, v0); break; case 0x1D: @@ -348,7 +348,7 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(q1, 0, 0); GETGX_empty(q0); v0 = fpu_get_scratch(dyn); - VREPLGR2VR_D(v0, xZR); + VXOR_V(v0, v0, v0); VABSD_H(q0, q1, v0); break; case 0x2B: From 676aa8c5d560d72ee0d095c24b25a313df21b9a7 Mon Sep 17 00:00:00 2001 From: zhaixiang Date: Tue, 10 Dec 2024 17:07:38 +0800 Subject: [PATCH 3/3] [LA64_DYNAREC] Optimize PMULHRSW Co-authored-by: Yang Liu --- src/dynarec/la64/dynarec_la64_660f.c | 16 +++++++--------- src/dynarec/la64/la64_emitter.h | 4 +++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/dynarec/la64/dynarec_la64_660f.c b/src/dynarec/la64/dynarec_la64_660f.c index d01421439f..6dc339d9e2 100644 --- a/src/dynarec/la64/dynarec_la64_660f.c +++ b/src/dynarec/la64/dynarec_la64_660f.c @@ -323,15 +323,13 @@ uintptr_t dynarec64_660F(dynarec_la64_t* dyn, uintptr_t addr, uintptr_t ip, int GETEX(q1, 0, 0); v0 = fpu_get_scratch(dyn); v1 = fpu_get_scratch(dyn); - VMULWEV_W_H(v0, q0, q1); - VMULWOD_W_H(v1, q0, q1); - VSRAI_W(v0, v0, 14); - VSRAI_W(v1, v1, 14); - VADDI_WU(v0, v0, 1); - VADDI_WU(v1, v1, 1); - VSRANI_H_W(v1, v0, 1); - VSHUF4I_W(v1, v1, 0xd8); - VSHUF4I_H(q0, v1, 0xd8); + VEXT2XV_W_H(v0, q0); + VEXT2XV_W_H(v1, q1); + XVMUL_W(v0, v0, v1); + XVSRLI_W(v0, v0, 14); + XVADDI_WU(v0, v0, 1); + XVSRLNI_H_W(v0, v0, 1); + XVPERMI_D(q0, v0, 0b1000); break; case 0x1C: INST_NAME("PABSB Gx,Ex"); diff --git 
a/src/dynarec/la64/la64_emitter.h b/src/dynarec/la64/la64_emitter.h index 21f3a6b7b4..6e98806fdf 100644 --- a/src/dynarec/la64/la64_emitter.h +++ b/src/dynarec/la64/la64_emitter.h @@ -1821,7 +1821,9 @@ LSX instruction starts with V, LASX instruction starts with XV. #define VEXT2XV_WU_HU(vd, vj) EMIT(type_2R(0b0111011010011111001101, vj, vd)) #define VEXT2XV_DU_HU(vd, vj) EMIT(type_2R(0b0111011010011111001110, vj, vd)) #define VEXT2XV_DU_WU(vd, vj) EMIT(type_2R(0b0111011010011111001111, vj, vd)) -#define VREPLGR2VR_D(vd, rj) EMIT(type_2R(0b0111001010011111000011, rj, vd)) +#define XVADDI_WU(vd, vj, imm5) EMIT(type_2RI5(0b01110110100010110, imm5, vj, vd)) +#define XVSRLNI_H_W(vd, vj, imm5) EMIT(type_2RI5(0b01110111010000001, imm5, vj, vd)) +#define XVSRLI_W(vd, vj, imm5) EMIT(type_2RI5(0b01110111001100001, imm5, vj, vd)) //////////////////////////////////////////////////////////////////////////////// // (undocumented) LBT extension instructions
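
Reviewer notes (appended after the series, not part of any patch): scalar C reference models for the less obvious lowerings above. Every helper name below is illustrative only and does not exist in the box64 tree.

PMULHRSW (66 0F 38 0B): patch 3 widens both vectors to 32-bit lanes (VEXT2XV_W_H), multiplies, then applies the SSSE3 rounding ((a*b >> 14) + 1) >> 1. A minimal sketch of that per-lane semantics, assuming an arithmetic right shift for signed ints (true on every mainstream compiler):

#include <stdint.h>

/* One PMULHRSW lane: signed 16x16 multiply, scale by >> 14, round to
   nearest with the +1 bias, drop the rounding bit.  This is what
   XVMUL_W / XVSRLI_W 14 / XVADDI_WU 1 / XVSRLNI_H_W 1 compute on the
   widened lanes. */
static int16_t pmulhrsw_lane(int16_t a, int16_t b)
{
    int32_t p = (int32_t)a * (int32_t)b; /* widened, as after VEXT2XV_W_H */
    return (int16_t)(((p >> 14) + 1) >> 1);
}

Note that XVSRLI_W (a logical shift) is safe here even for negative products: the final XVSRLNI_H_W keeps only bits [16:1] of the biased value, and logical and arithmetic shifts by 14 agree on all of those bits.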
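
PACKUSDW (66 0F 38 2B): the lowering zeroes negative dwords up front (VSLTI_W mask plus VANDN_V) and then narrows with VSSRANI_HU_W, which saturates the upper end. A sketch of the per-lane result being implemented:

#include <stdint.h>

/* One PACKUSDW output lane: a signed 32-bit input saturated to the
   unsigned 16-bit range.  The lower clamp is the VSLTI_W/VANDN_V
   step in the dynarec; the upper clamp is VSSRANI_HU_W. */
static uint16_t packusdw_lane(int32_t x)
{
    if (x < 0)
        return 0;
    if (x > 0xffff)
        return 0xffff;
    return (uint16_t)x;
}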
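
PBLENDW (66 0F 3A 0E): the lowering peels fully-set 4-bit and 2-bit groups off the immediate first, so a single VEXTRINS_D or VEXTRINS_W covers four or two 16-bit lanes, then mops up leftover single bits with VEXTRINS_H. The immediate semantics it implements, as a sketch (helper name hypothetical):

#include <stdint.h>

/* PBLENDW Gx, Ex, Ib: bit i of the immediate selects 16-bit lane i
   from the source; clear bits keep the destination lane. */
static void pblendw_model(uint16_t gx[8], const uint16_t ex[8], uint8_t imm)
{
    for (int i = 0; i < 8; i++)
        if (imm & (1u << i))
            gx[i] = ex[i];
}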
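
PSRAD Gx,Ex (66 0F E2): x86 takes the count from the low 64 bits of Ex and keeps shifting in sign bits once the count exceeds 31; VSRA.W alone does not do that (it uses only the low bits of the count), hence the VSLEI_DU test and the VBITSEL_V select against an all-sign-bits vector built with VSRAI_W by 31. The same pattern, with zero fill instead of sign fill, underlies the PSRLW case (0F D1). A sketch of the intended per-lane behavior:

#include <stdint.h>

/* PSRAD with a register count: counts above 31 saturate to "all sign
   bits" (0 or -1), which is what the VBITSEL_V fallback produces. */
static int32_t psrad_lane(int32_t x, uint64_t count)
{
    if (count > 31)
        return x >> 31;          /* every result bit is the sign bit */
    return x >> (unsigned)count; /* plain arithmetic shift */
}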
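
PMADDWD (66 0F F5): VMULWEV_W_H and VMULWOD_W_H produce the even- and odd-lane 32-bit products and VADD_W sums each pair, which matches the x86 pairwise multiply-accumulate. A sketch for one output lane:

#include <stdint.h>

/* One PMADDWD output lane: the sum of two adjacent signed 16x16
   products.  Summing in unsigned sidesteps the single overflowing
   corner case (all four inputs == -32768), which wraps on x86 too. */
static int32_t pmaddwd_lane(const int16_t a[2], const int16_t b[2])
{
    uint32_t s = (uint32_t)((int32_t)a[0] * b[0])
               + (uint32_t)((int32_t)a[1] * b[1]);
    return (int32_t)s;
}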