From ae69a12b3bd40bc86997c96cac9239f2f973aa95 Mon Sep 17 00:00:00 2001 From: Jordi Vaquero Date: Fri, 27 Mar 2020 12:04:12 +0100 Subject: [PATCH] arch-arm: ARMv8.3 CompNum, SIMD complex number support This patch implements the CompNum SIMD instructions for ARMv8.3. These instructions are Fcadd, Fcmla (vector and element) and Vcadd, Vcmla (vector and element). + isa/decoder/thumb.isa: Decoding changes for SIMD instructions in T32 + isa/formats/fp.isa: Decoding changes for SIMD instructions in A32 + isa/formats/uncond.isa: Decoding changes for SIMD instructions in A32 + isa/formats/aarch64.isa: Decoding changes for SIMD instructions in A64 + isa/formats/neon64.isa: Decoding changes for SIMD instructions in A64 + isa/insts/neon.isa: Vcadd, Vcmla instruction implementation + isa/insts/neon64.isa: Fcadd, Fcmla instruction implementation + isa/templates/neon.isa: Modify templates for adding byElement support Change-Id: I7f11ce88137dad077d2cad698dcaa9a79a3f317b Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/27183 Tested-by: Gem5 Cloud Project GCB service account <345032938727@cloudbuild.gserviceaccount.com> Reviewed-by: Giacomo Travaglini Maintainer: Giacomo Travaglini --- src/arch/arm/ArmISA.py | 6 +- src/arch/arm/isa/decoder/thumb.isa | 2 + src/arch/arm/isa/formats/aarch64.isa | 6 +- src/arch/arm/isa/formats/fp.isa | 87 +++++++++++++++ src/arch/arm/isa/formats/neon64.isa | 130 +++++++++++++++++++++- src/arch/arm/isa/formats/uncond.isa | 8 +- src/arch/arm/isa/insts/neon.isa | 157 +++++++++++++++++++++++++-- src/arch/arm/isa/insts/neon64.isa | 120 +++++++++++++++++++- src/arch/arm/isa/templates/neon.isa | 6 + 9 files changed, 496 insertions(+), 26 deletions(-) diff --git a/src/arch/arm/ArmISA.py b/src/arch/arm/ArmISA.py index 0d917058c..2641ec3fb 100644 --- a/src/arch/arm/ArmISA.py +++ b/src/arch/arm/ArmISA.py @@ -79,7 +79,7 @@ class ArmISA(BaseISA): id_isar2 = Param.UInt32(0x21232141, "Instruction Set Attribute Register 2") id_isar3 = 
Param.UInt32(0x01112131, "Instruction Set Attribute Register 3") id_isar4 = Param.UInt32(0x10010142, "Instruction Set Attribute Register 4") - id_isar5 = Param.UInt32(0x00000000, "Instruction Set Attribute Register 5") + id_isar5 = Param.UInt32(0x10000000, "Instruction Set Attribute Register 5") fpsid = Param.UInt32(0x410430a0, "Floating-point System ID Register") @@ -101,8 +101,8 @@ class ArmISA(BaseISA): id_aa64isar0_el1 = Param.UInt64(0x0000000000000000, "AArch64 Instruction Set Attribute Register 0") - # GPI = 0x0 | GPA = 0x1| API=0x0 | APA=0x1 - id_aa64isar1_el1 = Param.UInt64(0x0000000001000010, + # GPI = 0x0 | GPA = 0x1| API=0x0 | APA=0x1 | FCMA + id_aa64isar1_el1 = Param.UInt64(0x0000000001010010, "AArch64 Instruction Set Attribute Register 1") # 4K | 64K | !16K | !BigEndEL0 | !SNSMem | !BigEnd | 8b ASID | 40b PA diff --git a/src/arch/arm/isa/decoder/thumb.isa b/src/arch/arm/isa/decoder/thumb.isa index 7f04ef348..c319ec305 100644 --- a/src/arch/arm/isa/decoder/thumb.isa +++ b/src/arch/arm/isa/decoder/thumb.isa @@ -138,9 +138,11 @@ decode BIGTHUMB { 0x3: Thumb32LongMulMulAccAndDiv::thumb32LongMulMulAccAndDiv(); default: Thumb32DataProcReg::thumb32DataProcReg(); } + 0x2: Thumb32NeonSIMD::thumb32NeonSIMD(); default: decode HTOPCODE_9_8 { 0x2: decode LTOPCODE_4 { 0x0: decode LTCOPROC { + 0x8: Thumb32NeonSIMD::thumb32NeonSIMD(); 0xa, 0xb: VfpData::vfpData(); default: WarnUnimpl::cdp(); // cdp2 } diff --git a/src/arch/arm/isa/formats/aarch64.isa b/src/arch/arm/isa/formats/aarch64.isa index 7aaca04f5..4ff54564c 100644 --- a/src/arch/arm/isa/formats/aarch64.isa +++ b/src/arch/arm/isa/formats/aarch64.isa @@ -2313,10 +2313,8 @@ namespace Aarch64 } else { return new Unknown64(machInst); } - } else if (bits(machInst, 24) || - bits(machInst, 21) || - bits(machInst, 15)) { - return new Unknown64(machInst); + } else if (bits(machInst, 15) == 1) { + return decodeNeon3SameExtra(machInst); } else if (bits(machInst, 10) == 1) { if (bits(machInst, 23, 22)) return new 
Unknown64(machInst); diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa index de0fdd270..f1b387e2f 100644 --- a/src/arch/arm/isa/formats/fp.isa +++ b/src/arch/arm/isa/formats/fp.isa @@ -96,6 +96,9 @@ let {{ StaticInstPtr decodeNeonData(ExtMachInst machInst); + + StaticInstPtr + decodeAdvancedSIMD(ExtMachInst machInst); ''' decoder_output = ''' @@ -333,6 +336,84 @@ let {{ return new Unknown(machInst); } ''' + decoder_output += ''' + StaticInstPtr + decodeAdvancedSIMD(ExtMachInst machInst) + { + uint8_t op_code = (bits(machInst, 25) << 1) + | bits(machInst, 21); + + IntRegIndex vd = (IntRegIndex)(2 * (bits(machInst, 15, 12) | + (bits(machInst, 22) << 4))); + IntRegIndex vn = (IntRegIndex)(2 * (bits(machInst, 19, 16) | + (bits(machInst, 7) << 4))); + IntRegIndex vm = (IntRegIndex)(2 * (bits(machInst, 3, 0) | + (bits(machInst, 5) << 4))); + bool q = bits (machInst, 6); + switch (op_code) { + case 0x0: + { + // VCADD + bool s = bits (machInst, 20); + if (s) { + if (q) + return new VcaddQ(machInst, vd, vn, vm); + else + return new VcaddD(machInst, vd, vn, vm); + } else { + if (q) + return new VcaddQ(machInst, vd, vn, vm); + else + return new VcaddD(machInst, vd, vn, vm); + } + } + case 0x1: + { + // VCMLA + bool s = bits (machInst, 20); + if (s) { + if (q) + return new VcmlaQ(machInst, vd, vn, vm); + else + return new VcmlaD(machInst, vd, vn, vm); + } else { + if (q) + return new VcmlaQ(machInst, vd, vn, vm); + else + return new VcmlaD(machInst, vd, vn, vm); + } + } + case 0x2: + case 0x3: + { + // VCMLA by element + bool s = bits (machInst, 23); + if (s) { + uint8_t index_fp = 0; + if (q) + return new VcmlaElemQ(machInst, vd, vn, vm, + index_fp); + else + return new VcmlaElemD(machInst, vd, vn, vm, + index_fp); + } else { + vm = (IntRegIndex)(uint8_t)(2* bits(machInst, 3, 0)); + uint8_t index_fp = bits(machInst, 5); + if (q) + return new VcmlaElemQ(machInst, vd, vn, vm, + index_fp); + else + return new VcmlaElemD(machInst, vd, vn, vm, + 
index_fp); + } + } + default: + return new Unknown64(machInst); + } + + } + ''' + decoder_output += ''' static StaticInstPtr @@ -1869,6 +1950,12 @@ def format ThumbNeonData() {{ ''' }}; +def format Thumb32NeonSIMD() {{ + decode_block = ''' + return decodeAdvancedSIMD(machInst); + ''' +}}; + let {{ header_output = ''' bool diff --git a/src/arch/arm/isa/formats/neon64.isa b/src/arch/arm/isa/formats/neon64.isa index 1bdc97c83..6c2b2e02c 100644 --- a/src/arch/arm/isa/formats/neon64.isa +++ b/src/arch/arm/isa/formats/neon64.isa @@ -39,6 +39,9 @@ namespace Aarch64 // AdvSIMD three same template StaticInstPtr decodeNeon3Same(ExtMachInst machInst); + // AdvSIMD three same Extra + template + StaticInstPtr decodeNeon3SameExtra(ExtMachInst machInst); // AdvSIMD three different inline StaticInstPtr decodeNeon3Diff(ExtMachInst machInst); // AdvSIMD two-reg misc @@ -500,6 +503,48 @@ namespace Aarch64 } } + template + StaticInstPtr + decodeNeon3SameExtra(ExtMachInst machInst) + { + uint8_t q = bits(machInst, 30); + uint8_t size = bits(machInst, 23, 22); + uint8_t opcode = bits(machInst, 15, 11); + + IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0); + IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5); + IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16); + + switch (opcode) { + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + if (size == 0x1) { + if (q) + return new FcmlaQX(machInst, vd, vn, vm); + else + return new FcmlaDX(machInst, vd, vn, vm); + } else + return decodeNeonUThreeFpReg( + q, size & 0x1, machInst, vd, vn, vm); + + case 0x1c: + case 0x1e: + if (size == 0x1) { + if (q) + return new FcaddQX(machInst, vd, vn, vm); + else + return new FcaddDX(machInst, vd, vn, vm); + } else + return decodeNeonUThreeFpReg( + q, size & 0x1, machInst, vd, vn, vm); + + default: + return new Unknown64(machInst); + } + } + StaticInstPtr decodeNeon3Diff(ExtMachInst machInst) { @@ -1324,7 +1369,27 @@ namespace Aarch64 if (!u && size >= 2 && sz_q != 0x2 
&& sz_L != 0x3) return decodeNeonUThreeImmFpReg( q, sz, machInst, vd, vn, vm_fp, index_fp); - else + else if (u && (size == 1 || size == 2)){ + // FCMLA by element + if (size == 0x2) { + index_fp = H; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } else { + index_fp = (H << 1) | L; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } + + } else return new Unknown64(machInst); case 0x2: if (size == 0x0 || size == 0x3) @@ -1336,7 +1401,26 @@ namespace Aarch64 return decodeNeonSThreeImmHAndWReg( q, size, machInst, vd, vn, vm, index); case 0x3: - if (u || (size == 0x0 || size == 0x3)) + if (u && (size == 1 || size == 2)){ + // FCMLA by element + if (size == 0x2) { + index_fp = H; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } else { + index_fp = (H << 1) | L; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } + } else if (u || (size == 0x0 || size == 0x3)) return new Unknown64(machInst); else return decodeNeonSThreeImmHAndWReg= 0x2 && sz_L != 0x3 && sz_q != 0x2) return decodeNeonUThreeImmFpReg( q, sz, machInst, vd, vn, vm_fp, index_fp); - else + else if (u && (size == 1 || size == 2)){ + // FCMLA by element + if (size == 0x2) { + index_fp = H; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } else { + index_fp = (H << 1) | L; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } + } else return new Unknown64(machInst); case 0x6: if (size == 0x0 || size == 0x3) @@ -1364,7 +1467,26 @@ namespace Aarch64 return 
decodeNeonSThreeImmHAndWReg( q, size, machInst, vd, vn, vm, index); case 0x7: - if (u || (size == 0x0 || size == 0x3)) + if (u && (size == 1 || size == 2)){ + // FCMLA by element + if (size == 0x2) { + index_fp = H; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } else { + index_fp = (H << 1) | L; + if (q) + return new FcmlaElemQX(machInst, vd, vn, + vm_fp, index_fp); + else + return new FcmlaElemDX(machInst, vd, vn, + vm_fp, index_fp); + } + } else if (u || (size == 0x0 || size == 0x3)) return new Unknown64(machInst); else return decodeNeonSThreeImmHAndWReg(srcElem2_1); + } else { + el1 = fplibNeg(srcElem2_2); + el3 = srcElem2_1; + } + + destElem_1 = fplibAdd(srcElem1_1, el1, fpscr); + destElem_2 = fplibAdd(srcElem1_2, el3, fpscr); + destReg.elements[2*i] = htole(destElem_1); + destReg.elements[2*i+1] = htole(destElem_2); + } + ''' + + # VCADD + threeEqualRegInst("vcadd", "VcaddD", "SimdFloatAddOp", + ("uint16_t", "uint32_t"), 2, vcaddCode, + standardFpcsr=True, complex=True) + threeEqualRegInst("vcadd", "VcaddQ", "SimdFloatAddOp", + ("uint16_t", "uint32_t"), 4, + vcaddCode, standardFpcsr=True, complex=True) + + vcmlaCode = ''' + uint8_t rot = bits(machInst, %(rot)s); + Element el1; + Element el2; + Element el3; + Element el4; + for (int i = 0; i < eCount/2; ++i) { + + Element srcElem1_1 = letoh(srcReg1.elements[2*i]); + Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]); + Element srcElem2_1 = letoh(srcReg2.elements[2*%(index)s]); + Element srcElem2_2 = letoh(srcReg2.elements[2*%(index)s+1]); + Element destElem_1 = letoh(destReg.elements[2*i]); + Element destElem_2 = letoh(destReg.elements[2*i+1]); + + switch (rot) { + case 0x0: + { + el1 = srcElem2_1; + el2 = srcElem1_1; + el3 = srcElem2_2; + el4 = srcElem1_1; + break; + } + case 0x1: + { + el1 = fplibNeg(srcElem2_2); + el2 = srcElem1_2; + el3 = srcElem2_1; + el4 = srcElem1_2; + break; + } + case 0x2: + { + el1 = 
fplibNeg(srcElem2_1); + el2 = srcElem1_1; + el3 = fplibNeg(srcElem2_2); + el4 = srcElem1_1; + break; + } + case 0x3: + { + el1 = srcElem2_2; + el2 = srcElem1_2; + el3 = fplibNeg(srcElem2_1); + el4 = srcElem1_2; + break; + } + } + + destElem_1 = fplibMulAdd(destElem_1, el2, el1, fpscr); + destElem_2 = fplibMulAdd(destElem_2, el4, el3, fpscr); + + destReg.elements[2*i] = htole(destElem_1); + destReg.elements[2*i+1] = htole(destElem_2); + } + ''' + + # VCMLA (by element) + vcmla_imm = vcmlaCode % {'rot': '21, 20', 'index': 'imm'} + threeEqualRegInst("vcmla", "VcmlaElemD", "SimdFloatMultAccOp", + ("uint16_t", "uint32_t"), 2, vcmla_imm, + readDest=True, byElem=True, standardFpcsr=True, + complex=True) + threeEqualRegInst("vcmla", "VcmlaElemQ", "SimdFloatMultAccOp", + ("uint16_t", "uint32_t"), 4, vcmla_imm, + readDest=True, byElem=True, standardFpcsr=True, + complex=True) + + # FCMLA (vector) + vcmla_vec = vcmlaCode % {'rot': '24, 23', 'index': 'i'} + threeEqualRegInst("vcmla", "VcmlaD", "SimdFloatMultAccOp", + ("uint16_t", "uint32_t"), 2, vcmla_vec, + readDest=True, standardFpcsr=True, complex=True) + threeEqualRegInst("vcmla", "VcmlaQ", "SimdFloatMultAccOp", + ("uint16_t", "uint32_t"), 4, vcmla_vec, + readDest=True, standardFpcsr=True, complex=True) + vqaddSCode = ''' destElem = srcElem1 + srcElem2; FPSCR fpscr = (FPSCR) FpscrQc; diff --git a/src/arch/arm/isa/insts/neon64.isa b/src/arch/arm/isa/insts/neon64.isa index 6db9e38dc..5186de38c 100644 --- a/src/arch/arm/isa/insts/neon64.isa +++ b/src/arch/arm/isa/insts/neon64.isa @@ -52,7 +52,7 @@ let {{ def threeEqualRegInstX(name, Name, opClass, types, rCount, op, readDest=False, pairwise=False, scalar=False, - byElem=False, decoder='Generic'): + byElem=False, decoder='Generic', complex=False): assert (not pairwise) or ((not byElem) and (not scalar)) global header_output, exec_output, decoders eWalkCode = simd64EnabledCheckCode + ''' @@ -85,7 +85,10 @@ let {{ readDestCode = '' if readDest: readDestCode = 'destElem = 
letoh(destReg.elements[i]);' - if pairwise: + + if complex: + eWalkCode += op + elif pairwise: eWalkCode += ''' for (unsigned i = 0; i < eCount; i++) { Element srcElem1 = letoh(2 * i < eCount ? @@ -975,6 +978,119 @@ let {{ True) threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode, True) + + # FCADD + fcaddCode = ''' + bool rot = bits(machInst, 12); + Element el1; + Element el3; + for (int i = 0; i < eCount/2; ++i) { + FPSCR fpscr = (FPSCR) FpscrExc; + + Element srcElem1_1 = letoh(srcReg1.elements[2*i]); + Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]); + Element srcElem2_1 = letoh(srcReg2.elements[2*i]); + Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]); + Element destElem_1; + Element destElem_2; + if (rot) { + el1 = srcElem2_2; + el3 = fplibNeg(srcElem2_1); + } else { + el1 = fplibNeg(srcElem2_2); + el3 = srcElem2_1; + } + + destElem_1 = fplibAdd(srcElem1_1, el1, fpscr); + destElem_2 = fplibAdd(srcElem1_2, el3, fpscr); + + FpscrExc = fpscr; + + destReg.elements[2*i] = htole(destElem_1); + destReg.elements[2*i+1] = htole(destElem_2); + } + ''' + + threeEqualRegInstX("fcadd", "FcaddDX", "SimdFloatAddOp", + ("uint16_t", "uint32_t"), 2, + fcaddCode, complex=True) + threeEqualRegInstX("fcadd", "FcaddQX", "SimdFloatAddOp", floatTypes, 4, + fcaddCode, complex=True) + + fcmlaCode = ''' + uint8_t rot = bits(machInst, %(rot)s); + Element el1; + Element el2; + Element el3; + Element el4; + for (int i = 0; i < eCount/2; ++i) { + FPSCR fpscr = (FPSCR) FpscrExc; + + Element srcElem1_1 = letoh(srcReg1.elements[2*i]); + Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]); + Element srcElem2_1 = letoh(srcReg2.elements[2* %(index)s]); + Element srcElem2_2 = letoh(srcReg2.elements[2* %(index)s +1]); + Element destElem_1 = letoh(destReg.elements[2*i]); + Element destElem_2 = letoh(destReg.elements[2*i+1]); + + switch (rot) { + case 0x0: + { + el1 = srcElem2_1; + el2 = srcElem1_1; + el3 = srcElem2_2; + el4 = srcElem1_1; + break; + } + case 0x1: + { + 
el1 = fplibNeg(srcElem2_2); + el2 = srcElem1_2; + el3 = srcElem2_1; + el4 = srcElem1_2; + break; + } + case 0x2: + { + el1 = fplibNeg(srcElem2_1); + el2 = srcElem1_1; + el3 = fplibNeg(srcElem2_2); + el4 = srcElem1_1; + break; + } + case 0x3: + { + el1 = srcElem2_2; + el2 = srcElem1_2; + el3 = fplibNeg(srcElem2_1); + el4 = srcElem1_2; + break; + } + } + destElem_1 = fplibMulAdd(destElem_1, el2, el1, fpscr); + destElem_2 = fplibMulAdd(destElem_2, el4, el3, fpscr); + + FpscrExc = fpscr; + + destReg.elements[2*i] = htole(destElem_1); + destReg.elements[2*i+1] = htole(destElem_2); + } + ''' + # FCMLA (by element) + fcmla_imm = fcmlaCode % {'rot': '14, 13', 'index': 'imm'} + threeEqualRegInstX("fcmla", "FcmlaElemDX", "SimdFloatMultAccOp", + ("uint16_t", "uint32_t"), 2, fcmla_imm, True, + byElem=True, complex=True) + threeEqualRegInstX("fcmla", "FcmlaElemQX", "SimdFloatMultAccOp", + floatTypes, 4, fcmla_imm, True, byElem=True, + complex=True) + # FCMLA (vector) + fcmla_vec = fcmlaCode % {'rot': '12, 11', 'index': 'i'} + threeEqualRegInstX("fcmla", "FcmlaDX", "SimdFloatMultAccOp", + ("uint16_t", "uint32_t"), 2, fcmla_vec, True, + complex=True) + threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp", + floatTypes, 4, fcmla_vec, True, complex=True) # CLS clsCode = ''' unsigned count = 0; diff --git a/src/arch/arm/isa/templates/neon.isa b/src/arch/arm/isa/templates/neon.isa index ee38c2b0d..39e6d2222 100644 --- a/src/arch/arm/isa/templates/neon.isa +++ b/src/arch/arm/isa/templates/neon.isa @@ -215,12 +215,18 @@ def template NeonEqualRegExecute {{ const unsigned rCount = %(r_count)d; const unsigned eCount = rCount * sizeof(uint32_t) / sizeof(Element); + const unsigned eCountFull = 4 * sizeof(uint32_t) / sizeof(Element); union RegVect { uint32_t regs[rCount]; Element elements[eCount]; }; + union FullRegVect { + uint32_t regs[4]; + Element elements[eCountFull]; + }; + if (%(predicate_test)s) { %(code)s; -- 2.30.2