This patch implements the CompNum SIMD instruction for armv8.3.
These instructions are Fcadd, Fcmla (vector and element) and
Vcadd, Vcmla (vector and element).
+ isa/decoder/thumb.isa: Decoding changes for SIMD instructions in T32
+ isa/formats/fp.isa: Decoding changes for SIMD instructions in A32
+ isa/formats/uncond.isa: Decoding changes for SIMD instructions in A32
+ isa/formats/aarch64.isa: Decoding changes for SIMD instructions in A64
+ isa/formats/neon64.isa: Decoding changes for SIMD instructions in A64
+ isa/insts/neon.isa: Vcadd, Vcmla instruction implementation
+ isa/insts/neon64.isa: Fcadd, Fcmla instruction implementation
+ isa/templates/neon.isa: Modify templates for adding byElement support
Change-Id: I7f11ce88137dad077d2cad698dcaa9a79a3f317b
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/27183
Tested-by: Gem5 Cloud Project GCB service account <345032938727@cloudbuild.gserviceaccount.com>
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Giacomo Travaglini <giacomo.travaglini@arm.com>
id_isar2 = Param.UInt32(0x21232141, "Instruction Set Attribute Register 2")
id_isar3 = Param.UInt32(0x01112131, "Instruction Set Attribute Register 3")
id_isar4 = Param.UInt32(0x10010142, "Instruction Set Attribute Register 4")
- id_isar5 = Param.UInt32(0x00000000, "Instruction Set Attribute Register 5")
+ id_isar5 = Param.UInt32(0x10000000, "Instruction Set Attribute Register 5")
fpsid = Param.UInt32(0x410430a0, "Floating-point System ID Register")
id_aa64isar0_el1 = Param.UInt64(0x0000000000000000,
"AArch64 Instruction Set Attribute Register 0")
- # GPI = 0x0 | GPA = 0x1| API=0x0 | APA=0x1
- id_aa64isar1_el1 = Param.UInt64(0x0000000001000010,
+ # GPI = 0x0 | GPA = 0x1 | API = 0x0 | APA = 0x1 | FCMA = 0x1
+ id_aa64isar1_el1 = Param.UInt64(0x0000000001010010,
"AArch64 Instruction Set Attribute Register 1")
# 4K | 64K | !16K | !BigEndEL0 | !SNSMem | !BigEnd | 8b ASID | 40b PA
0x3: Thumb32LongMulMulAccAndDiv::thumb32LongMulMulAccAndDiv();
default: Thumb32DataProcReg::thumb32DataProcReg();
}
+ 0x2: Thumb32NeonSIMD::thumb32NeonSIMD();
default: decode HTOPCODE_9_8 {
0x2: decode LTOPCODE_4 {
0x0: decode LTCOPROC {
+ 0x8: Thumb32NeonSIMD::thumb32NeonSIMD();
0xa, 0xb: VfpData::vfpData();
default: WarnUnimpl::cdp(); // cdp2
}
} else {
return new Unknown64(machInst);
}
- } else if (bits(machInst, 24) ||
- bits(machInst, 21) ||
- bits(machInst, 15)) {
- return new Unknown64(machInst);
+ } else if (bits(machInst, 15) == 1) {
+ return decodeNeon3SameExtra<DecoderFeatures>(machInst);
} else if (bits(machInst, 10) == 1) {
if (bits(machInst, 23, 22))
return new Unknown64(machInst);
StaticInstPtr
decodeNeonData(ExtMachInst machInst);
+
+ StaticInstPtr
+ decodeAdvancedSIMD(ExtMachInst machInst);
'''
decoder_output = '''
return new Unknown(machInst);
}
'''
+ decoder_output += '''
+ StaticInstPtr
+ decodeAdvancedSIMD(ExtMachInst machInst)
+ {
+ // Decode the A32/T32 Advanced SIMD complex-number encodings
+ // introduced by the ARMv8.3 FCMA extension: VCADD, VCMLA (vector)
+ // and VCMLA (by element).
+ // op_code selects the group: 0 -> VCADD, 1 -> VCMLA,
+ // 2/3 -> VCMLA (by element).
+ uint8_t op_code = (bits(machInst, 25) << 1)
+ | bits(machInst, 21);
+
+ // D/N/M register numbers (with their high bits from 22/7/5).
+ // The numbers are doubled -- presumably because SIMD registers
+ // are indexed in single-precision halves here, as in the other
+ // Neon decode helpers (TODO confirm against those helpers).
+ IntRegIndex vd = (IntRegIndex)(2 * (bits(machInst, 15, 12) |
+ (bits(machInst, 22) << 4)));
+ IntRegIndex vn = (IntRegIndex)(2 * (bits(machInst, 19, 16) |
+ (bits(machInst, 7) << 4)));
+ IntRegIndex vm = (IntRegIndex)(2 * (bits(machInst, 3, 0) |
+ (bits(machInst, 5) << 4)));
+ // Q = 1: quadword (Q-register) form, Q = 0: doubleword form.
+ bool q = bits (machInst, 6);
+ switch (op_code) {
+ case 0x0:
+ {
+ // VCADD
+ // S (bit 20) selects the element size:
+ // 1 -> 32-bit, 0 -> 16-bit elements.
+ bool s = bits (machInst, 20);
+ if (s) {
+ if (q)
+ return new VcaddQ<uint32_t>(machInst, vd, vn, vm);
+ else
+ return new VcaddD<uint32_t>(machInst, vd, vn, vm);
+ } else {
+ if (q)
+ return new VcaddQ<uint16_t>(machInst, vd, vn, vm);
+ else
+ return new VcaddD<uint16_t>(machInst, vd, vn, vm);
+ }
+ }
+ case 0x1:
+ {
+ // VCMLA
+ // S (bit 20) selects the element size, as for VCADD above.
+ bool s = bits (machInst, 20);
+ if (s) {
+ if (q)
+ return new VcmlaQ<uint32_t>(machInst, vd, vn, vm);
+ else
+ return new VcmlaD<uint32_t>(machInst, vd, vn, vm);
+ } else {
+ if (q)
+ return new VcmlaQ<uint16_t>(machInst, vd, vn, vm);
+ else
+ return new VcmlaD<uint16_t>(machInst, vd, vn, vm);
+ }
+ }
+ case 0x2:
+ case 0x3:
+ {
+ // VCMLA by element
+ // Here S is bit 23; 1 -> 32-bit, 0 -> 16-bit elements.
+ bool s = bits (machInst, 23);
+ if (s) {
+ // 32-bit elements: M:Vm (decoded above) is the whole
+ // register number and the element index is always 0.
+ uint8_t index_fp = 0;
+ if (q)
+ return new VcmlaElemQ<uint32_t>(machInst, vd, vn, vm,
+ index_fp);
+ else
+ return new VcmlaElemD<uint32_t>(machInst, vd, vn, vm,
+ index_fp);
+ } else {
+ // 16-bit elements: Vm alone is the register number and
+ // M (bit 5) is the element index, so re-decode vm.
+ vm = (IntRegIndex)(uint8_t)(2* bits(machInst, 3, 0));
+ uint8_t index_fp = bits(machInst, 5);
+ if (q)
+ return new VcmlaElemQ<uint16_t>(machInst, vd, vn, vm,
+ index_fp);
+ else
+ return new VcmlaElemD<uint16_t>(machInst, vd, vn, vm,
+ index_fp);
+ }
+ }
+ default:
+ return new Unknown64(machInst);
+ }
+
+ }
+ '''
+
decoder_output += '''
static StaticInstPtr
'''
}};
+# Decode block for T32 Advanced SIMD complex-number encodings
+# (VCADD / VCMLA): defers to the shared decodeAdvancedSIMD() helper.
+def format Thumb32NeonSIMD() {{
+ decode_block = '''
+ return decodeAdvancedSIMD(machInst);
+ '''
+}};
+
let {{
header_output = '''
bool
// AdvSIMD three same
template <typename DecoderFeatures>
StaticInstPtr decodeNeon3Same(ExtMachInst machInst);
+ // AdvSIMD three same Extra
+ template <typename DecoderFeatures>
+ StaticInstPtr decodeNeon3SameExtra(ExtMachInst machInst);
// AdvSIMD three different
inline StaticInstPtr decodeNeon3Diff(ExtMachInst machInst);
// AdvSIMD two-reg misc
}
}
+ // AdvSIMD three-same-extra group (ARMv8.3 FCMA): decodes the A64
+ // vector forms of FCMLA and FCADD.
+ template <typename DecoderFeatures>
+ StaticInstPtr
+ decodeNeon3SameExtra(ExtMachInst machInst)
+ {
+ uint8_t q = bits(machInst, 30);
+ uint8_t size = bits(machInst, 23, 22);
+ uint8_t opcode = bits(machInst, 15, 11);
+
+ IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+ IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+ IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+ switch (opcode) {
+ // FCMLA (vector): the rotation lives in the low opcode bits
+ // (machInst bits 12:11 -- see the fcmla_vec instantiation), so
+ // all four rotations land here.
+ case 0x18:
+ case 0x19:
+ case 0x1a:
+ case 0x1b:
+ // size == 1 is the half-precision form, instantiated
+ // directly; otherwise the helper picks single/double from
+ // the low size bit.
+ if (size == 0x1) {
+ if (q)
+ return new FcmlaQX<uint16_t>(machInst, vd, vn, vm);
+ else
+ return new FcmlaDX<uint16_t>(machInst, vd, vn, vm);
+ } else
+ return decodeNeonUThreeFpReg<FcmlaDX, FcmlaQX>(
+ q, size & 0x1, machInst, vd, vn, vm);
+
+ // FCADD: rotation is a single bit (machInst bit 12 -- see
+ // fcaddCode), hence the two opcodes 0x1c and 0x1e.
+ case 0x1c:
+ case 0x1e:
+ if (size == 0x1) {
+ if (q)
+ return new FcaddQX<uint16_t>(machInst, vd, vn, vm);
+ else
+ return new FcaddDX<uint16_t>(machInst, vd, vn, vm);
+ } else
+ return decodeNeonUThreeFpReg<FcaddDX, FcaddQX>(
+ q, size & 0x1, machInst, vd, vn, vm);
+
+ default:
+ return new Unknown64(machInst);
+ }
+ }
+
+
StaticInstPtr
decodeNeon3Diff(ExtMachInst machInst)
{
if (!u && size >= 2 && sz_q != 0x2 && sz_L != 0x3)
return decodeNeonUThreeImmFpReg<FmlaElemDX, FmlaElemQX>(
q, sz, machInst, vd, vn, vm_fp, index_fp);
- else
+ else if (u && (size == 1 || size == 2)){
+ // FCMLA by element
+ if (size == 0x2) {
+ index_fp = H;
+ if (q)
+ return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ } else {
+ index_fp = (H << 1) | L;
+ if (q)
+ return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ }
+
+ } else
return new Unknown64(machInst);
case 0x2:
if (size == 0x0 || size == 0x3)
return decodeNeonSThreeImmHAndWReg<SmlalElemX, SmlalElem2X>(
q, size, machInst, vd, vn, vm, index);
case 0x3:
- if (u || (size == 0x0 || size == 0x3))
+ if (u && (size == 1 || size == 2)){
+ // FCMLA by element
+ if (size == 0x2) {
+ index_fp = H;
+ if (q)
+ return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ } else {
+ index_fp = (H << 1) | L;
+ if (q)
+ return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ }
+ } else if (u || (size == 0x0 || size == 0x3))
return new Unknown64(machInst);
else
return decodeNeonSThreeImmHAndWReg<SqdmlalElemX,
if (!u && size >= 0x2 && sz_L != 0x3 && sz_q != 0x2)
return decodeNeonUThreeImmFpReg<FmlsElemDX, FmlsElemQX>(
q, sz, machInst, vd, vn, vm_fp, index_fp);
- else
+ else if (u && (size == 1 || size == 2)){
+ // FCMLA by element
+ if (size == 0x2) {
+ index_fp = H;
+ if (q)
+ return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ } else {
+ index_fp = (H << 1) | L;
+ if (q)
+ return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ }
+ } else
return new Unknown64(machInst);
case 0x6:
if (size == 0x0 || size == 0x3)
return decodeNeonSThreeImmHAndWReg<SmlslElemX, SmlslElem2X>(
q, size, machInst, vd, vn, vm, index);
case 0x7:
- if (u || (size == 0x0 || size == 0x3))
+ if (u && (size == 1 || size == 2)){
+ // FCMLA by element
+ if (size == 0x2) {
+ index_fp = H;
+ if (q)
+ return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ } else {
+ index_fp = (H << 1) | L;
+ if (q)
+ return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ else
+ return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+ vm_fp, index_fp);
+ }
+ } else if (u || (size == 0x0 || size == 0x3))
return new Unknown64(machInst);
else
return decodeNeonSThreeImmHAndWReg<SqdmlslElemX,
return new BlxImm(machInst, imm, COND_UC);
}
case 0x2:
- if (bits(op1, 4, 0) != 0) {
+ if (bits(machInst, 31, 25) == 0x7e){
+ return decodeAdvancedSIMD(machInst);
+ } else if (bits(op1, 4, 0) != 0) {
if (CPNUM == 0xa || CPNUM == 0xb) {
return decodeExtensionRegLoadStore(machInst);
}
}
break;
case 0x3:
- if (bits(op1, 4) == 0) {
+ if (bits(machInst, 31, 24) == 0xfe) {
+ return decodeAdvancedSIMD(machInst);
+ } else if (bits(op1, 4) == 0) {
if (CPNUM == 0xa || CPNUM == 0xb) {
return decodeShortFpTransfer(machInst);
} else if (CPNUM == 0xe) {
allTypes = unsignedTypes + signedTypes
def threeEqualRegInst(name, Name, opClass, types, rCount, op,
- readDest=False, pairwise=False,
- standardFpcsr=False):
+ readDest=False, pairwise=False, byElem=False,
+ standardFpcsr=False, complex=False):
global header_output, exec_output
eWalkCode = simdEnabledCheckCode + '''
- RegVect srcReg1, srcReg2, destReg;
- '''
+ RegVect srcReg1, destReg;
+ '''
+ if byElem:
+ # 2nd register operand has to be read fully
+ eWalkCode += '''
+ FullRegVect srcReg2;
+ '''
+ else:
+ eWalkCode += '''
+ RegVect srcReg2;
+ '''
for reg in range(rCount):
eWalkCode += '''
srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
eWalkCode += '''
destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
''' % { "reg" : reg }
+ if byElem:
+ # 2nd operand has to be read fully
+ for reg in range(rCount, 4):
+ eWalkCode += '''
+ srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
+ ''' % { "reg" : reg }
+
readDestCode = ''
if standardFpcsr:
eWalkCode += '''
'''
if readDest:
readDestCode = 'destElem = letoh(destReg.elements[i]);'
- if pairwise:
+
+ if complex:
+ eWalkCode += op
+ elif pairwise:
eWalkCode += '''
for (unsigned i = 0; i < eCount; i++) {
Element srcElem1 = letoh(2 * i < eCount ?
FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
''' % { "reg" : reg }
iop = InstObjParams(name, Name,
- "RegRegRegOp",
- { "code": eWalkCode,
- "r_count": rCount,
- "predicate_test": predicateTest,
- "op_class": opClass }, [])
- header_output += NeonRegRegRegOpDeclare.subst(iop)
+ "RegRegRegImmOp" if byElem else "RegRegRegOp",
+ { "code": eWalkCode,
+ "r_count": rCount,
+ "predicate_test": predicateTest,
+ "op_class": opClass }, [])
+ if byElem:
+ header_output += NeonRegRegRegImmOpDeclare.subst(iop)
+ else:
+ header_output += NeonRegRegRegOpDeclare.subst(iop)
exec_output += NeonEqualRegExecute.subst(iop)
for type in types:
substDict = { "targs" : type,
'''
threeRegNarrowInst("vrsubhn", "Vrsubhn", "SimdAddOp", smallTypes, vrsubhnCode)
+ # VCADD: complex add with rotation. Elements are processed in
+ # (even, odd) pairs; rot (bit 24) selects which half of the second
+ # operand is negated before the add.
+ vcaddCode = '''
+ bool rot = bits(machInst, 24);
+ Element el1;
+ Element el3;
+
+ for (int i = 0; i < eCount/2; ++i) {
+ Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+ Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+ Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
+ Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
+ Element destElem_1;
+ Element destElem_2;
+ // rot picks the rotated operand pair: the two variants swap
+ // which of the second operand's halves is negated.
+ if (rot) {
+ el1 = srcElem2_2;
+ el3 = fplibNeg<Element>(srcElem2_1);
+ } else {
+ el1 = fplibNeg<Element>(srcElem2_2);
+ el3 = srcElem2_1;
+ }
+
+ destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
+ destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);
+ destReg.elements[2*i] = htole(destElem_1);
+ destReg.elements[2*i+1] = htole(destElem_2);
+ }
+ '''
+
+ # VCADD
+ threeEqualRegInst("vcadd", "VcaddD", "SimdFloatAddOp",
+ ("uint16_t", "uint32_t"), 2, vcaddCode,
+ standardFpcsr=True, complex=True)
+ threeEqualRegInst("vcadd", "VcaddQ", "SimdFloatAddOp",
+ ("uint16_t", "uint32_t"), 4,
+ vcaddCode, standardFpcsr=True, complex=True)
+
+
+ # VCMLA: fused complex multiply-accumulate. The code template is
+ # parameterised on the position of the rotation field (%(rot)s) and
+ # on the second-operand element index (%(index)s: 'i' for the
+ # vector form, 'imm' for the by-element form).
+ vcmlaCode = '''
+ uint8_t rot = bits(machInst, %(rot)s);
+ Element el1;
+ Element el2;
+ Element el3;
+ Element el4;
+ for (int i = 0; i < eCount/2; ++i) {
+
+ Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+ Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+ Element srcElem2_1 = letoh(srcReg2.elements[2*%(index)s]);
+ Element srcElem2_2 = letoh(srcReg2.elements[2*%(index)s+1]);
+ Element destElem_1 = letoh(destReg.elements[2*i]);
+ Element destElem_2 = letoh(destReg.elements[2*i+1]);
+
+ // rot encodes the 0/90/180/270-degree variant: it selects which
+ // operand halves feed the two multiplies (el2*el1 and el4*el3)
+ // and which of them are negated.
+ switch (rot) {
+ case 0x0:
+ {
+ el1 = srcElem2_1;
+ el2 = srcElem1_1;
+ el3 = srcElem2_2;
+ el4 = srcElem1_1;
+ break;
+ }
+ case 0x1:
+ {
+ el1 = fplibNeg<Element>(srcElem2_2);
+ el2 = srcElem1_2;
+ el3 = srcElem2_1;
+ el4 = srcElem1_2;
+ break;
+ }
+ case 0x2:
+ {
+ el1 = fplibNeg<Element>(srcElem2_1);
+ el2 = srcElem1_1;
+ el3 = fplibNeg<Element>(srcElem2_2);
+ el4 = srcElem1_1;
+ break;
+ }
+ case 0x3:
+ {
+ el1 = srcElem2_2;
+ el2 = srcElem1_2;
+ el3 = fplibNeg<Element>(srcElem2_1);
+ el4 = srcElem1_2;
+ break;
+ }
+ }
+
+ destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
+ destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);
+
+ destReg.elements[2*i] = htole(destElem_1);
+ destReg.elements[2*i+1] = htole(destElem_2);
+ }
+ '''
+
+ # VCMLA (by element)
+ vcmla_imm = vcmlaCode % {'rot': '21, 20', 'index': 'imm'}
+ threeEqualRegInst("vcmla", "VcmlaElemD", "SimdFloatMultAccOp",
+ ("uint16_t", "uint32_t"), 2, vcmla_imm,
+ readDest=True, byElem=True, standardFpcsr=True,
+ complex=True)
+ threeEqualRegInst("vcmla", "VcmlaElemQ", "SimdFloatMultAccOp",
+ ("uint16_t", "uint32_t"), 4, vcmla_imm,
+ readDest=True, byElem=True, standardFpcsr=True,
+ complex=True)
+
+ # VCMLA (vector)
+ vcmla_vec = vcmlaCode % {'rot': '24, 23', 'index': 'i'}
+ threeEqualRegInst("vcmla", "VcmlaD", "SimdFloatMultAccOp",
+ ("uint16_t", "uint32_t"), 2, vcmla_vec,
+ readDest=True, standardFpcsr=True, complex=True)
+ threeEqualRegInst("vcmla", "VcmlaQ", "SimdFloatMultAccOp",
+ ("uint16_t", "uint32_t"), 4, vcmla_vec,
+ readDest=True, standardFpcsr=True, complex=True)
+
+
vqaddSCode = '''
destElem = srcElem1 + srcElem2;
FPSCR fpscr = (FPSCR) FpscrQc;
def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
readDest=False, pairwise=False, scalar=False,
- byElem=False, decoder='Generic'):
+ byElem=False, decoder='Generic', complex=False):
assert (not pairwise) or ((not byElem) and (not scalar))
global header_output, exec_output, decoders
eWalkCode = simd64EnabledCheckCode + '''
readDestCode = ''
if readDest:
readDestCode = 'destElem = letoh(destReg.elements[i]);'
- if pairwise:
+
+ if complex:
+ eWalkCode += op
+ elif pairwise:
eWalkCode += '''
for (unsigned i = 0; i < eCount; i++) {
Element srcElem1 = letoh(2 * i < eCount ?
True)
threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
True)
+
+ # FCADD
+ # A64 complex add with rotation: same pairwise negate-and-add shape
+ # as the A32 vcaddCode, but the rotation bit is machInst bit 12 and
+ # exception flags go through FpscrExc rather than the A32
+ # standard-FPSCR path.
+ fcaddCode = '''
+ bool rot = bits(machInst, 12);
+ Element el1;
+ Element el3;
+ for (int i = 0; i < eCount/2; ++i) {
+ FPSCR fpscr = (FPSCR) FpscrExc;
+
+ Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+ Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+ Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
+ Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
+ Element destElem_1;
+ Element destElem_2;
+ // rot selects which half of the second operand is negated.
+ if (rot) {
+ el1 = srcElem2_2;
+ el3 = fplibNeg<Element>(srcElem2_1);
+ } else {
+ el1 = fplibNeg<Element>(srcElem2_2);
+ el3 = srcElem2_1;
+ }
+
+ destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
+ destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);
+
+ FpscrExc = fpscr;
+
+ destReg.elements[2*i] = htole(destElem_1);
+ destReg.elements[2*i+1] = htole(destElem_2);
+ }
+ '''
+
+ threeEqualRegInstX("fcadd", "FcaddDX", "SimdFloatAddOp",
+ ("uint16_t", "uint32_t"), 2,
+ fcaddCode, complex=True)
+ threeEqualRegInstX("fcadd", "FcaddQX", "SimdFloatAddOp", floatTypes, 4,
+ fcaddCode, complex=True)
+
+ # FCMLA: A64 fused complex multiply-accumulate. Template is
+ # parameterised on the rotation-field position (%(rot)s) and on the
+ # second-operand element index (%(index)s: 'i' for vector form,
+ # 'imm' for by-element form).
+ fcmlaCode = '''
+ uint8_t rot = bits(machInst, %(rot)s);
+ Element el1;
+ Element el2;
+ Element el3;
+ Element el4;
+ for (int i = 0; i < eCount/2; ++i) {
+ FPSCR fpscr = (FPSCR) FpscrExc;
+
+ Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+ Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+ Element srcElem2_1 = letoh(srcReg2.elements[2* %(index)s]);
+ Element srcElem2_2 = letoh(srcReg2.elements[2* %(index)s +1]);
+ Element destElem_1 = letoh(destReg.elements[2*i]);
+ Element destElem_2 = letoh(destReg.elements[2*i+1]);
+
+ // rot encodes the 0/90/180/270-degree variant: it selects which
+ // operand halves feed the two multiplies and which are negated.
+ switch (rot) {
+ case 0x0:
+ {
+ el1 = srcElem2_1;
+ el2 = srcElem1_1;
+ el3 = srcElem2_2;
+ el4 = srcElem1_1;
+ break;
+ }
+ case 0x1:
+ {
+ el1 = fplibNeg<Element>(srcElem2_2);
+ el2 = srcElem1_2;
+ el3 = srcElem2_1;
+ el4 = srcElem1_2;
+ break;
+ }
+ case 0x2:
+ {
+ el1 = fplibNeg<Element>(srcElem2_1);
+ el2 = srcElem1_1;
+ el3 = fplibNeg<Element>(srcElem2_2);
+ el4 = srcElem1_1;
+ break;
+ }
+ case 0x3:
+ {
+ el1 = srcElem2_2;
+ el2 = srcElem1_2;
+ el3 = fplibNeg<Element>(srcElem2_1);
+ el4 = srcElem1_2;
+ break;
+ }
+ }
+ destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
+ destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);
+
+ FpscrExc = fpscr;
+
+ destReg.elements[2*i] = htole(destElem_1);
+ destReg.elements[2*i+1] = htole(destElem_2);
+ }
+ '''
+ # FCMLA (by element)
+ fcmla_imm = fcmlaCode % {'rot': '14, 13', 'index': 'imm'}
+ threeEqualRegInstX("fcmla", "FcmlaElemDX", "SimdFloatMultAccOp",
+ ("uint16_t", "uint32_t"), 2, fcmla_imm, True,
+ byElem=True, complex=True)
+ threeEqualRegInstX("fcmla", "FcmlaElemQX", "SimdFloatMultAccOp",
+ floatTypes, 4, fcmla_imm, True, byElem=True,
+ complex=True)
+ # FCMLA (vector)
+ fcmla_vec = fcmlaCode % {'rot': '12, 11', 'index': 'i'}
+ threeEqualRegInstX("fcmla", "FcmlaDX", "SimdFloatMultAccOp",
+ ("uint16_t", "uint32_t"), 2, fcmla_vec, True,
+ complex=True)
+ threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp",
+ floatTypes, 4, fcmla_vec, True, complex=True)
# CLS
clsCode = '''
unsigned count = 0;
const unsigned rCount = %(r_count)d;
const unsigned eCount = rCount * sizeof(uint32_t) / sizeof(Element);
+ // Element count of a full quad (4 x 32-bit) register view.
+ const unsigned eCountFull = 4 * sizeof(uint32_t) / sizeof(Element);
union RegVect {
uint32_t regs[rCount];
Element elements[eCount];
};
+ // Full-width view of a register: used when a byElem (indexed)
+ // second operand must be read in full regardless of rCount.
+ union FullRegVect {
+ uint32_t regs[4];
+ Element elements[eCountFull];
+ };
+
if (%(predicate_test)s)
{
%(code)s;