3 // Copyright (c) 2012-2013 ARM Limited
6 // The license below extends only to copyright in the software and shall
7 // not be construed as granting a license to any other intellectual
8 // property including but not limited to intellectual property relating
9 // to a hardware implementation of the functionality of the software
10 // licensed hereunder. You may use the software subject to the license
11 // terms below provided that you ensure that this notice is replicated
12 // unmodified and in its entirety in all distributions of the software,
13 // modified or unmodified, in source code or in binary form.
15 // Redistribution and use in source and binary forms, with or without
16 // modification, are permitted provided that the following conditions are
17 // met: redistributions of source code must retain the above copyright
18 // notice, this list of conditions and the following disclaimer;
19 // redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution;
22 // neither the name of the copyright holders nor the names of its
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
26 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 // Authors: Giacomo Gabrielli
# Element types used when instantiating floating-point instructions.
# FP operations always work with unsigned representations, so the element
# types listed here are unsigned integer types.
smallFloatTypes = ("uint32_t",)
floatTypes = smallFloatTypes + ("uint64_t",)
50 def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
51 readDest=False, pairwise=False, scalar=False,
53 assert (not pairwise) or ((not byElem) and (not scalar))
54 global header_output, exec_output
55 eWalkCode = simd64EnabledCheckCode + '''
56 RegVect srcReg1, destReg;
59 # 2nd register operand has to be read fully
67 for reg in range(rCount):
69 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
70 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
74 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
77 # 2nd operand has to be read fully
78 for reg in range(rCount, 4):
80 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
84 readDestCode = 'destElem = gtoh(destReg.elements[i]);'
87 for (unsigned i = 0; i < eCount; i++) {
88 Element srcElem1 = gtoh(2 * i < eCount ?
89 srcReg1.elements[2 * i] :
90 srcReg2.elements[2 * i - eCount]);
91 Element srcElem2 = gtoh(2 * i < eCount ?
92 srcReg1.elements[2 * i + 1] :
93 srcReg2.elements[2 * i + 1 - eCount]);
97 destReg.elements[i] = htog(destElem);
99 ''' % { "op" : op, "readDest" : readDestCode }
103 destReg.elements[i] = 0;
108 for (unsigned i = 0; i < eCount; i++) {
110 Element srcElem1 = gtoh(srcReg1.elements[i]);
111 Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
115 destReg.elements[i] = htog(destElem);
117 ''' % { "op" : op, "readDest" : readDestCode,
118 "scalarCheck" : scalarCheck if scalar else "",
119 "src2Index" : "imm" if byElem else "i" }
120 for reg in range(rCount):
122 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
123 ''' % { "reg" : reg }
124 if rCount < 4: # zero upper half
125 for reg in range(rCount, 4):
127 AA64FpDestP%(reg)d_uw = 0;
128 ''' % { "reg" : reg }
129 iop = InstObjParams(name, Name,
130 "DataX2RegImmOp" if byElem else "DataX2RegOp",
133 "op_class": opClass }, [])
135 header_output += NeonX2RegImmOpDeclare.subst(iop)
137 header_output += NeonX2RegOpDeclare.subst(iop)
138 exec_output += NeonXEqualRegOpExecute.subst(iop)
140 substDict = { "targs" : type,
141 "class_name" : Name }
142 exec_output += NeonXExecDeclare.subst(substDict)
144 def threeUnequalRegInstX(name, Name, opClass, types, op,
145 bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
146 byElem=False, hi=False):
147 assert not (scalar and hi)
148 global header_output, exec_output
149 src1Cnt = src2Cnt = destCnt = 2
150 src1Prefix = src2Prefix = destPrefix = ''
162 eWalkCode = simd64EnabledCheckCode + '''
166 ''' % (src1Prefix, src2Prefix, destPrefix)
168 if hi and not bigSrc1: # long/widening operations
170 for reg in range(src1Cnt):
172 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw);
173 ''' % { "reg" : reg, "srcReg1" : srcReg1 }
176 if (not byElem) and (hi and not bigSrc2): # long/widening operations
178 for reg in range(src2Cnt):
180 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw);
181 ''' % { "reg" : reg, "srcReg2" : srcReg2 }
184 # 2nd operand has to be read fully
185 for reg in range(src2Cnt, 4):
187 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
188 ''' % { "reg" : reg }
190 for reg in range(destCnt):
192 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
193 ''' % { "reg" : reg }
196 readDestCode = 'destElem = gtoh(destReg.elements[i]);'
199 destReg.elements[i] = 0;
204 for (unsigned i = 0; i < eCount; i++) {
206 %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]);
207 %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]);
208 %(destPrefix)sElement destElem;
211 destReg.elements[i] = htog(destElem);
213 ''' % { "op" : op, "readDest" : readDestCode,
214 "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
215 "destPrefix" : destPrefix,
216 "scalarCheck" : scalarCheck if scalar else "",
217 "src2Index" : "imm" if byElem else "i" }
219 if hi and not bigDest:
220 # narrowing operations
222 for reg in range(destCnt):
224 AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
225 ''' % { "reg" : reg, "destReg": destReg }
227 if destCnt < 4 and not hi: # zero upper half
228 for reg in range(destCnt, 4):
230 AA64FpDestP%(reg)d_uw = 0;
231 ''' % { "reg" : reg }
232 iop = InstObjParams(name, Name,
233 "DataX2RegImmOp" if byElem else "DataX2RegOp",
236 "op_class": opClass }, [])
238 header_output += NeonX2RegImmOpDeclare.subst(iop)
240 header_output += NeonX2RegOpDeclare.subst(iop)
241 exec_output += NeonXUnequalRegOpExecute.subst(iop)
243 substDict = { "targs" : type,
244 "class_name" : Name }
245 exec_output += NeonXExecDeclare.subst(substDict)
247 def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
248 scalar=False, byElem=False, hi=False):
250 threeUnequalRegInstX(name, Name, opClass, types, op,
251 True, True, False, readDest, scalar, byElem, hi)
def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
                      scalar=False, byElem=False, hi=False):
    """Register a three-register instruction via threeUnequalRegInstX().

    Thin wrapper that fixes the operand-size flags to bigSrc1=False,
    bigSrc2=False and bigDest=True; all other arguments are forwarded
    unchanged.
    """
    threeUnequalRegInstX(name, Name, opClass, types, op,
                         bigSrc1=False, bigSrc2=False, bigDest=True,
                         readDest=readDest, scalar=scalar, byElem=byElem,
                         hi=hi)
258 def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
259 scalar=False, byElem=False, hi=False):
261 threeUnequalRegInstX(name, Name, opClass, types, op,
262 True, False, True, readDest, scalar, byElem, hi)
264 def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
265 readDest=False, scalar=False, byElem=False,
266 hasImm=False, isDup=False):
267 global header_output, exec_output
268 assert (not isDup) or byElem
272 eWalkCode = simd64EnabledCheckCode + '''
277 eWalkCode = simd64EnabledCheckCode + '''
278 RegVect srcReg1, destReg;
280 for reg in range(4 if isDup else rCount):
282 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
283 ''' % { "reg" : reg }
286 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
287 ''' % { "reg" : reg }
290 readDestCode = 'destElem = gtoh(destReg.elements[i]);'
293 destReg.elements[i] = 0;
298 for (unsigned i = 0; i < eCount; i++) {
301 Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]);
305 destReg.elements[j] = htog(destElem);
307 ''' % { "op" : op, "readDest" : readDestCode,
308 "scalarCheck" : scalarCheck if scalar else "",
309 "src1Index" : "imm" if byElem else "i" }
310 for reg in range(rCount):
312 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
313 ''' % { "reg" : reg }
314 if rCount < 4: # zero upper half
315 for reg in range(rCount, 4):
317 AA64FpDestP%(reg)d_uw = 0;
318 ''' % { "reg" : reg }
319 iop = InstObjParams(name, Name,
320 "DataX1RegImmOp" if hasImm else "DataX1RegOp",
323 "op_class": opClass }, [])
325 header_output += NeonX1RegImmOpDeclare.subst(iop)
327 header_output += NeonX1RegOpDeclare.subst(iop)
328 exec_output += NeonXEqualRegOpExecute.subst(iop)
330 substDict = { "targs" : type,
331 "class_name" : Name }
332 exec_output += NeonXExecDeclare.subst(substDict)
334 def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
335 hi=False, hasImm=False):
336 global header_output, exec_output
337 eWalkCode = simd64EnabledCheckCode + '''
341 destReg = 0 if not hi else 2
344 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(destReg)d_uw);
345 ''' % { "reg" : reg, "destReg": destReg }
347 destReg = 0 if not hi else 2
351 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
352 ''' % { "reg" : reg }
# The generated C++ must seed the scalar destElem (not the destReg vector)
# with the destination's current element: the element loop later writes it
# back with "destReg.elements[i] = htog(destElem);", and the other generators
# in this file use this same snippet.
readDestCode = 'destElem = gtoh(destReg.elements[i]);'
358 for (unsigned i = 0; i < eCount; i++) {
359 Element srcElem1 = gtoh(srcReg1.elements[i]);
363 destReg.elements[i] = htog(destElem);
365 ''' % { "op" : op, "readDest" : readDestCode }
368 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
369 ''' % { "reg" : reg }
370 iop = InstObjParams(name, Name,
371 "DataX1RegImmOp" if hasImm else "DataX1RegOp",
374 "op_class": opClass }, [])
376 header_output += NeonX1RegImmOpDeclare.subst(iop)
378 header_output += NeonX1RegOpDeclare.subst(iop)
379 exec_output += NeonXUnequalRegOpExecute.subst(iop)
381 substDict = { "targs" : type,
382 "class_name" : Name }
383 exec_output += NeonXExecDeclare.subst(substDict)
385 def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
386 scalar=False, hi=False, hasImm=False):
387 global header_output, exec_output
388 eWalkCode = simd64EnabledCheckCode + '''
394 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
395 ''' % { "reg" : reg }
399 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
400 ''' % { "reg" : reg }
403 destReg.elements[0] = 0;
404 ''' % { "reg" : reg }
407 readDestCode = 'destElem = gtoh(destReg.elements[i]);'
410 destReg.elements[i] = 0;
415 for (unsigned i = 0; i < eCount; i++) {
417 BigElement srcElem1 = gtoh(srcReg1.elements[i]);
421 destReg.elements[i] = htog(destElem);
423 ''' % { "op" : op, "readDest" : readDestCode,
424 "scalarCheck" : scalarCheck if scalar else "" }
425 destReg = 0 if not hi else 2
428 AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]);
429 ''' % { "reg" : reg, "destReg": destReg }
432 for reg in range(2, 4): # zero upper half
434 AA64FpDestP%(reg)d_uw = 0;
435 ''' % { "reg" : reg }
436 iop = InstObjParams(name, Name,
437 "DataX1RegImmOp" if hasImm else "DataX1RegOp",
440 "op_class": opClass }, [])
442 header_output += NeonX1RegImmOpDeclare.subst(iop)
444 header_output += NeonX1RegOpDeclare.subst(iop)
445 exec_output += NeonXUnequalRegOpExecute.subst(iop)
447 substDict = { "targs" : type,
448 "class_name" : Name }
449 exec_output += NeonXExecDeclare.subst(substDict)
451 def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
452 global header_output, exec_output
453 eWalkCode = simd64EnabledCheckCode + '''
454 RegVect srcReg1, srcReg2, destReg;
456 for reg in range(rCount):
458 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
459 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
460 ''' % { "reg" : reg }
462 for reg in range(rCount):
464 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
465 ''' % { "reg" : reg }
467 for reg in range(rCount, 4):
469 AA64FpDestP%(reg)d_uw = 0;
470 ''' % { "reg" : reg }
471 iop = InstObjParams(name, Name,
475 "op_class": opClass }, [])
476 header_output += NeonX2RegOpDeclare.subst(iop)
477 exec_output += NeonXEqualRegOpExecute.subst(iop)
479 substDict = { "targs" : type,
480 "class_name" : Name }
481 exec_output += NeonXExecDeclare.subst(substDict)
483 def insFromVecElemInstX(name, Name, opClass, types, rCount):
484 global header_output, exec_output
485 eWalkCode = simd64EnabledCheckCode + '''
491 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
492 ''' % { "reg" : reg }
493 for reg in range(rCount):
495 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
496 ''' % { "reg" : reg }
498 Element srcElem1 = gtoh(srcReg1.elements[imm2]);
499 Element destElem = srcElem1;
500 destReg.elements[imm1] = htog(destElem);
502 for reg in range(rCount):
504 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
505 ''' % { "reg" : reg }
506 iop = InstObjParams(name, Name,
510 "op_class": opClass }, [])
511 header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
512 exec_output += NeonXEqualRegOpExecute.subst(iop)
514 substDict = { "targs" : type,
515 "class_name" : Name }
516 exec_output += NeonXExecDeclare.subst(substDict)
518 def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
519 global header_output, exec_output
520 eWalkCode = simd64EnabledCheckCode + '''
521 RegVect srcReg1, destReg;
523 for reg in range(rCount):
525 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
526 ''' % { "reg" : reg }
528 Element srcElem1 = gtoh(srcReg1.elements[0]);
529 Element srcElem2 = gtoh(srcReg1.elements[1]);
532 destReg.elements[0] = htog(destElem);
535 for reg in range(destCnt):
537 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
538 ''' % { "reg" : reg }
539 for reg in range(destCnt, 4): # zero upper half
541 AA64FpDestP%(reg)d_uw = 0;
542 ''' % { "reg" : reg }
543 iop = InstObjParams(name, Name,
547 "op_class": opClass }, [])
548 header_output += NeonX1RegOpDeclare.subst(iop)
549 exec_output += NeonXEqualRegOpExecute.subst(iop)
551 substDict = { "targs" : type,
552 "class_name" : Name }
553 exec_output += NeonXExecDeclare.subst(substDict)
555 def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
556 doubleDest=False, long=False):
557 global header_output, exec_output
558 destPrefix = "Big" if long else ""
559 eWalkCode = simd64EnabledCheckCode + '''
563 for reg in range(rCount):
565 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
566 ''' % { "reg" : reg }
569 %(destPrefix)sElement destElem = 0;
570 for (unsigned i = 0; i < eCount; i++) {
571 Element srcElem1 = gtoh(srcReg1.elements[i]);
578 destReg.elements[0] = htog(destElem);
579 ''' % { "op" : op, "destPrefix" : destPrefix }
580 destCnt = 2 if doubleDest else 1
581 for reg in range(destCnt):
583 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
584 ''' % { "reg" : reg }
585 for reg in range(destCnt, 4): # zero upper half
587 AA64FpDestP%(reg)d_uw = 0;
588 ''' % { "reg" : reg }
589 iop = InstObjParams(name, Name,
593 "op_class": opClass }, [])
594 header_output += NeonX1RegOpDeclare.subst(iop)
596 exec_output += NeonXUnequalRegOpExecute.subst(iop)
598 exec_output += NeonXEqualRegOpExecute.subst(iop)
600 substDict = { "targs" : type,
601 "class_name" : Name }
602 exec_output += NeonXExecDeclare.subst(substDict)
604 def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
606 global header_output, exec_output
607 eWalkCode = simd64EnabledCheckCode + '''
611 for reg in range(rCount):
613 srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
614 ''' % { "reg" : reg }
617 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
618 ''' % { "reg" : reg }
621 readDestCode = 'destElem = gtoh(destReg.elements[i]);'
623 for (unsigned i = 0; i < eCount / 2; i++) {
624 Element srcElem1 = gtoh(srcRegs.elements[2 * i]);
625 Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]);
629 destReg.elements[i] = htog(destElem);
631 ''' % { "op" : op, "readDest" : readDestCode }
632 for reg in range(rCount):
634 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
635 ''' % { "reg" : reg }
636 if rCount < 4: # zero upper half
637 for reg in range(rCount, 4):
639 AA64FpDestP%(reg)d_uw = 0;
640 ''' % { "reg" : reg }
641 iop = InstObjParams(name, Name,
645 "op_class": opClass }, [])
646 header_output += NeonX1RegOpDeclare.subst(iop)
647 exec_output += NeonXUnequalRegOpExecute.subst(iop)
649 substDict = { "targs" : type,
650 "class_name" : Name }
651 exec_output += NeonXExecDeclare.subst(substDict)
653 def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
654 global header_output, exec_output
655 eWalkCode = simd64EnabledCheckCode + '''
659 for reg in range(rCount):
661 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
662 ''' % { "reg" : reg }
665 readDestCode = 'destElem = gtoh(destReg.elements[i]);'
667 for (unsigned i = 0; i < eCount; i++) {
671 destReg.elements[i] = htog(destElem);
673 ''' % { "op" : op, "readDest" : readDestCode }
674 for reg in range(rCount):
676 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
677 ''' % { "reg" : reg }
678 if rCount < 4: # zero upper half
679 for reg in range(rCount, 4):
681 AA64FpDestP%(reg)d_uw = 0;
682 ''' % { "reg" : reg }
683 iop = InstObjParams(name, Name,
687 "op_class": opClass }, [])
688 header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
689 exec_output += NeonXEqualRegOpExecute.subst(iop)
691 substDict = { "targs" : type,
692 "class_name" : Name }
693 exec_output += NeonXExecDeclare.subst(substDict)
695 def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
696 global header_output, exec_output
697 eWalkCode = simd64EnabledCheckCode + '''
699 for (unsigned i = 0; i < eCount; i++) {
700 destReg.elements[i] = htog((Element) %sOp1);
703 for reg in range(rCount):
705 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
706 ''' % { "reg" : reg }
707 if rCount < 4: # zero upper half
708 for reg in range(rCount, 4):
710 AA64FpDestP%(reg)d_uw = 0;
711 ''' % { "reg" : reg }
712 iop = InstObjParams(name, Name,
716 "op_class": opClass }, [])
717 header_output += NeonX1RegOpDeclare.subst(iop)
718 exec_output += NeonXEqualRegOpExecute.subst(iop)
720 substDict = { "targs" : type,
721 "class_name" : Name }
722 exec_output += NeonXExecDeclare.subst(substDict)
724 def extInstX(name, Name, opClass, types, rCount, op):
725 global header_output, exec_output
726 eWalkCode = simd64EnabledCheckCode + '''
727 RegVect srcReg1, srcReg2, destReg;
729 for reg in range(rCount):
731 srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
732 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
733 ''' % { "reg" : reg }
735 for reg in range(rCount):
737 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
738 ''' % { "reg" : reg }
739 if rCount < 4: # zero upper half
740 for reg in range(rCount, 4):
742 AA64FpDestP%(reg)d_uw = 0;
743 ''' % { "reg" : reg }
744 iop = InstObjParams(name, Name,
748 "op_class": opClass }, [])
749 header_output += NeonX2RegImmOpDeclare.subst(iop)
750 exec_output += NeonXEqualRegOpExecute.subst(iop)
752 substDict = { "targs" : type,
753 "class_name" : Name }
754 exec_output += NeonXExecDeclare.subst(substDict)
756 def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
757 global header_output, exec_output
758 eWalkCode = simd64EnabledCheckCode + '''
761 for reg in range(rCount):
763 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
764 ''' % { "reg" : reg }
766 destReg.elements[imm] = htog((Element) %sOp1);
768 for reg in range(rCount):
770 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
771 ''' % { "reg" : reg }
772 iop = InstObjParams(name, Name,
776 "op_class": opClass }, [])
777 header_output += NeonX1RegImmOpDeclare.subst(iop)
778 exec_output += NeonXEqualRegOpExecute.subst(iop)
780 substDict = { "targs" : type,
781 "class_name" : Name }
782 exec_output += NeonXExecDeclare.subst(substDict)
784 def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
786 global header_output, exec_output
787 eWalkCode = simd64EnabledCheckCode + '''
792 srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw);
793 ''' % { "reg" : reg }
796 %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
800 %sDest = srcReg.elements[imm];
802 iop = InstObjParams(name, Name,
806 "op_class": opClass }, [])
807 header_output += NeonX1RegImmOpDeclare.subst(iop)
808 exec_output += NeonXEqualRegOpExecute.subst(iop)
810 substDict = { "targs" : type,
811 "class_name" : Name }
812 exec_output += NeonXExecDeclare.subst(substDict)
814 def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
815 global header_output, decoder_output, exec_output
816 code = simd64EnabledCheckCode + '''
820 FloatRegBits regs[16];
825 uint8_t bytes[%(rCount)d * 4];
826 FloatRegBits regs[%(rCount)d];
829 const unsigned length = %(length)d;
830 const bool isTbl = %(isTbl)s;
831 ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
832 for reg in range(rCount):
834 srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw);
835 destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
836 ''' % { "reg" : reg }
837 for reg in range(16):
840 table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw);
841 ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 }
844 table.regs[%(reg)d] = 0;
845 ''' % { "reg" : reg }
847 for (unsigned i = 0; i < sizeof(destReg); i++) {
848 uint8_t index = srcReg2.bytes[i];
849 if (index < 16 * length) {
850 destReg.bytes[i] = table.bytes[index];
853 destReg.bytes[i] = 0;
854 // else destReg.bytes[i] unchanged
858 for reg in range(rCount):
860 AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]);
861 ''' % { "reg" : reg }
862 if rCount < 4: # zero upper half
863 for reg in range(rCount, 4):
865 AA64FpDestP%(reg)d_uw = 0;
866 ''' % { "reg" : reg }
867 iop = InstObjParams(name, Name,
871 "op_class": opClass }, [])
872 header_output += NeonX2RegOpDeclare.subst(iop)
873 exec_output += NeonXEqualRegOpExecute.subst(iop)
875 substDict = { "targs" : type,
876 "class_name" : Name }
877 exec_output += NeonXExecDeclare.subst(substDict)
882 destElem = -srcElem1;
# Integer ABS over the signed element types, registered for rCount 2 and 4.
twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes,
                 rCount=2, op=absCode)
twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes,
                 rCount=4, op=absCode)
# ADD: element-wise sum of the two sources, registered for rCount 2 and 4.
# addCode is also reused by the pairwise add registrations further down.
addCode = "destElem = srcElem1 + srcElem2;"
threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes,
                   rCount=2, op=addCode)
threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes,
                   rCount=4, op=addCode)
895 destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
896 (sizeof(Element) * 8);
898 threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
900 threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
903 twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
906 threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
907 addCode, pairwise=True)
908 threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
909 addCode, pairwise=True)
911 # Note: SimdAddOp can be a bit optimistic here
912 addAcrossCode = "destElem += srcElem1;"
913 twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
915 twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
# AND: bitwise and of the two sources, instantiated on 64-bit elements only,
# registered for rCount 2 and 4.
andCode = "destElem = srcElem1 & srcElem2;"
threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",),
                   rCount=2, op=andCode)
threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",),
                   rCount=4, op=andCode)
922 bicImmCode = "destElem &= ~imm;"
923 oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2,
925 oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4,
# BIC (register form): first source and-ed with the complement of the second,
# instantiated on 64-bit elements only, registered for rCount 2 and 4.
bicCode = "destElem = srcElem1 & ~srcElem2;"
threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",),
                   rCount=2, op=bicCode)
threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",),
                   rCount=4, op=bicCode)
932 bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
933 threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
935 threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
938 bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
939 threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
941 threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
944 bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
945 threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
947 threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
954 while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
960 while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
967 twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
968 twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
972 while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
978 twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
979 twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
981 cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
982 threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
984 threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
987 cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
988 twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
990 twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
# CMGE (register form): all-ones element when srcElem1 >= srcElem2, else 0.
# Instantiated over the signed element types for rCount 2 and 4.
cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes,
                   rCount=2, op=cmgeCode)
threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes,
                   rCount=4, op=cmgeCode)
997 cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
998 twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
1000 twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
# CMGT (register form): all-ones element when srcElem1 > srcElem2, else 0.
# Instantiated over the signed element types for rCount 2 and 4.
cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes,
                   rCount=2, op=cmgtCode)
threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes,
                   rCount=4, op=cmgtCode)
1007 cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
1008 twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
1010 twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
1013 threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
1015 threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
1018 threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
1020 threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
1023 cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
1024 twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
1026 twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
1029 cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
1030 twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
1032 twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
1035 tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
1036 threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
1038 threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
1043 while (srcElem1 && count < sizeof(Element) * 8) {
1044 count += srcElem1 & 0x1;
1049 twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
1050 twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
# DUP (element): the source element selected by the immediate is copied to
# the destination.  Three variants are registered: rCount 2, rCount 4, and a
# scalar form (scalar=True) on rCount 4.
dupCode = "destElem = srcElem1;"
twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes,
                 rCount=2, op=dupCode, byElem=True, isDup=True)
twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes,
                 rCount=4, op=dupCode, byElem=True, isDup=True)
twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes,
                 rCount=4, op=dupCode, scalar=True, byElem=True, isDup=True)
# DUP (general register): the 'W' variants are registered for rCount 2 and 4
# over the small unsigned types, the 'X' variant for rCount 4 on 64-bit
# elements only.
dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes,
            rCount=2, gprSpec='W')
dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes,
            rCount=4, gprSpec='W')
dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",),
            rCount=4, gprSpec='X')
# EOR: bitwise exclusive-or of the two sources, instantiated on 64-bit
# elements only, registered for rCount 2 and 4.
eorCode = "destElem = srcElem1 ^ srcElem2;"
threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",),
                   rCount=2, op=eorCode)
threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",),
                   rCount=4, op=eorCode)
1069 for (unsigned i = 0; i < eCount; i++) {
1070 unsigned index = i + imm;
1071 if (index < eCount) {
1072 destReg.elements[i] = srcReg1.elements[index];
1075 if (index >= eCount) {
1076 fault = new UndefinedInstruction(machInst, false, mnemonic);
1078 destReg.elements[i] = srcReg2.elements[index];
# EXT on byte elements, registered for rCount 2 and 4.  The instruction name
# is lower case, consistent with the other names passed to the generators
# ("add", "dup", "eor", ...).
# NOTE(review): the name is presumably what disassembly prints -- confirm no
# trace comparison depends on the old capitalised spelling.
extInstX("ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
extInstX("ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
1087 FPSCR fpscr = (FPSCR) FpscrExc;
1091 fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
1092 threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
1094 threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
1096 threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
1097 fabdCode, scalar=True)
1099 fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
1100 twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
1102 twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
1105 fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
1106 " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
1107 facgeCode = fpCmpAbsOp % "GE"
1108 threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
1110 threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
1112 threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
1113 facgeCode, scalar=True)
1115 facgtCode = fpCmpAbsOp % "GT"
1116 threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
1118 threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
1120 threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
1121 facgtCode, scalar=True)
1123 fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
1124 faddCode = fpBinOp % "Add"
1125 threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
1127 threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
1130 twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
1131 ("uint32_t",), 2, faddCode)
1132 twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
1133 ("uint64_t",), 4, faddCode)
1135 threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
1136 2, faddCode, pairwise=True)
1137 threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
1138 faddCode, pairwise=True)
1140 fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
1142 fcmeqCode = fpCmpOp % "EQ"
1143 threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
1145 threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
1147 threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
1148 fcmeqCode, scalar=True)
1150 fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
1151 fcmeqZeroCode = fpCmpZeroOp % "EQ"
1152 twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1154 twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1156 twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1157 fcmeqZeroCode, scalar=True)
# FCMGE (register): fpCmpOp with the "GE" suffix.
1159 fcmgeCode = fpCmpOp % "GE"
1160 threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
1162 threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
1164 threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
1165 fcmgeCode, scalar=True)
# FCMGE (zero): fpCmpZeroOp with the "GE" suffix.
1167 fcmgeZeroCode = fpCmpZeroOp % "GE"
1168 twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1170 twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1172 twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1173 fcmgeZeroCode, scalar=True)
# FCMGT (register): fpCmpOp with the "GT" suffix.
1175 fcmgtCode = fpCmpOp % "GT"
1176 threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
1178 threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
1180 threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
1181 fcmgtCode, scalar=True)
# FCMGT (zero): fpCmpZeroOp with the "GT" suffix.
1183 fcmgtZeroCode = fpCmpZeroOp % "GT"
1184 twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1186 twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1188 twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1189 fcmgtZeroCode, scalar=True)
# Reversed compare-against-zero template: the literal 0 is the FIRST
# fplibCompare operand, so "less than or equal" / "less than" are expressed
# as 0 GE src / 0 GT src.
# FCMLE (zero): reversed template with the "GE" suffix.
1191 fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
1193 fcmleZeroCode = fpCmpRevZeroOp % "GE"
1194 twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1196 twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1198 twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1199 fcmleZeroCode, scalar=True)
# FCMLT (zero): reversed template with the "GT" suffix.
1201 fcmltZeroCode = fpCmpRevZeroOp % "GT"
1202 twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
1204 twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
1206 twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
1207 fcmltZeroCode, scalar=True)
# Shared FP-to-integer/fixed-point conversion template around
# fplibFPToFixed<Element, Element>. Its three placeholders are filled, in
# order, with: "0" or "imm"; "false" (the ...S mnemonics) or "true" (the ...U
# mnemonics) -- presumably an "unsigned" flag, confirm against fplib; and an
# FPRounding_* constant.
1209 fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
1210 "srcElem1, %s, %s, %s, fpscr)")
# FCVTAS: ("0", "false", FPRounding_TIEAWAY).
1211 fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
1212 twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
1214 twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
1216 twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
1217 fcvtasCode, scalar=True)
# FCVTAU: ("0", "true", FPRounding_TIEAWAY).
1219 fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
1220 twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
1222 twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
1224 twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
1225 fcvtauCode, scalar=True)
# FCVTL, FCVTL2: widening conversion Element -> BigElement via fplibConvert,
# using the rounding mode returned by FPCRRounding(fpscr).
# NOTE(review): the "Fcvtl2X" variant is registered under the mnemonic
# "fcvtl", while other second-half variants in this file (e.g. "rshrn2",
# "saddl2") include the "2" -- confirm which spelling is intended.
1227 fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
1228 "srcElem1, FPCRRounding(fpscr), fpscr)")
1229 twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
1231 twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
# FCVTMS: fcvtCode with ("0", "false", FPRounding_NEGINF).
1234 fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
1235 twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
1237 twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
1239 twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
1240 fcvtmsCode, scalar=True)
# FCVTMU: fcvtCode with ("0", "true", FPRounding_NEGINF).
1242 fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
1243 twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
1245 twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
1247 twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
1248 fcvtmuCode, scalar=True)
# FCVTN, FCVTN2: narrowing conversion BigElement -> Element via fplibConvert,
# using the rounding mode returned by FPCRRounding(fpscr). The "Fcvtn2X"
# variant is registered with hi=True.
# NOTE(review): "Fcvtn2X" uses the mnemonic "fcvtn" rather than "fcvtn2" --
# confirm this is intended.
1250 fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
1251 "srcElem1, FPCRRounding(fpscr), fpscr)")
1252 twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
1253 ("uint16_t", "uint32_t"), fcvtnCode)
1254 twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
1255 ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
# FCVTNS: fcvtCode with ("0", "false", FPRounding_TIEEVEN).
1257 fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
1258 twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
1260 twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
1262 twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
1263 fcvtnsCode, scalar=True)
# FCVTNU: fcvtCode with ("0", "true", FPRounding_TIEEVEN).
1265 fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
1266 twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
1268 twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
1270 twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
1271 fcvtnuCode, scalar=True)
# FCVTPS: fcvtCode with ("0", "false", FPRounding_POSINF).
1273 fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
1274 twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
1276 twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
1278 twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
1279 fcvtpsCode, scalar=True)
# FCVTPU: fcvtCode with ("0", "true", FPRounding_POSINF).
1281 fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
1282 twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
1284 twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
1286 twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
1287 fcvtpuCode, scalar=True)
# FCVTXN, FCVTXN2 and scalar FCVTXN: narrowing conversion
# BigElement -> Element via fplibConvert with the fixed rounding mode
# FPRounding_ODD (not taken from FPCR).
# NOTE(review): "Fcvtxn2X" uses the mnemonic "fcvtxn" rather than "fcvtxn2"
# -- confirm this is intended.
1289 fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
1290 "srcElem1, FPRounding_ODD, fpscr)")
1291 twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
1293 twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
1294 fcvtxnCode, hi=True)
1295 twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
1296 fcvtxnCode, scalar=True)
# The fixed-point forms pass "imm" as the first fcvtCode placeholder and are
# registered with hasImm=True; the integer forms pass "0" instead. All four
# groups below use FPRounding_ZERO.
1297 # FCVTZS (fixed-point)
1298 fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
1299 twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
1300 2, fcvtzsCode, hasImm=True)
1301 twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
1302 fcvtzsCode, hasImm=True)
1303 twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
1304 fcvtzsCode, hasImm=True, scalar=True)
# FCVTZS (integer): ("0", "false", FPRounding_ZERO).
1306 fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
1307 twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
1309 twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
1311 twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
1312 fcvtzsIntCode, scalar=True)
1313 # FCVTZU (fixed-point)
1314 fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
1315 twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
1316 2, fcvtzuCode, hasImm=True)
1317 twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
1318 fcvtzuCode, hasImm=True)
1319 twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
1320 fcvtzuCode, hasImm=True, scalar=True)
# FCVTZU (integer): ("0", "true", FPRounding_ZERO).
1322 fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
1323 twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
1325 twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
1327 twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
1328 fcvtzuIntCode, scalar=True)
# FDIV: fpBinOp instantiated with "Div" (fplibDiv), registered with the
# "SimdFloatDivOp" op class.
1330 fdivCode = fpBinOp % "Div"
1331 threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
1333 threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
# FMAX: fpBinOp instantiated with "Max".
1336 fmaxCode = fpBinOp % "Max"
1337 threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1339 threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
# FMAXNM: fpBinOp instantiated with "MaxNum".
1342 fmaxnmCode = fpBinOp % "MaxNum"
1343 threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
1345 threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
# FMAXNMP (scalar): reuses fmaxnmCode for uint32_t and uint64_t elements.
1348 twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
1349 ("uint32_t",), 2, fmaxnmCode)
1350 twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
1351 ("uint64_t",), 4, fmaxnmCode)
# FMAXNMP (vector): fmaxnmCode registered with pairwise=True.
1353 threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
1354 smallFloatTypes, 2, fmaxnmCode, pairwise=True)
1355 threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1356 fmaxnmCode, pairwise=True)
# FMAXNMV: across-vector template; unlike fpBinOp it combines destElem with
# srcElem1, i.e. fplib<Suffix><Element>(destElem, srcElem1, fpscr).
1358 # Note: SimdFloatCmpOp can be a bit optimistic here
1359 fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
1360 fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
1361 twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1362 4, fmaxnmAcrossCode)
# FMAXP (scalar): reuses fmaxCode for uint32_t and uint64_t elements.
1364 twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
1365 ("uint32_t",), 2, fmaxCode)
1366 twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
1367 ("uint64_t",), 4, fmaxCode)
# FMAXP (vector): fmaxCode registered with pairwise=True.
1369 threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
1370 2, fmaxCode, pairwise=True)
1371 threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
1372 fmaxCode, pairwise=True)
# FMAXV: fpAcrossOp instantiated with "Max".
1374 # Note: SimdFloatCmpOp can be a bit optimistic here
1375 fmaxAcrossCode = fpAcrossOp % "Max"
1376 twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
# FMIN: fpBinOp instantiated with "Min".
1379 fminCode = fpBinOp % "Min"
1380 threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
1382 threeEqualRegInstX("fmin", "FminQX", "SimdFloatCmpOp", floatTypes, 4,
# FMINNM: fpBinOp instantiated with "MinNum".
1385 fminnmCode = fpBinOp % "MinNum"
1386 threeEqualRegInstX("fminnm", "FminnmDX", "SimdFloatCmpOp", smallFloatTypes,
1388 threeEqualRegInstX("fminnm", "FminnmQX", "SimdFloatCmpOp", floatTypes, 4,
# FMINNMP (scalar): reuses fminnmCode for uint32_t and uint64_t elements.
1391 twoRegPairwiseScInstX("fminnmp", "FminnmpScDX", "SimdFloatCmpOp",
1392 ("uint32_t",), 2, fminnmCode)
1393 twoRegPairwiseScInstX("fminnmp", "FminnmpScQX", "SimdFloatCmpOp",
1394 ("uint64_t",), 4, fminnmCode)
# FMINNMP (vector): fminnmCode registered with pairwise=True.
1396 threeEqualRegInstX("fminnmp", "FminnmpDX", "SimdFloatCmpOp",
1397 smallFloatTypes, 2, fminnmCode, pairwise=True)
1398 threeEqualRegInstX("fminnmp", "FminnmpQX", "SimdFloatCmpOp", floatTypes, 4,
1399 fminnmCode, pairwise=True)
# FMINNMV: fpAcrossOp (the destElem/srcElem1 across-vector template defined
# with the FMAXNMV group) instantiated with "MinNum".
1401 # Note: SimdFloatCmpOp can be a bit optimistic here
1402 fminnmAcrossCode = fpAcrossOp % "MinNum"
1403 twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
1404 4, fminnmAcrossCode)
# FMINP (scalar): reuses fminCode for uint32_t and uint64_t elements.
1406 twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
1407 ("uint32_t",), 2, fminCode)
1408 twoRegPairwiseScInstX("fminp", "FminpScQX", "SimdFloatCmpOp",
1409 ("uint64_t",), 4, fminCode)
# FMINP (vector): fminCode registered with pairwise=True.
1411 threeEqualRegInstX("fminp", "FminpDX", "SimdFloatCmpOp", smallFloatTypes,
1412 2, fminCode, pairwise=True)
1413 threeEqualRegInstX("fminp", "FminpQX", "SimdFloatCmpOp", floatTypes, 4,
1414 fminCode, pairwise=True)
# FMINV: fpAcrossOp instantiated with "Min".
1416 # Note: SimdFloatCmpOp can be a bit optimistic here
1417 fminAcrossCode = fpAcrossOp % "Min"
1418 twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
# FMLA: fused multiply-add through fplibMulAdd with destElem as the addend
# operand. The by-element registrations (byElem=True) come first, followed
# by the plain vector ones.
1421 fmlaCode = fpOp % ("fplibMulAdd<Element>("
1422 "destElem, srcElem1, srcElem2, fpscr)")
1423 threeEqualRegInstX("fmla", "FmlaElemDX", "SimdFloatMultAccOp",
1424 smallFloatTypes, 2, fmlaCode, True, byElem=True)
1425 threeEqualRegInstX("fmla", "FmlaElemQX", "SimdFloatMultAccOp", floatTypes,
1426 4, fmlaCode, True, byElem=True)
1427 threeEqualRegInstX("fmla", "FmlaElemScX", "SimdFloatMultAccOp", floatTypes,
1428 4, fmlaCode, True, byElem=True, scalar=True)
1430 threeEqualRegInstX("fmla", "FmlaDX", "SimdFloatMultAccOp", smallFloatTypes,
1432 threeEqualRegInstX("fmla", "FmlaQX", "SimdFloatMultAccOp", floatTypes, 4,
# FMLS: same fplibMulAdd call, but srcElem1 is first negated with fplibNeg so
# the product is subtracted from destElem.
1435 fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
1436 " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
1437 threeEqualRegInstX("fmls", "FmlsElemDX", "SimdFloatMultAccOp",
1438 smallFloatTypes, 2, fmlsCode, True, byElem=True)
1439 threeEqualRegInstX("fmls", "FmlsElemQX", "SimdFloatMultAccOp", floatTypes,
1440 4, fmlsCode, True, byElem=True)
1441 threeEqualRegInstX("fmls", "FmlsElemScX", "SimdFloatMultAccOp", floatTypes,
1442 4, fmlsCode, True, byElem=True, scalar=True)
1444 threeEqualRegInstX("fmls", "FmlsDX", "SimdFloatMultAccOp", smallFloatTypes,
1446 threeEqualRegInstX("fmls", "FmlsQX", "SimdFloatMultAccOp", floatTypes, 4,
# FMOV (vector, immediate): each destination element is set to imm.
1449 fmovCode = 'destElem = imm;'
1450 oneRegImmInstX("fmov", "FmovDX", "SimdMiscOp", smallFloatTypes, 2,
1452 oneRegImmInstX("fmov", "FmovQX", "SimdMiscOp", floatTypes, 4, fmovCode)
# FMUL: fpBinOp instantiated with "Mul". By-element registrations
# (byElem=True) come first, followed by the plain vector ones.
1454 fmulCode = fpBinOp % "Mul"
1455 threeEqualRegInstX("fmul", "FmulElemDX", "SimdFloatMultOp",
1456 smallFloatTypes, 2, fmulCode, byElem=True)
1457 threeEqualRegInstX("fmul", "FmulElemQX", "SimdFloatMultOp", floatTypes, 4,
1458 fmulCode, byElem=True)
1459 threeEqualRegInstX("fmul", "FmulElemScX", "SimdFloatMultOp", floatTypes, 4,
1460 fmulCode, byElem=True, scalar=True)
1462 threeEqualRegInstX("fmul", "FmulDX", "SimdFloatMultOp", smallFloatTypes, 2,
1464 threeEqualRegInstX("fmul", "FmulQX", "SimdFloatMultOp", floatTypes, 4,
# FMULX (vector and scalar): fpBinOp instantiated with "MulX".
1467 fmulxCode = fpBinOp % "MulX"
1468 threeEqualRegInstX("fmulx", "FmulxDX", "SimdFloatMultOp", smallFloatTypes,
1470 threeEqualRegInstX("fmulx", "FmulxQX", "SimdFloatMultOp", floatTypes, 4,
1472 threeEqualRegInstX("fmulx", "FmulxScX", "SimdFloatMultOp", floatTypes, 4,
1473 fmulxCode, scalar=True)
1474 # FMULX (by element)
1475 threeEqualRegInstX("fmulx", "FmulxElemDX", "SimdFloatMultOp",
1476 smallFloatTypes, 2, fmulxCode, byElem=True)
1477 threeEqualRegInstX("fmulx", "FmulxElemQX", "SimdFloatMultOp", floatTypes,
1478 4, fmulxCode, byElem=True)
1479 threeEqualRegInstX("fmulx", "FmulxElemScX", "SimdFloatMultOp", floatTypes,
1480 4, fmulxCode, byElem=True, scalar=True)
1482 fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
1483 twoEqualRegInstX("Neg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
1485 twoEqualRegInstX("Neg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
# FRECPE: per-element fplibRecipEstimate; D, Q and scalar registrations.
1488 frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
1489 twoEqualRegInstX("frecpe", "FrecpeDX", "SimdFloatMultAccOp",
1490 smallFloatTypes, 2, frecpeCode)
1491 twoEqualRegInstX("frecpe", "FrecpeQX", "SimdFloatMultAccOp", floatTypes, 4,
1493 twoEqualRegInstX("frecpe", "FrecpeScX", "SimdFloatMultAccOp", floatTypes,
1494 4, frecpeCode, scalar=True)
# FRECPS: fpBinOp instantiated with "RecipStepFused".
1496 frecpsCode = fpBinOp % "RecipStepFused"
1497 threeEqualRegInstX("frecps", "FrecpsDX", "SimdFloatMultAccOp",
1498 smallFloatTypes, 2, frecpsCode)
1499 threeEqualRegInstX("frecps", "FrecpsQX", "SimdFloatMultAccOp", floatTypes,
1501 threeEqualRegInstX("frecps", "FrecpsScX", "SimdFloatMultAccOp", floatTypes,
1502 4, frecpsCode, scalar=True)
# FRECPX: per-element fplibRecpX; only a scalar registration appears here.
1504 frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
1505 twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
1506 frecpxCode, scalar=True)
# Shared round-to-integral template around fplibRoundInt<Element>. The two
# placeholders are, in order, a rounding-mode expression and a boolean
# literal; only FRINTX passes "true" for the boolean (presumably an "exact"
# flag -- confirm against fplib).
1508 frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
# FRINTA: FPRounding_TIEAWAY.
1509 frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
1510 twoEqualRegInstX("frinta", "FrintaDX", "SimdCvtOp", smallFloatTypes, 2,
1512 twoEqualRegInstX("frinta", "FrintaQX", "SimdCvtOp", floatTypes, 4,
# FRINTI: rounding mode taken from FPCRRounding(fpscr).
1515 frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
1516 twoEqualRegInstX("frinti", "FrintiDX", "SimdCvtOp", smallFloatTypes, 2,
1518 twoEqualRegInstX("frinti", "FrintiQX", "SimdCvtOp", floatTypes, 4,
# FRINTM: FPRounding_NEGINF.
1521 frintmCode = frintCode % ("FPRounding_NEGINF", "false")
1522 twoEqualRegInstX("frintm", "FrintmDX", "SimdCvtOp", smallFloatTypes, 2,
1524 twoEqualRegInstX("frintm", "FrintmQX", "SimdCvtOp", floatTypes, 4,
# FRINTN: FPRounding_TIEEVEN.
1527 frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
1528 twoEqualRegInstX("frintn", "FrintnDX", "SimdCvtOp", smallFloatTypes, 2,
1530 twoEqualRegInstX("frintn", "FrintnQX", "SimdCvtOp", floatTypes, 4,
# FRINTP: FPRounding_POSINF.
1533 frintpCode = frintCode % ("FPRounding_POSINF", "false")
1534 twoEqualRegInstX("frintp", "FrintpDX", "SimdCvtOp", smallFloatTypes, 2,
1536 twoEqualRegInstX("frintp", "FrintpQX", "SimdCvtOp", floatTypes, 4,
# FRINTX: rounding mode from FPCRRounding(fpscr), boolean argument "true".
1539 frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
1540 twoEqualRegInstX("frintx", "FrintxDX", "SimdCvtOp", smallFloatTypes, 2,
1542 twoEqualRegInstX("frintx", "FrintxQX", "SimdCvtOp", floatTypes, 4,
# FRINTZ: FPRounding_ZERO.
1545 frintzCode = frintCode % ("FPRounding_ZERO", "false")
1546 twoEqualRegInstX("frintz", "FrintzDX", "SimdCvtOp", smallFloatTypes, 2,
1548 twoEqualRegInstX("frintz", "FrintzQX", "SimdCvtOp", floatTypes, 4,
# FRSQRTE: per-element fplibRSqrtEstimate; D, Q and scalar registrations.
1551 frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
1552 twoEqualRegInstX("frsqrte", "FrsqrteDX", "SimdFloatSqrtOp",
1553 smallFloatTypes, 2, frsqrteCode)
1554 twoEqualRegInstX("frsqrte", "FrsqrteQX", "SimdFloatSqrtOp", floatTypes, 4,
1556 twoEqualRegInstX("frsqrte", "FrsqrteScX", "SimdFloatSqrtOp", floatTypes, 4,
1557 frsqrteCode, scalar=True)
# FRSQRTS: fpBinOp instantiated with "RSqrtStepFused".
1559 frsqrtsCode = fpBinOp % "RSqrtStepFused"
1560 threeEqualRegInstX("frsqrts", "FrsqrtsDX", "SimdFloatMiscOp",
1561 smallFloatTypes, 2, frsqrtsCode)
1562 threeEqualRegInstX("frsqrts", "FrsqrtsQX", "SimdFloatMiscOp", floatTypes,
1564 threeEqualRegInstX("frsqrts", "FrsqrtsScX", "SimdFloatMiscOp", floatTypes,
1565 4, frsqrtsCode, scalar=True)
# FSQRT: per-element fplibSqrt.
1567 fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
1568 twoEqualRegInstX("fsqrt", "FsqrtDX", "SimdFloatSqrtOp", smallFloatTypes, 2,
1570 twoEqualRegInstX("fsqrt", "FsqrtQX", "SimdFloatSqrtOp", floatTypes, 4,
# FSUB: fpBinOp instantiated with "Sub"; shares the "SimdFloatAddOp" op class
# with FADD.
1573 fsubCode = fpBinOp % "Sub"
1574 threeEqualRegInstX("fsub", "FsubDX", "SimdFloatAddOp", smallFloatTypes, 2,
1576 threeEqualRegInstX("fsub", "FsubQX", "SimdFloatAddOp", floatTypes, 4,
# INS (element): insert from another vector element, for all unsignedTypes.
1579 insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
1580 # INS (general register)
1581 insFromGprInstX("ins", "InsGprWX", "SimdMiscOp", smallUnsignedTypes, 4,
1583 insFromGprInstX("ins", "InsGprXX", "SimdMiscOp", unsignedTypes, 4, 'X')
# MLA: integer multiply-accumulate, destElem += srcElem1 * srcElem2.
# By-element forms are registered for uint16_t/uint32_t only; the vector
# forms use smallUnsignedTypes.
1585 mlaCode = "destElem += srcElem1 * srcElem2;"
1586 threeEqualRegInstX("mla", "MlaElemDX", "SimdMultAccOp",
1587 ("uint16_t", "uint32_t"), 2, mlaCode, True, byElem=True)
1588 threeEqualRegInstX("mla", "MlaElemQX", "SimdMultAccOp",
1589 ("uint16_t", "uint32_t"), 4, mlaCode, True, byElem=True)
1591 threeEqualRegInstX("mla", "MlaDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1593 threeEqualRegInstX("mla", "MlaQX", "SimdMultAccOp", smallUnsignedTypes, 4,
# MLS: integer multiply-subtract, destElem -= srcElem1 * srcElem2, with the
# same type selection as MLA.
1596 mlsCode = "destElem -= srcElem1 * srcElem2;"
1597 threeEqualRegInstX("mls", "MlsElemDX", "SimdMultAccOp",
1598 ("uint16_t", "uint32_t"), 2, mlsCode, True, byElem=True)
1599 threeEqualRegInstX("mls", "MlsElemQX", "SimdMultAccOp",
1600 ("uint16_t", "uint32_t"), 4, mlsCode, True, byElem=True)
1602 threeEqualRegInstX("mls", "MlsDX", "SimdMultAccOp", smallUnsignedTypes, 2,
1604 threeEqualRegInstX("mls", "MlsQX", "SimdMultAccOp", smallUnsignedTypes, 4,
# The MOV forms listed below are aliases and get no registration of their own.
1606 # MOV (element) -> alias to INS (element)
1607 # MOV (from general) -> alias to INS (general register)
1608 # MOV (scalar) -> alias to DUP (element)
1609 # MOV (to general) -> alias to UMOV
1610 # MOV (vector) -> alias to ORR (register)
# MOVI: each destination element (registered as uint64_t) is set to imm.
1612 movImmCode = "destElem = imm;"
1613 oneRegImmInstX("movi", "MoviDX", "SimdMiscOp", ("uint64_t",), 2,
1615 oneRegImmInstX("movi", "MoviQX", "SimdMiscOp", ("uint64_t",), 4,
# MUL: integer multiply, destElem = srcElem1 * srcElem2. By-element forms
# are registered for uint16_t/uint32_t only; the vector forms use
# smallUnsignedTypes.
1618 mulCode = "destElem = srcElem1 * srcElem2;"
1619 threeEqualRegInstX("mul", "MulElemDX", "SimdMultOp",
1620 ("uint16_t", "uint32_t"), 2, mulCode, byElem=True)
1621 threeEqualRegInstX("mul", "MulElemQX", "SimdMultOp",
1622 ("uint16_t", "uint32_t"), 4, mulCode, byElem=True)
1624 threeEqualRegInstX("mul", "MulDX", "SimdMultOp", smallUnsignedTypes, 2,
1626 threeEqualRegInstX("mul", "MulQX", "SimdMultOp", smallUnsignedTypes, 4,
# MVN: bitwise NOT of the source, processed as uint64_t elements.
1629 mvnCode = "destElem = ~srcElem1;"
1630 twoEqualRegInstX("mvn", "MvnDX", "SimdAluOp", ("uint64_t",), 2, mvnCode)
1631 twoEqualRegInstX("mvn", "MvnQX", "SimdAluOp", ("uint64_t",), 4, mvnCode)
# MVNI: destination set to the bitwise NOT of the immediate.
1633 mvniCode = "destElem = ~imm;"
1634 oneRegImmInstX("mvni", "MvniDX", "SimdAluOp", ("uint64_t",), 2, mvniCode)
1635 oneRegImmInstX("mvni", "MvniQX", "SimdAluOp", ("uint64_t",), 4, mvniCode)
# NEG (integer): arithmetic negation over signedTypes.
1637 negCode = "destElem = -srcElem1;"
1638 twoEqualRegInstX("neg", "NegDX", "SimdAluOp", signedTypes, 2, negCode)
1639 twoEqualRegInstX("neg", "NegQX", "SimdAluOp", signedTypes, 4, negCode)
1640 # NOT -> alias to MVN
# ORN: srcElem1 OR (NOT srcElem2), processed as uint64_t elements.
1642 ornCode = "destElem = srcElem1 | ~srcElem2;"
1643 threeEqualRegInstX("orn", "OrnDX", "SimdAluOp", ("uint64_t",), 2, ornCode)
1644 threeEqualRegInstX("orn", "OrnQX", "SimdAluOp", ("uint64_t",), 4, ornCode)
# ORR (immediate): ORs the immediate into the existing destination value.
1646 orrImmCode = "destElem |= imm;"
1647 oneRegImmInstX("orr", "OrrImmDX", "SimdAluOp", ("uint64_t",), 2,
1649 oneRegImmInstX("orr", "OrrImmQX", "SimdAluOp", ("uint64_t",), 4,
# ORR (register): srcElem1 OR srcElem2.
1652 orrCode = "destElem = srcElem1 | srcElem2;"
1653 threeEqualRegInstX("orr", "OrrDX", "SimdAluOp", ("uint64_t",), 2, orrCode)
1654 threeEqualRegInstX("orr", "OrrQX", "SimdAluOp", ("uint64_t",), 4, orrCode)
1658 for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1659 if (bits(srcElem2, j))
1660 destElem ^= srcElem1 << j;
# PMUL: registered for uint8_t elements. The code fragment above XORs
# srcElem1 << j into destElem for every set bit j of srcElem2 (a carry-less
# shift-and-XOR multiply).
1663 threeEqualRegInstX("pmul", "PmulDX", "SimdMultOp", ("uint8_t",), 2,
1665 threeEqualRegInstX("pmul", "PmulQX", "SimdMultOp", ("uint8_t",), 4,
1668 # Note: 64-bit PMULL is not available (Crypto. Extension)
1671 for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
1672 if (bits(srcElem2, j))
1673 destElem ^= (BigElement)srcElem1 << j;
# PMULL, PMULL2: same shift-and-XOR loop, but srcElem1 is widened to
# BigElement before shifting so no product bits are lost.
# NOTE(review): "Pmull2X" uses the mnemonic "pmull" rather than "pmull2" --
# confirm this is intended.
1676 threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
1677 threeRegLongInstX("pmull", "Pmull2X", "SimdMultOp", ("uint8_t",),
1681 destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
1682 ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1683 (sizeof(Element) * 8);
# RADDHN, RADDHN2: the fragment above adds the two sources plus a rounding
# constant of 1 << (element bits - 1) and keeps the bits above the low
# sizeof(Element) * 8 bits. The "2" variant is registered with hi=True.
1685 threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes,
1687 threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes,
1688 raddhnCode, hi=True)
1692 Element temp = srcElem1;
1693 for (int i = 0; i < 8 * sizeof(Element); i++) {
1694 destElem = destElem | ((temp & 0x1) <<
1695 (8 * sizeof(Element) - 1 - i));
# RBIT: registered for uint8_t elements. On iteration i the fragment above
# ORs the low bit of temp into destElem at bit position
# (element bits - 1 - i), i.e. it mirrors the bit order.
1699 twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode)
1700 twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode)
1703 destElem = srcElem1;
1704 unsigned groupSize = ((1 << 1) / sizeof(Element));
1705 unsigned reverseMask = (groupSize - 1);
1706 j = i ^ reverseMask;
# REV16: uint8_t elements. The fragment above copies the element unchanged
# and computes index j as i XOR (groupSize - 1), where groupSize is
# 2 / sizeof(Element) elements.
1708 twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2,
1710 twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4,
1714 destElem = srcElem1;
1715 unsigned groupSize = ((1 << 2) / sizeof(Element));
1716 unsigned reverseMask = (groupSize - 1);
1717 j = i ^ reverseMask;
# REV32: uint8_t and uint16_t elements; same index computation with
# groupSize = 4 / sizeof(Element).
1719 twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"),
1721 twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"),
1725 destElem = srcElem1;
1726 unsigned groupSize = ((1 << 3) / sizeof(Element));
1727 unsigned reverseMask = (groupSize - 1);
1728 j = i ^ reverseMask;
# REV64: smallUnsignedTypes; same index computation with
# groupSize = 8 / sizeof(Element).
1730 twoEqualRegInstX("rev64", "Rev64DX", "SimdAluOp", smallUnsignedTypes, 2,
1732 twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4,
1736 if (imm > sizeof(srcElem1) * 8) {
1739 Element rBit = bits(srcElem1, imm - 1);
1740 destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
1742 destElem = srcElem1;
# RSHRN, RSHRN2: rounding shift right narrow by immediate (hasImm=True).
# In the fragment above the rounding bit is bit (imm - 1) of the source and
# the shift is split into (imm - 1) followed by 1.
# NOTE(review): the out-of-range guard here uses "imm >", whereas the
# non-rounding SHRN fragment uses "imm >=" -- confirm the difference is
# deliberate.
1745 twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
1746 rshrnCode, hasImm=True)
1747 twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
1748 rshrnCode, hasImm=True, hi=True)
1751 destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
1752 ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
1753 (sizeof(Element) * 8);
# RSUBHN, RSUBHN2: the fragment above subtracts srcElem2 from srcElem1, adds
# a rounding constant of 1 << (element bits - 1) and keeps the bits above
# the low sizeof(Element) * 8 bits. The "2" variant is registered with
# hi=True.
# NOTE(review): these use smallTypes while RADDHN uses smallUnsignedTypes --
# confirm the difference is intended.
1755 threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
1757 threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
1758 rsubhnCode, hi=True)
1761 destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1762 (srcElem2 - srcElem1);
# SABA: signed absolute difference, accumulated into destElem (the fragment
# above uses "+=").
1764 threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
1766 threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
1770 destElem += (srcElem1 > srcElem2) ?
1771 ((BigElement)srcElem1 - (BigElement)srcElem2) :
1772 ((BigElement)srcElem2 - (BigElement)srcElem1);
# SABAL, SABAL2: as SABA, but both operands are widened to BigElement before
# subtracting.
1774 threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
1776 threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
1777 abalCode, True, hi=True)
1780 destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
1781 (srcElem2 - srcElem1);
# SABD: signed absolute difference, assigned (not accumulated).
1783 threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
1785 threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
1789 destElem = (srcElem1 > srcElem2) ?
1790 ((BigElement)srcElem1 - (BigElement)srcElem2) :
1791 ((BigElement)srcElem2 - (BigElement)srcElem1);
# SABDL, SABDL2: widening absolute difference, assigned (not accumulated).
# NOTE(review): although the fragment assigns with "=", these registrations
# use the "SimdAddAccOp" op class and pass the same positional True as the
# accumulating SABAL calls -- confirm whether that is intended.
1793 threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
1795 threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
1796 abdlCode, True, hi=True)
# SADALP: accumulates the widened sum of two source elements into destElem.
1798 adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
1799 twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
1801 twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
# SADDL, SADDL2: widened sum of the two sources, assigned to destElem.
# NOTE(review): addlwCode does not accumulate, yet the SADDL/SADDW
# registrations use the "SimdAddAccOp" op class -- confirm.
1804 addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
1805 threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
1807 threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
# SADDLP: D and Q registrations over smallSignedTypes.
1810 twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
1812 twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
# SADDLV: across-vector widening sum; each source element is widened to
# BigElement and added into destElem. The int32_t variant is registered
# separately with doubleDest=True.
1815 # Note: SimdAddOp can be a bit optimistic here
1816 addAcrossLongCode = "destElem += (BigElement)srcElem1;"
1817 twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
1818 2, addAcrossLongCode, long=True)
1819 twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
1820 4, addAcrossLongCode, long=True)
1821 twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
1822 addAcrossLongCode, doubleDest=True, long=True)
# SADDW, SADDW2: wide-form registrations over smallSignedTypes.
1824 threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
1826 threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
# The %d placeholder is the source integer width (32 or 64) used in the
# (int<N>_t) cast of srcElem1; the fixed-point forms pass imm to
# fplibFixedToFP and are registered with hasImm=True.
1828 # SCVTF (fixed-point)
1829 scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm,"
1830 " false, FPCRRounding(fpscr), fpscr)")
1831 twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
1832 scvtfFixedCode % 32, hasImm=True)
1833 twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
1834 scvtfFixedCode % 32, hasImm=True)
1835 twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
1836 scvtfFixedCode % 64, hasImm=True)
1837 twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
1838 4, scvtfFixedCode % 32, hasImm=True, scalar=True)
1839 twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
1840 scvtfFixedCode % 64, hasImm=True, scalar=True)
# SCVTF (integer): same call with the literal 0 in place of imm.
1842 scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0,"
1843 " false, FPCRRounding(fpscr), fpscr)")
1844 twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
1846 twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
1848 twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
1850 twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4,
1851 scvtfIntCode % 32, scalar=True)
1852 twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
1853 scvtfIntCode % 64, scalar=True)
1857 (((unsigned)srcElem1 & 0x1) +
1858 ((unsigned)srcElem2 & 0x1)) >> 1;
1859 // Use division instead of a shift to ensure the sign extension works
1860 // right. The compiler will figure out if it can be a shift. Mask the
1861 // inputs so they get truncated correctly.
1862 destElem = (((srcElem1 & ~(Element)1) / 2) +
1863 ((srcElem2 & ~(Element)1) / 2)) + carryBit;
# SHADD: signed halving add over smallSignedTypes. The fragment above halves
# each operand separately (low bit masked off) and adds back a carry derived
# from the two low bits, so the full-width sum is never formed.
1865 threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
1867 threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
1871 if (imm >= sizeof(Element) * 8)
1872 destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
1874 destElem = srcElem1 << imm;
# SHL: shift left by immediate. For imm >= element width the fragment above
# splits the shift into (width - 1) and 1 instead of shifting by the full
# width in one step.
1876 twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
1878 twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
# SHLL, SHLL2: widen to BigElement, then shift left by the source element
# width.
# NOTE(review): "Shll2X" uses the mnemonic "shll" rather than "shll2" --
# confirm this is intended.
1881 shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
1882 twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
1883 twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
1887 if (imm >= sizeof(srcElem1) * 8) {
1890 destElem = srcElem1 >> imm;
# SHRN, SHRN2: shift right narrow by immediate (hasImm=True); the "2"
# variant is registered with hi=True. The fragment above special-cases
# imm >= source width before the plain srcElem1 >> imm.
1893 twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
1894 shrnCode, hasImm=True)
1895 twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
1896 shrnCode, hasImm=True, hi=True)
1900 (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
1901 // Use division instead of a shift to ensure the sign extension works
1902 // right. The compiler will figure out if it can be a shift. Mask the
1903 // inputs so they get truncated correctly.
1904 destElem = (((srcElem1 & ~(Element)1) / 2) -
1905 ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
# SHSUB: signed halving subtract over smallSignedTypes. The fragment above
# halves each operand separately (low bit masked off) and subtracts a borrow
# derived from the two low bits.
1907 threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
1909 threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
1913 if (imm >= sizeof(Element) * 8)
1914 destElem = destElem;
1916 destElem = (srcElem1 << imm) | (destElem & mask(imm));
# SLI: shift left and insert. The fragment above keeps the low imm bits of
# the old destination (destElem & mask(imm)) and ORs in srcElem1 << imm;
# for imm >= element width the destination is left unchanged.
1918 twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
1920 twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
# SMAX: element-wise maximum of the two sources over smallSignedTypes.
1923 maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
1924 threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
1926 threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
# SMAXP: maxCode registered with pairwise=True.
1929 threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
1930 maxCode, pairwise=True)
1931 threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
1932 maxCode, pairwise=True)
1935 if (i == 0 || srcElem1 > destElem)
1936 destElem = srcElem1;
# SMAXV: across-vector maximum. The fragment above seeds destElem with the
# first element (i == 0) and replaces it whenever a larger element is seen.
# The D registration covers int8_t/int16_t only; Q uses smallSignedTypes.
1938 twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1940 twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
# SMIN: element-wise minimum of the two sources over smallSignedTypes.
1943 minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
1944 threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
1946 threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
# SMINP: minCode registered with pairwise=True.
1949 threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
1950 minCode, pairwise=True)
1951 threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
1952 minCode, pairwise=True)
1955 if (i == 0 || srcElem1 < destElem)
1956 destElem = srcElem1;
# SMINV: across-vector minimum. The fragment above seeds destElem with the
# first element (i == 0) and replaces it whenever a smaller element is seen.
# The D registration covers int8_t/int16_t only; Q uses smallSignedTypes.
1958 twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
1960 twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
# Widening multiply-accumulate: both sources are widened to BigElement
# before multiplying and the product is added to destElem.
# NOTE(review): the "...2X" variants here are registered under "smlal" /
# "smlsl" rather than "smlal2" / "smlsl2" -- confirm this is intended.
1962 # SMLAL, SMLAL2 (by element)
1963 mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
1964 threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
1965 ("int16_t", "int32_t"), mlalCode, True, byElem=True)
1966 threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
1967 ("int16_t", "int32_t"), mlalCode, True, byElem=True,
1969 # SMLAL, SMLAL2 (vector)
1970 threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
1972 threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
1973 mlalCode, True, hi=True)
# Widening multiply-subtract: the widened product is subtracted from
# destElem.
# NOTE(review): the by-element SMLSL forms use smallSignedTypes, whereas the
# by-element SMLAL forms above are limited to ("int16_t", "int32_t") --
# confirm whether the wider type list is intended.
1974 # SMLSL, SMLSL2 (by element)
1975 mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
1976 threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
1977 mlslCode, True, byElem=True)
1978 threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
1979 smallSignedTypes, mlslCode, True, byElem=True, hi=True)
1980 # SMLSL, SMLSL2 (vector)
1981 threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
1983 threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
1984 mlslCode, True, hi=True)
# SMOV: move a vector element to a general register. The "SmovWX"
# registration covers int8_t/int16_t; "SmovXX" covers smallSignedTypes and
# passes 'X'.
1986 insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
1988 insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
# Widening multiply: both sources are widened to BigElement before
# multiplying and the product is assigned to destElem.
# NOTE(review): the "...2X" variants are registered under "smull" rather
# than "smull2" -- confirm this is intended.
1990 # SMULL, SMULL2 (by element)
1991 mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
1992 threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
1993 mullCode, byElem=True)
1994 threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
1995 mullCode, byElem=True, hi=True)
1996 # SMULL, SMULL2 (vector)
1997 threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
1999 threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
2003 FPSCR fpscr = (FPSCR) FpscrQc;
2004 if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) {
2006 destElem = ~srcElem1;
2007 } else if (srcElem1 < 0) {
2008 destElem = -srcElem1;
2010 destElem = srcElem1;
# SQABS: saturating absolute value. In the fragment above the most negative
# value (only the top bit set) maps to ~srcElem1, other negative values are
# negated, and non-negative values pass through. It works on a local FPSCR
# copy read from FpscrQc (presumably to record saturation -- the write-back
# is not shown in this excerpt).
2014 twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
2016 twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
2018 twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
2019 sqabsCode, scalar=True)
2022 destElem = srcElem1 + srcElem2;
2023 FPSCR fpscr = (FPSCR) FpscrQc;
2024 bool negDest = (destElem < 0);
2025 bool negSrc1 = (srcElem1 < 0);
2026 bool negSrc2 = (srcElem2 < 0);
2027 if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
2028 destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2035 threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
2037 threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
2039 threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
2040 sqaddCode, scalar=True)
# Signed doubling multiply-accumulate long.  The C++ fragment below computes
# midElem = 2 * srcElem1 * srcElem2 in 64-bit arithmetic, overrides it with
# ~(maxNeg << bits) for the listed maxNeg/halfNeg source pairs, adds it to
# destElem, and compares the signs before and after the add to detect
# accumulator overflow.
# NOTE(review): the SQDMULL fragment later in the file overrides the product
# only when both sources equal the minimum value, whereas this one also
# overrides the maxNeg/halfNeg pairs.  Their doubled product, 2^(2n-2) for an
# n-bit Element, would appear to fit in BigElement -- confirm against the
# architecture pseudocode.
2041 # SQDMLAL, SQDMLAL2 (by element)
2043 FPSCR fpscr = (FPSCR) FpscrQc;
2044 BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2045 Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
2046 Element halfNeg = maxNeg / 2;
2047 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2048 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2049 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2050 midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2053 bool negPreDest = ltz(destElem);
2054 destElem += midElem;
2055 bool negDest = ltz(destElem);
2056 bool negMid = ltz(midElem);
2057 if (negPreDest == negMid && negMid != negDest) {
2058 destElem = mask(sizeof(BigElement) * 8 - 1);
2060 destElem = ~destElem;
# By-element registrations: low-half, hi-half ("2") and scalar ("Sc") classes,
# all restricted to int16_t/int32_t elements.
2065 threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
2066 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
2067 threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
2068 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2070 threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
2071 ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
2073 # SQDMLAL, SQDMLAL2 (vector)
2074 threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
2075 ("int16_t", "int32_t"), qdmlalCode, True)
2076 threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
2077 ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
2078 threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
2079 ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
# Signed doubling multiply-subtract long.  Same shape as the SQDMLAL fragment:
# midElem = 2 * srcElem1 * srcElem2 (overridden for the maxNeg/halfNeg pairs)
# is subtracted from destElem.  The overflow test uses posMid, which is the
# sign of the negated midElem, in place of the sign of midElem itself.
# NOTE(review): as with SQDMLAL, confirm that the maxNeg/halfNeg pairs really
# need the override; their doubled product appears to fit in BigElement.
2080 # SQDMLSL, SQDMLSL2 (by element)
2082 FPSCR fpscr = (FPSCR) FpscrQc;
2083 BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2084 Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
2085 Element halfNeg = maxNeg / 2;
2086 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2087 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2088 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2089 midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
2092 bool negPreDest = ltz(destElem);
2093 destElem -= midElem;
2094 bool negDest = ltz(destElem);
2095 bool posMid = ltz((BigElement)-midElem);
2096 if (negPreDest == posMid && posMid != negDest) {
2097 destElem = mask(sizeof(BigElement) * 8 - 1);
2099 destElem = ~destElem;
# By-element registrations: low-half, hi-half ("2") and scalar ("Sc") classes,
# all restricted to int16_t/int32_t elements.
2104 threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
2105 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
2106 threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
2107 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2109 threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
2110 ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
2112 # SQDMLSL, SQDMLSL2 (vector)
2113 threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
2114 ("int16_t", "int32_t"), qdmlslCode, True)
2115 threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
2116 ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
2117 threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
2118 ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
# Signed doubling multiply returning the high half.  The fragment below forms
# 2 * srcElem1 * srcElem2 in 64-bit arithmetic and shifts it right by the
# Element width.  When both sources equal the minimum Element value the
# result is replaced by ~srcElem1.
2119 # SQDMULH (by element)
2121 FPSCR fpscr = (FPSCR) FpscrQc;
2122 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
2123 (sizeof(Element) * 8);
2124 if (srcElem1 == srcElem2 &&
2125 srcElem1 == (Element)((Element)1 <<
2126 (sizeof(Element) * 8 - 1))) {
2127 destElem = ~srcElem1;
# By-element registrations: D, Q and scalar classes for int16_t/int32_t.
2132 threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
2133 ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
2134 threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
2135 ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
2136 threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
2137 ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
# SQDMULH (vector)
2140 threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
2141 ("int16_t", "int32_t"), 2, sqdmulhCode)
2142 threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
2143 ("int16_t", "int32_t"), 4, sqdmulhCode)
2144 threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
2145 ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
# Signed doubling multiply long.  The fragment below stores the full
# 2 * srcElem1 * srcElem2 product in destElem.  When both sources equal the
# minimum Element value the product is replaced by ~(srcElem1 << bits)
# computed in BigElement.  The minimum-value test here carries an extra
# (Element) cast on the shift count compared with the SQDMULH fragment.
2146 # SQDMULL, SQDMULL2 (by element)
2148 FPSCR fpscr = (FPSCR) FpscrQc;
2149 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
2150 if (srcElem1 == srcElem2 &&
2151 srcElem1 == (Element)((Element)1 <<
2152 (Element)(sizeof(Element) * 8 - 1))) {
2153 destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
# By-element registrations.
# NOTE(review): every SQDMULL call passes True as the argument after the
# code, while the non-saturating SMULL calls omit it -- confirm against
# the threeRegLongInstX signature what that flag selects.
2158 threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
2159 ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
2160 threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
2161 ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2163 threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
2164 ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
2166 # SQDMULL, SQDMULL2 (vector)
2167 threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
2168 ("int16_t", "int32_t"), qdmullCode, True)
2169 threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
2170 ("int16_t", "int32_t"), qdmullCode, True, hi=True)
2171 threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
2172 ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
2175 FPSCR fpscr = (FPSCR) FpscrQc;
2176 if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) {
2178 destElem = ~srcElem1;
2180 destElem = -srcElem1;
# SQNEG: negation with the most negative Element special-cased to ~srcElem1
# (see the snippet above).  The D form uses smallSignedTypes; the Q and
# scalar forms use signedTypes.
2184 twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
2186 twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
2188 twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
2189 sqnegCode, scalar=True)
# Signed rounding doubling multiply returning the high half.  The fragment
# below adds a rounding constant of 1 << (bits - 1) to
# 2 * srcElem1 * srcElem2 before shifting right by the Element width.  For
# the listed maxNeg/halfNeg source pairs the result is then replaced by
# either mask(bits - 1) or 1 << (bits - 1).
# NOTE(review): the condition choosing between those two values is not
# visible here.
2190 # SQRDMULH (by element)
2192 FPSCR fpscr = (FPSCR) FpscrQc;
2193 destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
2194 ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
2195 (sizeof(Element) * 8);
2196 Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1);
2197 Element halfNeg = maxNeg / 2;
2198 if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
2199 (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
2200 (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
2202 destElem = mask(sizeof(Element) * 8 - 1);
2204 destElem = (Element)1 << (sizeof(Element) * 8 - 1);
# By-element registrations: D, Q and scalar classes for int16_t/int32_t.
2210 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
2211 ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
2212 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
2213 ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
2214 threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
2215 ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
# SQRDMULH (vector)
2218 threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
2219 ("int16_t", "int32_t"), 2, sqrdmulhCode)
2220 threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
2221 ("int16_t", "int32_t"), 4, sqrdmulhCode)
2222 threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
2223 ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
2226 int16_t shiftAmt = (int8_t)srcElem2;
2227 FPSCR fpscr = (FPSCR) FpscrQc;
2229 shiftAmt = -shiftAmt;
2231 if (shiftAmt <= sizeof(Element) * 8)
2232 rBit = bits(srcElem1, shiftAmt - 1);
2233 if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
2235 if (shiftAmt >= sizeof(Element) * 8) {
2236 shiftAmt = sizeof(Element) * 8 - 1;
2239 destElem = (srcElem1 >> shiftAmt);
2241 // Make sure the right shift sign extended when it should.
2242 if (srcElem1 < 0 && destElem >= 0) {
2243 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2247 } else if (shiftAmt > 0) {
2249 if (shiftAmt >= sizeof(Element) * 8) {
2255 if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2256 sizeof(Element) * 8 - 1 - shiftAmt) !=
2257 ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2260 destElem = srcElem1 << shiftAmt;
2265 destElem = mask(sizeof(Element) * 8 - 1);
2267 destElem = ~destElem;
2270 destElem = srcElem1;
# SQRSHL: shift by a per-element register amount.  The snippet above takes the
# shift from the low byte of srcElem2, read as signed.  The branch that
# negates shiftAmt shifts right, taking rBit from bit (shiftAmt - 1) of the
# source and clamping the count to bits - 1.  The shiftAmt > 0 branch shifts
# left after comparing the source's top bits with its sign; otherwise
# srcElem1 is copied through.
# NOTE(review): these classes use op class SimdCmpOp, whereas SRSHL/SSHL use
# SimdShiftOp and SQSHL uses SimdAluOp -- confirm the op class is intended.
2274 threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
2276 threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
2278 threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
2279 sqrshlCode, scalar=True)
2282 FPSCR fpscr = (FPSCR) FpscrQc;
2283 if (imm > sizeof(srcElem1) * 8) {
2284 if (srcElem1 != 0 && srcElem1 != -1)
2288 BigElement mid = (srcElem1 >> (imm - 1));
2289 uint64_t rBit = mid & 0x1;
2291 mid |= -(mid & ((BigElement)1 <<
2292 (sizeof(BigElement) * 8 - 1 - imm)));
2294 if (mid != (Element)mid) {
2295 destElem = mask(sizeof(Element) * 8 - 1);
2297 destElem = ~destElem;
2303 if (srcElem1 != (Element)srcElem1) {
2304 destElem = mask(sizeof(Element) * 8 - 1);
2306 destElem = ~destElem;
2309 destElem = srcElem1;
# SQRSHRN / SQRSHRN2: right shift by an immediate, then narrow.  The snippet
# above shifts by imm - 1 first so the next bit out can be captured as rBit,
# re-extends the sign of the intermediate value, and checks whether the
# value survives truncation to Element (mid != (Element)mid).  When it does
# not, the replacement value starts as mask(bits - 1).
# NOTE(review): the hi=True class passes mnemonic "sqrshrn2", while Sqrshrun2X
# further down passes "sqrshrun" without the "2" -- confirm which is intended.
2314 twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
2315 sqrshrnCode, hasImm=True)
2316 twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
2317 sqrshrnCode, hasImm=True, hi=True)
2318 twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
2319 sqrshrnCode, hasImm=True, scalar=True)
# Unsigned-result variant: the fragment below tests whether any bit of the
# shifted value lies at or above the Element width (bits(mid, top, bits) != 0)
# and uses the all-ones value mask(bits) as a replacement.
2320 # SQRSHRUN, SQRSHRUN2
2322 FPSCR fpscr = (FPSCR) FpscrQc;
2323 if (imm > sizeof(srcElem1) * 8) {
2328 BigElement mid = (srcElem1 >> (imm - 1));
2329 uint64_t rBit = mid & 0x1;
2331 mid |= -(mid & ((BigElement)1 <<
2332 (sizeof(BigElement) * 8 - 1 - imm)));
2334 if (bits(mid, sizeof(BigElement) * 8 - 1,
2335 sizeof(Element) * 8) != 0) {
2339 destElem = mask(sizeof(Element) * 8);
2350 destElem = srcElem1;
# Vector, hi-half and scalar registrations, all taking an immediate.
2355 twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
2356 sqrshrunCode, hasImm=True)
2357 twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp",
2358 smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
2359 twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
2360 smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
2363 FPSCR fpscr = (FPSCR) FpscrQc;
2364 if (imm >= sizeof(Element) * 8) {
2365 if (srcElem1 != 0) {
2366 destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2368 destElem = ~destElem;
2374 destElem = (srcElem1 << imm);
2375 uint64_t topBits = bits((uint64_t)srcElem1,
2376 sizeof(Element) * 8 - 1,
2377 sizeof(Element) * 8 - 1 - imm);
2378 if (topBits != 0 && topBits != mask(imm + 1)) {
2379 destElem = (Element)1 << (sizeof(Element) * 8 - 1);
2381 destElem = ~destElem;
2385 destElem = srcElem1;
# SQSHL (immediate): left shift by imm.  In the snippet above, a shift of at
# least the Element width replaces any non-zero source.  Otherwise the top
# imm + 1 source bits must be all zeros or all ones (topBits == 0 or
# mask(imm + 1)) for the shifted value to be kept.  The replacement value
# starts from 1 << (bits - 1).
# NOTE(review): the condition guarding `destElem = ~destElem` is not visible
# here.
2389 twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
2390 sqshlImmCode, hasImm=True)
2391 twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
2392 sqshlImmCode, hasImm=True)
2393 twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
2394 sqshlImmCode, hasImm=True, scalar=True)
2397 int16_t shiftAmt = (int8_t)srcElem2;
2398 FPSCR fpscr = (FPSCR) FpscrQc;
2400 shiftAmt = -shiftAmt;
2401 if (shiftAmt >= sizeof(Element) * 8) {
2402 shiftAmt = sizeof(Element) * 8 - 1;
2405 destElem = (srcElem1 >> shiftAmt);
2407 // Make sure the right shift sign extended when it should.
2408 if (srcElem1 < 0 && destElem >= 0) {
2409 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2412 } else if (shiftAmt > 0) {
2414 if (shiftAmt >= sizeof(Element) * 8) {
2420 if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
2421 sizeof(Element) * 8 - 1 - shiftAmt) !=
2422 ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
2425 destElem = srcElem1 << shiftAmt;
2430 destElem = mask(sizeof(Element) * 8 - 1);
2432 destElem = ~destElem;
2435 destElem = srcElem1;
# SQSHL (register): the shift amount is the low byte of srcElem2, read as
# signed.  The snippet above has the same shape as the SQRSHL one, but no
# rBit rounding term appears in the right-shift branch.
2439 threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
2441 threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
2443 threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
2444 sqshlCode, scalar=True)
2447 FPSCR fpscr = (FPSCR) FpscrQc;
2448 if (imm >= sizeof(Element) * 8) {
2452 } else if (srcElem1 > 0) {
2453 destElem = mask(sizeof(Element) * 8);
2459 destElem = (srcElem1 << imm);
2460 uint64_t topBits = bits((uint64_t)srcElem1,
2461 sizeof(Element) * 8 - 1,
2462 sizeof(Element) * 8 - imm);
2466 } else if (topBits != 0) {
2467 destElem = mask(sizeof(Element) * 8);
2475 destElem = srcElem1;
# SQSHLU: left shift of a signed source with an all-ones replacement value,
# mask(bits).  In the snippet above a positive source shifted by at least the
# Element width, or a source whose top imm bits are non-zero, is replaced by
# that value.
# NOTE(review): the branches taken before each `else if` are not visible here.
2480 twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
2481 sqshluCode, hasImm=True)
2482 twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
2483 sqshluCode, hasImm=True)
2484 twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
2485 sqshluCode, hasImm=True, scalar=True)
2488 FPSCR fpscr = (FPSCR) FpscrQc;
2489 if (imm > sizeof(srcElem1) * 8) {
2490 if (srcElem1 != 0 && srcElem1 != -1)
2494 BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2495 mid |= -(mid & ((BigElement)1 <<
2496 (sizeof(BigElement) * 8 - 1 - imm)));
2497 if (mid != (Element)mid) {
2498 destElem = mask(sizeof(Element) * 8 - 1);
2500 destElem = ~destElem;
2506 destElem = srcElem1;
# SQSHRN / SQSHRN2: right shift by an immediate, then narrow, without
# rounding.  The snippet above performs the shift in two steps, by imm - 1
# and then by 1, re-extends the sign, and checks whether the value survives
# truncation to Element.
# NOTE(review): the hi=True class passes mnemonic "sqshrn2", while Sqshrun2X
# below passes "sqshrun" without the "2" -- confirm which is intended.
2510 twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
2511 sqshrnCode, hasImm=True)
2512 twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
2513 sqshrnCode, hasImm=True, hi=True)
2514 twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
2515 sqshrnCode, hasImm=True, scalar=True)
2518 FPSCR fpscr = (FPSCR) FpscrQc;
2519 if (imm > sizeof(srcElem1) * 8) {
2524 BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
2525 if (bits(mid, sizeof(BigElement) * 8 - 1,
2526 sizeof(Element) * 8) != 0) {
2530 destElem = mask(sizeof(Element) * 8);
2537 destElem = srcElem1;
# SQSHRUN / SQSHRUN2: as above but with an unsigned result.  Any bit of the
# shifted value at or above the Element width triggers the replacement
# value mask(bits).
2541 twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
2542 sqshrunCode, hasImm=True)
2543 twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
2544 sqshrunCode, hasImm=True, hi=True)
2545 twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
2546 sqshrunCode, hasImm=True, scalar=True)
2549 destElem = srcElem1 - srcElem2;
2550 FPSCR fpscr = (FPSCR) FpscrQc;
2551 bool negDest = (destElem < 0);
2552 bool negSrc1 = (srcElem1 < 0);
2553 bool posSrc2 = (srcElem2 >= 0);
2554 if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
2555 destElem = (Element)1 << (sizeof(Element) * 8 - 1);
# SQSUB: signed subtract with overflow detection.  The snippet above flags
# overflow when the sources have opposite signs (negSrc1 == posSrc2) and the
# result's sign differs from srcElem1's.  The replacement value starts from
# 1 << (bits - 1).
2562 threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
2564 threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
2566 threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
2567 sqsubCode, scalar=True)
2570 FPSCR fpscr = (FPSCR) FpscrQc;
2571 destElem = srcElem1;
2572 if ((BigElement)destElem != srcElem1) {
2574 destElem = mask(sizeof(Element) * 8 - 1);
2576 destElem = ~destElem;
# SQXTN / SQXTN2: narrowing copy.  The snippet above assigns srcElem1 to the
# narrower destElem and detects loss by widening it back and comparing.
# The replacement value starts as mask(bits - 1).
2580 twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
2582 twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
2584 twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
2585 sqxtnCode, scalar=True)
2588 FPSCR fpscr = (FPSCR) FpscrQc;
2589 destElem = srcElem1;
2591 ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
2593 destElem = mask(sizeof(Element) * 8);
2595 destElem = ~destElem;
# SQXTUN / SQXTUN2: narrowing copy with an unsigned result.  The visible part
# of the test compares the zero-extended low bits of destElem with srcElem1
# and uses mask(bits) as the replacement value.
# NOTE(review): the first half of that condition is not visible here.
2599 twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
2601 twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
2602 sqxtunCode, hi=True)
2603 twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
2604 sqxtunCode, scalar=True)
2608 (((unsigned)srcElem1 & 0x1) +
2609 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
2610 // Use division instead of a shift to ensure the sign extension works
2611 // right. The compiler will figure out if it can be a shift. Mask the
2612 // inputs so they get truncated correctly.
2613 destElem = (((srcElem1 & ~(Element)1) / 2) +
2614 ((srcElem2 & ~(Element)1) / 2)) + carryBit;
# SRHADD: rounding halving add.  The snippet above halves each source with its
# low bit cleared and adds carryBit, which is computed from the two low bits
# plus one, shifted right by one.  The same Code variable is presumably reused
# by URHADD later in the file -- confirm, as its arguments are not visible.
2616 threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
2618 threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
2622 if (imm >= sizeof(Element) * 8)
2623 destElem = destElem;
2625 destElem = (srcElem1 >> imm) |
2626 (destElem & ~mask(sizeof(Element) * 8 - imm));
# SRI: shift right and insert.  For imm below the Element width the snippet
# above keeps the top imm bits of destElem and ORs in srcElem1 >> imm.  For
# larger imm destElem is left unchanged (the self-assignment).  Registered
# for unsignedTypes.
2628 twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
2630 twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
2634 int16_t shiftAmt = (int8_t)srcElem2;
2636 shiftAmt = -shiftAmt;
2638 if (shiftAmt <= sizeof(Element) * 8)
2639 rBit = bits(srcElem1, shiftAmt - 1);
2640 if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
2642 if (shiftAmt >= sizeof(Element) * 8) {
2643 shiftAmt = sizeof(Element) * 8 - 1;
2646 destElem = (srcElem1 >> shiftAmt);
2648 // Make sure the right shift sign extended when it should.
2649 if (ltz(srcElem1) && !ltz(destElem)) {
2650 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2654 } else if (shiftAmt > 0) {
2655 if (shiftAmt >= sizeof(Element) * 8) {
2658 destElem = srcElem1 << shiftAmt;
2661 destElem = srcElem1;
# SRSHL: rounding shift by a per-element register amount.  The snippet above
# reads the shift from the low byte of srcElem2 as signed, takes rBit from
# bit (shiftAmt - 1) in the right-shift branch, and uses ltz() for the sign
# tests.  FpscrQc is not touched in the visible lines, unlike the SQRSHL
# snippet.
2664 threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
2666 threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
2670 if (imm > sizeof(srcElem1) * 8) {
2673 Element rBit = bits(srcElem1, imm - 1);
2674 destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
# SRSHR: rounding shift right by an immediate.  In the snippet above rBit is
# bit (imm - 1) of the source and is added to the source shifted right by imm,
# done as a shift by imm - 1 followed by a shift by 1.  rshrCode is also used
# by the URSHR registrations later in the file.
2676 destElem = srcElem1;
2679 twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
2680 rshrCode, hasImm=True)
2681 twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
2682 rshrCode, hasImm=True)
2685 if (imm > sizeof(srcElem1) * 8) {
2688 Element rBit = bits(srcElem1, imm - 1);
2689 destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
2691 destElem += srcElem1;
# SRSRA: rounding shift right and accumulate.  The snippet above is the same
# computation as rshrCode, but it adds into destElem instead of overwriting.
2694 twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
2695 rsraCode, True, hasImm=True)
2696 twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
2697 rsraCode, True, hasImm=True)
2700 int16_t shiftAmt = (int8_t)srcElem2;
2702 shiftAmt = -shiftAmt;
2703 if (shiftAmt >= sizeof(Element) * 8) {
2704 shiftAmt = sizeof(Element) * 8 - 1;
2707 destElem = (srcElem1 >> shiftAmt);
2709 // Make sure the right shift sign extended when it should.
2710 if (ltz(srcElem1) && !ltz(destElem)) {
2711 destElem |= -((Element)1 << (sizeof(Element) * 8 -
2715 if (shiftAmt >= sizeof(Element) * 8) {
2718 destElem = srcElem1 << shiftAmt;
# SSHL: shift by a per-element register amount, with no rounding term in the
# visible lines.  The shift comes from the low byte of srcElem2 read as
# signed.  The branch that negates it shifts right with the count clamped to
# bits - 1; the other visible branch shifts left.
2722 threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
2724 threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
2728 if (imm >= sizeof(destElem) * 8) {
2731 destElem = (BigElement)srcElem1 << imm;
# SSHLL / SSHLL2: widening left shift by an immediate.  The source is cast to
# BigElement before shifting, and the range test uses the width of destElem.
2734 twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
2735 shllCode, hasImm=True)
2736 twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes,
2737 shllCode, hasImm=True, hi=True)
2740 if (imm >= sizeof(srcElem1) * 8) {
2746 destElem = srcElem1 >> imm;
# SSHR: right shift by an immediate, with a separate path for imm >= the
# source width.
# NOTE(review): the body of that path is not visible here.
2749 twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
2751 twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
2756 if (imm >= sizeof(srcElem1) * 8) {
2757 mid = ltz(srcElem1) ? -1 : 0;
2759 mid = srcElem1 >> imm;
2760 if (ltz(srcElem1) && !ltz(mid)) {
2761 mid |= -(mid & ((Element)1 <<
2762 (sizeof(Element) * 8 - 1 - imm)));
# SSRA: shift right and accumulate.  The snippet above builds the shifted
# value in `mid`: all ones or zero when imm covers the whole source, otherwise
# srcElem1 >> imm with a sign fix-up.
# NOTE(review): the line that folds `mid` into destElem is not visible here.
2767 twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
2769 twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
# SSUBL / SSUBW: sublwCode widens both operands to BigElement before
# subtracting.  It is presumably passed to both the long (threeRegLongInstX)
# and wide (threeRegWideInstX) registrations below -- confirm, as their
# trailing arguments are not visible here.
2772 sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
2773 threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes,
2775 threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes,
2778 threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes,
2780 threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes,
# SUB: plain element-wise subtraction, registered for unsignedTypes in both
# the D (2) and Q (4) forms.
2783 subCode = "destElem = srcElem1 - srcElem2;"
2784 threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode)
2785 threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode)
2788 destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
2789 (sizeof(Element) * 8);
# SUBHN / SUBHN2: per the snippet above, the widened difference is shifted
# right by the Element width.
2791 threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
2793 threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
2797 FPSCR fpscr = (FPSCR) FpscrQc;
2798 Element tmp = destElem + srcElem1;
2799 if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
2800 if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
2801 tmp < srcElem1 || tmp < destElem) {
2802 destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
2808 Element absDestElem = (~destElem) + 1;
2809 if (absDestElem < srcElem1) {
2810 // Still check for positive sat., no need to check for negative sat.
2811 if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
2812 destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1;
# SUQADD: accumulate srcElem1 into destElem (tmp = destElem + srcElem1).  The
# classes are registered with unsigned element types, so the snippet above
# reads the sign of the accumulator from its top bit.  Both visible
# replacement assignments write (1 << (bits - 1)) - 1.
2823 twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2825 twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
2827 twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
2828 suqaddCode, True, scalar=True)
2829 # SXTL -> alias to SSHLL
# TBL / TBX: register every table-lookup class.  The table length runs from
# 1 to 4, and each length gets a DX class (trailing argument 2) and a QX
# class (trailing argument 4).  The flag string is "true" for TBL and "false"
# for TBX.  The calls are issued in the same order as an explicit listing:
# every TBL class first (Tbl1DX, Tbl1QX, Tbl2DX, ...), then every TBX class.
for _tblMnem, _tblFlag in (("tbl", "true"), ("tbx", "false")):
    for _tblLen in (1, 2, 3, 4):
        for _tblSuffix, _tblRegs in (("DX", 2), ("QX", 4)):
            # Class names follow <Mnemonic><length><suffix>, e.g. "Tbx3QX".
            _tblClass = "%s%d%s" % (_tblMnem.capitalize(), _tblLen,
                                    _tblSuffix)
            tbxTblInstX(_tblMnem, _tblClass, "SimdMiscOp", ("uint8_t",),
                        _tblLen, _tblFlag, _tblRegs)
2851 for (unsigned i = 0; i < eCount / 2; i++) {
2852 destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
2853 destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
# TRN1 / TRN2: the loop above fills destReg slots 2*i and 2*i + 1 with element
# (2*i + part) of srcReg1 and srcReg2 respectively.
# NOTE(review): `part` is not defined in the visible lines -- presumably 0 for
# TRN1 and 1 for TRN2; confirm against the code strings passed below.
2856 threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
2858 threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
2861 threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
2863 threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
# Unsigned absolute-difference and add registrations (UABA through UADDW).
# They reuse Code variables that are not assigned in this stretch (abalCode,
# abdlCode, addAcrossLongCode, ...).
# NOTE(review): most calls here end on a trailing comma, so their final
# arguments are not visible -- confirm before editing.
# UABA
2866 threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2,
2868 threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4,
# UABAL, UABAL2
2871 threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes,
2873 threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes,
2874 abalCode, True, hi=True)
# UABD
2876 threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2,
2878 threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4,
# UABDL, UABDL2
# NOTE(review): UABDL, UADDL and UADDW use op class SimdAddAccOp although
# UABD uses SimdAddOp -- confirm the accumulate op class is intended.
2881 threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes,
2883 threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes,
2884 abdlCode, True, hi=True)
# UADALP
2886 twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes,
2888 twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes,
# UADDL, UADDL2
2891 threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes,
2893 threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes,
# UADDLP
2896 twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes,
2898 twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes,
# UADDLV: the D and Q forms cover uint8_t/uint16_t.  The uint32_t case gets
# its own Q-form class (UaddlvBQX), registered with doubleDest=True.
2901 twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp",
2902 ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True)
2903 twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp",
2904 ("uint8_t", "uint16_t"), 4, addAcrossLongCode, long=True)
2905 twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
2906 addAcrossLongCode, doubleDest=True, long=True)
# UADDW, UADDW2
2908 threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes,
2910 threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes,
# UCVTF: conversion to floating point via fplibFixedToFP.  fpOp is a
# %-format template that is not assigned in this stretch; the conversion
# expression is substituted into it.  The fixed-point form passes imm as
# the second argument.
# NOTE(review): the literal `true` third argument presumably selects the
# unsigned interpretation -- confirm against fplibFixedToFP.
2912 # UCVTF (fixed-point)
2913 ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
2914 " FPCRRounding(fpscr), fpscr)")
2915 twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
2916 ucvtfFixedCode, hasImm=True)
2917 twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
2918 ucvtfFixedCode, hasImm=True)
2919 twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
2920 ucvtfFixedCode, hasImm=True, scalar=True)
# UCVTF (integer): same conversion call with the second argument fixed at 0,
# and the classes are registered without hasImm.
2922 ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
2923 " FPCRRounding(fpscr), fpscr)")
2924 twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
2926 twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
2928 twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
2929 ucvtfIntCode, scalar=True)
# Unsigned halving add/subtract and min/max registrations.  maxCode and
# minCode are not assigned in this stretch.
# NOTE(review): calls ending on a trailing comma have arguments that are not
# visible here.
# UHADD
2931 threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2,
2933 threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4,
# UHSUB
2936 threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2,
2938 threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4,
# UMAX
2941 threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2,
2943 threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4,
# UMAXP: pairwise form of the same maxCode.
2946 threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2947 maxCode, pairwise=True)
2948 threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2949 maxCode, pairwise=True)
# UMAXV: the D form lists only uint8_t/uint16_t; the Q form uses
# smallUnsignedTypes.
2951 twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2953 twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4,
# UMIN
2956 threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2,
2958 threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4,
# UMINP: pairwise form of the same minCode.
2961 threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2,
2962 minCode, pairwise=True)
2963 threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4,
2964 minCode, pairwise=True)
# UMINV: same type split as UMAXV.
2966 twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
2968 twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4,
# Unsigned widening multiply family.  These registrations reuse mlalCode,
# mlslCode and mullCode with unsigned element type lists.
# NOTE(review): the by-element forms use smallUnsignedTypes -- presumably that
# list includes uint8_t; confirm the 8-bit by-element instantiation is
# intended.
2970 # UMLAL (by element)
2971 threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp",
2972 smallUnsignedTypes, mlalCode, True, byElem=True)
2973 threeRegLongInstX("umlal", "UmlalElem2X", "SimdMultAccOp",
2974 smallUnsignedTypes, mlalCode, True, byElem=True, hi=True)
# UMLAL (vector)
2976 threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes,
2978 threeRegLongInstX("umlal", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes,
2979 mlalCode, True, hi=True)
2980 # UMLSL (by element)
2981 threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp",
2982 smallUnsignedTypes, mlslCode, True, byElem=True)
2983 threeRegLongInstX("umlsl", "UmlslElem2X", "SimdMultAccOp",
2984 smallUnsignedTypes, mlslCode, True, byElem=True, hi=True)
# UMLSL (vector)
2986 threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes,
2988 threeRegLongInstX("umlsl", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes,
2989 mlslCode, True, hi=True)
# UMOV: the 'W' class covers smallUnsignedTypes; the 'X' class is registered
# for uint64_t only.
2991 insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
2992 insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X')
2993 # UMULL, UMULL2 (by element)
2994 threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes,
2995 mullCode, byElem=True)
2996 threeRegLongInstX("umull", "UmullElem2X", "SimdMultOp", smallUnsignedTypes,
2997 mullCode, byElem=True, hi=True)
2998 # UMULL, UMULL2 (vector)
2999 threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes,
3001 threeRegLongInstX("umull", "Umull2X", "SimdMultOp", smallUnsignedTypes,
3005 destElem = srcElem1 + srcElem2;
3006 FPSCR fpscr = (FPSCR) FpscrQc;
3007 if (destElem < srcElem1 || destElem < srcElem2) {
3008 destElem = (Element)(-1);
# UQADD: unsigned add with wrap detection.  In the snippet above a sum smaller
# than either source is replaced by the all-ones value (Element)(-1).
2# placeholder removed
3021 int16_t shiftAmt = (int8_t)srcElem2;
3022 FPSCR fpscr = (FPSCR) FpscrQc;
3024 shiftAmt = -shiftAmt;
3026 if (shiftAmt <= sizeof(Element) * 8)
3027 rBit = bits(srcElem1, shiftAmt - 1);
3028 if (shiftAmt >= sizeof(Element) * 8) {
3029 shiftAmt = sizeof(Element) * 8 - 1;
3032 destElem = (srcElem1 >> shiftAmt);
3036 if (shiftAmt >= sizeof(Element) * 8) {
3037 if (srcElem1 != 0) {
3038 destElem = mask(sizeof(Element) * 8);
3044 if (bits(srcElem1, sizeof(Element) * 8 - 1,
3045 sizeof(Element) * 8 - shiftAmt)) {
3046 destElem = mask(sizeof(Element) * 8);
3049 destElem = srcElem1 << shiftAmt;
# UQRSHL: unsigned shift by a per-element register amount.  The snippet above
# reads the shift from the low byte of srcElem2 as signed.  The branch that
# negates it shifts right with rBit taken from bit (shiftAmt - 1).  The
# left-shift path replaces the result with mask(bits) when the shift covers
# the whole element and the source is non-zero, or when any of the top
# shiftAmt source bits is set.
# NOTE(review): op class is SimdCmpOp here, whereas URSHL uses SimdShiftOp and
# UQSHL uses SimdAluOp -- confirm the op class is intended.
3055 threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
3057 threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
3059 threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
3060 uqrshlCode, scalar=True)
3063 FPSCR fpscr = (FPSCR) FpscrQc;
3064 if (imm > sizeof(srcElem1) * 8) {
3069 BigElement mid = (srcElem1 >> (imm - 1));
3070 uint64_t rBit = mid & 0x1;
3073 if (mid != (Element)mid) {
3074 destElem = mask(sizeof(Element) * 8);
3080 if (srcElem1 != (Element)srcElem1) {
3081 destElem = mask(sizeof(Element) * 8 - 1);
3084 destElem = srcElem1;
# UQRSHRN / UQRSHRN2: unsigned right shift by an immediate, then narrow.  The
# snippet above shifts by imm - 1 to capture rBit and replaces values that do
# not survive truncation to Element.
# NOTE(review): the first replacement uses mask(bits), but the second (the
# path comparing srcElem1 itself) uses mask(bits - 1).  For an unsigned
# result the "- 1" looks inconsistent -- confirm whether that path is
# reachable and which value is intended.
# NOTE(review): the hi=True class passes mnemonic "uqrshrn2", while the
# other two classes pass "uqrshrn".
3089 twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
3090 uqrshrnCode, hasImm=True)
3091 twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
3092 smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
3093 twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
3094 smallUnsignedTypes, uqrshrnCode, hasImm=True,
3098 FPSCR fpscr = (FPSCR) FpscrQc;
3099 if (imm >= sizeof(Element) * 8) {
3100 if (srcElem1 != 0) {
3101 destElem = mask(sizeof(Element) * 8);
3107 destElem = (srcElem1 << imm);
3108 uint64_t topBits = bits((uint64_t)srcElem1,
3109 sizeof(Element) * 8 - 1,
3110 sizeof(Element) * 8 - imm);
3112 destElem = mask(sizeof(Element) * 8);
3116 destElem = srcElem1;
# UQSHL (immediate): unsigned left shift by imm.  In the snippet above a shift
# of at least the Element width replaces any non-zero source with mask(bits).
# Otherwise topBits collects the top imm source bits, which would be shifted
# out.
# NOTE(review): the test on topBits is not visible here.
3120 twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
3121 uqshlImmCode, hasImm=True)
3122 twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
3123 uqshlImmCode, hasImm=True)
3124 twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
3125 uqshlImmCode, hasImm=True, scalar=True)
3128 int16_t shiftAmt = (int8_t)srcElem2;
3129 FPSCR fpscr = (FPSCR) FpscrQc;
3131 shiftAmt = -shiftAmt;
3132 if (shiftAmt >= sizeof(Element) * 8) {
3133 shiftAmt = sizeof(Element) * 8 - 1;
3136 destElem = (srcElem1 >> shiftAmt);
3138 } else if (shiftAmt > 0) {
3139 if (shiftAmt >= sizeof(Element) * 8) {
3140 if (srcElem1 != 0) {
3141 destElem = mask(sizeof(Element) * 8);
3147 if (bits(srcElem1, sizeof(Element) * 8 - 1,
3148 sizeof(Element) * 8 - shiftAmt)) {
3149 destElem = mask(sizeof(Element) * 8);
3152 destElem = srcElem1 << shiftAmt;
3156 destElem = srcElem1;
# UQSHL (register): the shift amount is the low byte of srcElem2, read as
# signed.  The snippet above has the same shape as the UQRSHL one, but no
# rBit rounding term appears in the right-shift branch.
3160 threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
3162 threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
3164 threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
3165 uqshlCode, scalar=True)
3168 FPSCR fpscr = (FPSCR) FpscrQc;
3169 if (imm > sizeof(srcElem1) * 8) {
3174 BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
3175 if (mid != (Element)mid) {
3176 destElem = mask(sizeof(Element) * 8);
3182 destElem = srcElem1;
# UQSHRN: register three narrowing instruction classes generated from
# uqshrnCode over smallUnsignedTypes. All are in the SimdShiftOp operation
# class and all take an immediate operand (hasImm=True).
#   UqshrnX   - registered under the name "uqshrn".
#   Uqshrn2X  - registered under the name "uqshrn2" with hi=True.
#   UqshrnScX - registered under the name "uqshrn" with scalar=True.
3186 twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
3187 uqshrnCode, hasImm=True)
3188 twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
3189 uqshrnCode, hasImm=True, hi=True)
3190 twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
3191 uqshrnCode, hasImm=True, scalar=True)
3194 destElem = srcElem1 - srcElem2;
3195 FPSCR fpscr = (FPSCR) FpscrQc;
3196 if (destElem > srcElem1) {
# UQSUB: register the UqsubDX, UqsubQX and UqsubScX classes through
# threeEqualRegInstX, all under the name "uqsub" and in the SimdAddOp
# operation class. The DX class uses smallUnsignedTypes with size argument 2;
# the QX and ScX classes use unsignedTypes with size argument 4. The ScX
# class is generated from uqsubCode with scalar=True.
3202 threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
3204 threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
3206 threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
3207 uqsubCode, scalar=True)
3210 FPSCR fpscr = (FPSCR) FpscrQc;
3211 destElem = srcElem1;
3212 if ((BigElement)destElem != srcElem1) {
3214 destElem = mask(sizeof(Element) * 8);
# UQXTN: register the narrowing classes UqxtnX, Uqxtn2X and UqxtnScX over
# smallUnsignedTypes in the SimdMiscOp operation class. The ScX class is
# generated from uqxtnCode with scalar=True.
# NOTE(review): Uqxtn2X is registered under the name "uqxtn", whereas the
# other *2X narrowing registrations in this file (Uqrshrn2X, Uqshrn2X) use a
# "2"-suffixed name ("uqrshrn2", "uqshrn2"). Confirm whether this one should
# be "uqxtn2".
3218 twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
3220 twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
3222 twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
3223 uqxtnCode, scalar=True)
# Per-element code for URECPE: the destination element is whatever the
# unsignedRecipEstimate() helper returns for the source element.
3225 urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
3226 twoEqualRegInstX("urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2,
3228 twoEqualRegInstX("urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4,
3231 threeEqualRegInstX("urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes,
3233 threeEqualRegInstX("urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes,
3236 threeEqualRegInstX("urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2,
3238 threeEqualRegInstX("urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4,
# URSHR: register the UrshrDX (size argument 2) and UrshrQX (size argument 4)
# classes over unsignedTypes. Both are generated from rshrCode, belong to the
# SimdShiftOp operation class and take an immediate operand (hasImm=True).
3241 twoEqualRegInstX("urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2,
3242 rshrCode, hasImm=True)
3243 twoEqualRegInstX("urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4,
3244 rshrCode, hasImm=True)
# Per-element code for URSQRTE: the destination element is whatever the
# unsignedRSqrtEstimate() helper returns for the source element.
3246 ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
3247 twoEqualRegInstX("ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2,
3249 twoEqualRegInstX("ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4,
# URSRA: register the UrsraDX (size argument 2) and UrsraQX (size argument 4)
# classes over unsignedTypes. Both are generated from rsraCode, belong to the
# SimdShiftOp operation class and take an immediate operand (hasImm=True).
# NOTE(review): the positional True that follows the code string presumably
# tells the generator that the destination register is also read (this is an
# accumulate instruction) -- confirm against twoEqualRegInstX's signature.
3252 twoEqualRegInstX("ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2,
3253 rsraCode, True, hasImm=True)
3254 twoEqualRegInstX("ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4,
3255 rsraCode, True, hasImm=True)
3257 threeEqualRegInstX("ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2,
3259 threeEqualRegInstX("ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4,
3262 twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes,
3263 shllCode, hasImm=True)
3264 twoRegLongInstX("ushll", "Ushll2X", "SimdShiftOp", smallUnsignedTypes,
3265 shllCode, hi=True, hasImm=True)
# USHR: register the UshrDX (size argument 2) and UshrQX (size argument 4)
# classes over unsignedTypes. Both are generated from shrCode, belong to the
# SimdShiftOp operation class and take an immediate operand (hasImm=True).
3267 twoEqualRegInstX("ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2,
3268 shrCode, hasImm=True)
3269 twoEqualRegInstX("ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4,
3270 shrCode, hasImm=True)
3273 FPSCR fpscr = (FPSCR) FpscrQc;
3274 Element tmp = destElem + srcElem1;
3275 if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
3276 if (tmp < srcElem1 || tmp < destElem) {
3277 destElem = (Element)(-1);
3283 Element absSrcElem1 = (~srcElem1) + 1;
3284 if (absSrcElem1 > destElem) {
# USQADD: register the UsqaddDX, UsqaddQX and UsqaddScX classes through
# twoEqualRegInstX, all under the name "usqadd" and in the SimdAddOp
# operation class. The DX class uses smallUnsignedTypes with size argument 2;
# the QX and ScX classes use unsignedTypes with size argument 4. The ScX
# class is generated from usqaddCode with scalar=True.
# NOTE(review): the positional True after usqaddCode presumably marks the
# destination as an input as well -- confirm against twoEqualRegInstX's
# signature.
3293 twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
3295 twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
3297 twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
3298 usqaddCode, True, scalar=True)
# USRA: register the UsraDX (size argument 2) and UsraQX (size argument 4)
# classes over unsignedTypes. Both are generated from sraCode, belong to the
# SimdShiftOp operation class and take an immediate operand (hasImm=True).
# NOTE(review): the positional True that follows the code string presumably
# tells the generator that the destination register is also read (this is an
# accumulate instruction) -- confirm against twoEqualRegInstX's signature.
3300 twoEqualRegInstX("usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2,
3301 sraCode, True, hasImm=True)
3302 twoEqualRegInstX("usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4,
3303 sraCode, True, hasImm=True)
3305 threeRegLongInstX("usubl", "UsublX", "SimdAddOp", smallUnsignedTypes,
3307 threeRegLongInstX("usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes,
3310 threeRegWideInstX("usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes,
3312 threeRegWideInstX("usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes,
3314 # UXTL -> alias to USHLL
3318 for (unsigned i = 0; i < eCount / 2; i++) {
3319 destReg.elements[i] = srcReg1.elements[2 * i + part];
3320 destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
3323 threeRegScrambleInstX("Uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
3325 threeRegScrambleInstX("Uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
3328 threeRegScrambleInstX("Uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
3330 threeRegScrambleInstX("Uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
# XTN: the per-element code is a plain assignment of the source element to
# the destination element; XtnX is generated from it through
# twoRegNarrowInstX over smallUnsignedTypes in the SimdMiscOp operation class.
# NOTE(review): the name string "Xtn" is capitalised, unlike the lower-case
# names used by most other registrations in this file (e.g. "uqxtn",
# "ushr") -- confirm whether this is intentional.
3333 xtnCode = "destElem = srcElem1;"
3334 twoRegNarrowInstX("Xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
3335 twoRegNarrowInstX("Xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
3340 for (unsigned i = 0; i < eCount / 2; i++) {
3341 destReg.elements[2 * i] = srcReg1.elements[base + i];
3342 destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
# ZIP1 / ZIP2: register the D and Q classes of both instructions through
# threeRegScrambleInstX in the SimdAluOp operation class. The DX classes use
# smallUnsignedTypes with size argument 2; the QX classes use unsignedTypes
# with size argument 4.
# The ZIP2 classes are generated from zipCode with the string "eCount / 2"
# substituted into its format placeholder via the % operator.
3345 threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
3347 threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
3350 threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
3351 zipCode % "eCount / 2")
3352 threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
3353 zipCode % "eCount / 2")