3 // Copyright (c) 2012-2014 ARM Limited
6 // The license below extends only to copyright in the software and shall
7 // not be construed as granting a license to any other intellectual
8 // property including but not limited to intellectual property relating
9 // to a hardware implementation of the functionality of the software
10 // licensed hereunder. You may use the software subject to the license
11 // terms below provided that you ensure that this notice is replicated
12 // unmodified and in its entirety in all distributions of the software,
13 // modified or unmodified, in source code or in binary form.
15 // Redistribution and use in source and binary forms, with or without
16 // modification, are permitted provided that the following conditions are
17 // met: redistributions of source code must retain the above copyright
18 // notice, this list of conditions and the following disclaimer;
19 // redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution;
22 // neither the name of the copyright holders nor the names of its
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
26 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 // Authors: Mbou Eyole
47 def mkMemAccMicroOp(name):
# Generate the load and the store flavour of a NEON memory-access micro-op.
#
# name: base mnemonic; the load variant is registered as name + 'ld' and
#       the store variant as name + 'st'.
#
# Nothing is returned. The substituted templates are appended to the
# module-level exec_output and header_output strings. decoder_output is
# declared global as well, but no visible statement in this function
# assigns to it.
48 global header_output, decoder_output, exec_output
# Prologue of the effective-address code: when the base register is SP,
# bits 3..0 of XURa are non-zero and SP alignment checking is enabled, the
# generated code returns an SPAlignmentFault.
49 SPAlignmentCheckCodeNeon = '''
50 if (baseIsSP && bits(XURa, 3, 0) &&
51 SPAlignmentCheckEnabled(xc->tcBase())) {
52 return std::make_shared<SPAlignmentFault>();
55 eaCode = SPAlignmentCheckCodeNeon + '''
59 const int MaxNumBytes = 16;
61 uint8_t bytes[MaxNumBytes];
62 uint32_t floatRegBits[MaxNumBytes / 4];
66 # Do endian conversion for all the elements
# The generated code packs the four 32-bit memUnion.floatRegBits words into
# x.lo / x.hi, rewrites each of the eCount = 16 / (1 << eSize) elements
# through gtobe() in the loop guarded by isBigEndian64() and through
# gtole() in the second loop, then splits x back into floatRegBits.
# NOTE(review): 64-, 32- and 16-bit elements are cast explicitly; the
# remaining case is only marked "Nothing to do here" -- presumably
# single-byte elements, confirm against the eSize dispatch.
70 x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
71 (XReg) memUnion.floatRegBits[0];
72 x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
73 (XReg) memUnion.floatRegBits[2];
75 const unsigned eCount = 16 / (1 << eSize);
77 if (isBigEndian64(xc->tcBase())) {
78 for (unsigned i = 0; i < eCount; i++) {
81 writeVecElem(&x, (XReg) gtobe(
82 (uint64_t) readVecElem(x, i, eSize)), i, eSize);
85 writeVecElem(&x, (XReg) gtobe(
86 (uint32_t) readVecElem(x, i, eSize)), i, eSize);
89 writeVecElem(&x, (XReg) gtobe(
90 (uint16_t) readVecElem(x, i, eSize)), i, eSize);
93 break; // Nothing to do here
97 for (unsigned i = 0; i < eCount; i++) {
100 writeVecElem(&x, (XReg) gtole(
101 (uint64_t) readVecElem(x, i, eSize)), i, eSize);
104 writeVecElem(&x, (XReg) gtole(
105 (uint32_t) readVecElem(x, i, eSize)), i, eSize);
108 writeVecElem(&x, (XReg) gtole(
109 (uint16_t) readVecElem(x, i, eSize)), i, eSize);
112 break; // Nothing to do here
117 memUnion.floatRegBits[0] = (uint32_t) x.lo;
118 memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
119 memUnion.floatRegBits[2] = (uint32_t) x.hi;
120 memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
123 # Offload everything into registers
# Two mirror-image snippets are built with %-formatting on 'reg': the one
# here assigns gtoh(memUnion.floatRegBits[reg]) to AA64FpDestP<reg>_uw, and
# the "pull" snippet further down assigns htog(AA64FpDestP<reg>_uw) back to
# memUnion.floatRegBits[reg].
# NOTE(review): presumably these accumulate into regSetCode and regGetCode
# respectively -- confirm against the assignments.
127 AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
128 ''' % { 'reg' : reg }
130 # Pull everything in from registers
134 memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
135 ''' % { 'reg' : reg }
# Ordering differs by direction: the load access code runs the endian
# conversion first and regSetCode afterwards, while the store access code
# runs regGetCode first and the conversion afterwards.
137 loadMemAccCode = convCode + regSetCode
138 storeMemAccCode = regGetCode + convCode
# Both InstObjParams share memDecl and the same EA code (the SIMD-enabled
# check followed by eaCode); they differ in the memacc code and in the
# IsLoad / IsStore flag.
140 loadIop = InstObjParams(name + 'ld',
143 { 'mem_decl' : memDecl,
144 'memacc_code' : loadMemAccCode,
145 'ea_code' : simd64EnabledCheckCode + eaCode,
147 [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
148 storeIop = InstObjParams(name + 'st',
151 { 'mem_decl' : memDecl,
152 'memacc_code' : storeMemAccCode,
153 'ea_code' : simd64EnabledCheckCode + eaCode,
155 [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
# Instantiate the three load templates with loadIop and the three store
# templates with storeIop, in that order.
157 exec_output += NeonLoadExecute64.subst(loadIop) + \
158 NeonLoadInitiateAcc64.subst(loadIop) + \
159 NeonLoadCompleteAcc64.subst(loadIop) + \
160 NeonStoreExecute64.subst(storeIop) + \
161 NeonStoreInitiateAcc64.subst(storeIop) + \
162 NeonStoreCompleteAcc64.subst(storeIop)
# The load and the store class are declared with the same template.
163 header_output += MicroNeonMemDeclare64.subst(loadIop) + \
164 MicroNeonMemDeclare64.subst(storeIop)
166 def mkMarshalMicroOp(name, Name, numRegs=4):
# Generate one NEON marshalling micro-op class.
#
# name:    selects the code generator that runs. Four values are handled:
#            'deint_neon_uop'  - input from the scratch area, output to the
#                                architectural SIMD registers
#            'int_neon_uop'    - input from the architectural SIMD
#                                registers, output to the scratch area
#            'unpack_neon_uop' - input from the scratch area, output to the
#                                architectural SIMD registers (lane op)
#            'pack_neon_uop'   - input from the architectural SIMD
#                                registers, output to the scratch area
#                                (lane op; reads element 'lane' of each
#                                input register)
#          (directions as stated by the comments inside the generated C++).
# Name:    class name handed to InstObjParams.
# numRegs: bound of the range(numRegs) loops that build the input-gathering
#          snippets; defaults to 4.
#
# Nothing is returned; the substituted templates are appended to the
# module-level header_output and exec_output strings. The deint/int ops use
# base class 'MicroNeonMixOp64' with op class 'No_OpClass' and the
# MicroNeonMixDeclare64 header template; the unpack/pack ops use base class
# 'MicroNeonMixLaneOp64' with only 'code' and the IsMicroop flag, and the
# MicroNeonMixLaneDeclare64 header template. All four share the
# MicroNeonMixExecute64 execute template.
167 global header_output, decoder_output, exec_output
# Input-gathering snippets: each appended piece is a writeVecElem() into
# input[v]. The 'L' flavour reads AA64FpOp1P<p>V<v>_uw, the 'S' flavour
# reads AA64FpOp1P<p>V<v>S_uw; 'v' and 'p' are filled in by %-formatting.
169 getInputCodeOp1L = ''
170 for v in range(numRegs):
172 getInputCodeOp1L += '''
173 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
175 ''' % { 'v' : v, 'p' : p }
177 getInputCodeOp1S = ''
178 for v in range(numRegs):
180 getInputCodeOp1S += '''
181 writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
183 ''' % { 'v' : v, 'p' : p }
# De-interleave op: gathers its input with the 'L' snippet.
185 if name == 'deint_neon_uop':
188 // input data from scratch area
189 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
190 VReg output[2]; // output data to arch. SIMD regs
197 writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
199 eCode += getInputCodeOp1L
201 # Note that numRegs is not always the same as numStructElems; in
202 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
# NOTE(review): the sentence above is cut short. The instantiations of this
# micro-op pass numRegs=1, 2, 3 and 4, so presumably it ends "1, 2, 3 or 4".
211 int eCount = dataSize / (8 << eSize);
212 int eSizeBytes = 1 << eSize; // element size in bytes
213 int numBytes = step * dataSize / 4;
214 int totNumBytes = numRegs * dataSize / 8;
216 int structElemNo, pos, a, b;
219 for (int r = 0; r < 2; ++r) {
220 for (int i = 0; i < eCount; ++i) {
221 if (numBytes < totNumBytes) {
222 structElemNo = r + (step * 2);
223 if (numStructElems == 1) {
224 pos = (eSizeBytes * i) +
225 (eCount * structElemNo * eSizeBytes);
227 pos = (numStructElems * eSizeBytes * i) +
228 (structElemNo * eSizeBytes);
231 b = (pos % 16) / eSizeBytes;
232 data = (XReg) readVecElem(input[a], (XReg) b,
234 writeVecElem(&output[r], data, i, eSize);
235 numBytes += eSizeBytes;
242 AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
246 if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
250 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
251 output[1], %(p)d, 0x2);
258 AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
265 iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
266 { 'code' : eCode, 'op_class' : 'No_OpClass' },
# The same iop feeds both the class declaration and the execute method.
268 header_output += MicroNeonMixDeclare64.subst(iop)
269 exec_output += MicroNeonMixExecute64.subst(iop)
# Interleave op: gathers its input with the 'S' snippet.
271 elif name == 'int_neon_uop':
274 // input data from arch. SIMD regs
275 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
276 VReg output[2]; // output data to scratch area
279 eCode += getInputCodeOp1S
281 # Note that numRegs is not always the same as numStructElems; in
282 # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
# NOTE(review): truncated sentence, same as in the de-interleave branch;
# this micro-op is also instantiated with numRegs=1, 2, 3 and 4.
286 int eCount = dataSize / (8 << eSize);
287 int eSizeBytes = 1 << eSize;
288 int totNumBytes = numRegs * dataSize / 8;
289 int numOutputElems = 128 / (8 << eSize);
290 int stepOffset = step * 32;
292 for (int i = 0; i < 2; ++i) {
297 int r = 0, k = 0, i, j;
300 for (int pos = stepOffset; pos < 32 + stepOffset;
302 if (pos < totNumBytes) {
303 if (numStructElems == 1) {
304 i = (pos / eSizeBytes) % eCount;
305 j = pos / (eCount * eSizeBytes);
307 i = pos / (numStructElems * eSizeBytes);
308 j = (pos % (numStructElems * eSizeBytes)) /
311 data = (XReg) readVecElem(input[j], (XReg) i, eSize);
312 writeVecElem(&output[r], data, k, eSize);
314 if (k == numOutputElems){
324 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
325 output[%(v)d], %(p)d, 0x2);
326 ''' % { 'v': v, 'p': p}
328 iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
329 { 'code' : eCode, 'op_class' : 'No_OpClass' },
331 header_output += MicroNeonMixDeclare64.subst(iop)
332 exec_output += MicroNeonMixExecute64.subst(iop)
334 elif name == 'unpack_neon_uop':
337 //input data from scratch area
338 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
339 VReg output[2]; //output data to arch. SIMD regs
342 eCode += getInputCodeOp1L
344 # Fill output regs with register data initially. Note that
345 # elements in output register outside indexed lanes are left
350 writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
352 ''' % { 'v': v, 'p': p}
354 int eCount = dataSize / (8 << eSize);
355 int eCount128 = 128 / (8 << eSize);
356 int eSizeBytes = 1 << eSize;
357 int totNumBytes = numStructElems * eSizeBytes;
358 int numInputElems = eCount128;
359 int stepOffset = step * 2 * eSizeBytes;
360 int stepLimit = 2 * eSizeBytes;
365 for (int pos = stepOffset; pos < stepLimit + stepOffset;
367 if (pos < totNumBytes) {
368 r = pos / eSizeBytes;
369 j = r / numInputElems;
370 i = r % numInputElems;
371 data = (XReg) readVecElem(input[j], (XReg) i, eSize);
374 for (int i = 0; i < eCount128; ++i) {
376 writeVecElem(&output[r % 2], data, i,
378 } else { // zero extend if necessary
379 writeVecElem(&output[r % 2], (XReg) 0, i,
384 writeVecElem(&output[r % 2], data, lane, eSize);
392 AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
393 output[%(v)d], %(p)d, 0x2);
394 ''' % { 'v' : v, 'p' : p }
396 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
397 { 'code' : eCode }, ['IsMicroop'])
398 header_output += MicroNeonMixLaneDeclare64.subst(iop)
399 exec_output += MicroNeonMixExecute64.subst(iop)
401 elif name == 'pack_neon_uop':
404 // input data from arch. SIMD regs
405 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
406 VReg output[2]; // output data to scratch area
409 eCode += getInputCodeOp1S
412 int eSizeBytes = 1 << eSize;
413 int numOutputElems = 128 / (8 << eSize);
414 int totNumBytes = numStructElems * eSizeBytes;
415 int stepOffset = step * 32;
421 for (int i = 0; i < 2; ++i) {
426 for (int pos = stepOffset; pos < stepLimit + stepOffset;
428 if (pos < totNumBytes) {
430 j = pos / eSizeBytes;
431 i = (pos / eSizeBytes) % numOutputElems;
432 data = (XReg) readVecElem(input[j], lane, eSize);
433 writeVecElem(&output[r % 2], data, i, eSize);
441 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
442 output[%(v)d], %(p)d, 0x2);
443 ''' % { 'v' : v, 'p' : p }
# Lane ops are registered with only 'code' and the IsMicroop flag; the
# header comes from the lane-specific declare template while the execute
# method reuses MicroNeonMixExecute64.
445 iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
446 { 'code' : eCode }, ['IsMicroop'])
447 header_output += MicroNeonMixLaneDeclare64.subst(iop)
448 exec_output += MicroNeonMixExecute64.subst(iop)
450 # Generate instructions
# One memory-access micro-op pair is generated under the base name
# 'mem_neon_uop' (mkMemAccMicroOp appends 'ld' / 'st' itself). The
# de-interleave and interleave marshalling ops are each instantiated four
# times, once per numRegs value 1..4, with the register count encoded in the
# class name. The unpack and pack ops are instantiated once each and rely on
# the default numRegs=4.
451 mkMemAccMicroOp('mem_neon_uop')
452 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
453 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
454 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
455 mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
456 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
457 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
458 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
459 mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
460 mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
461 mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
467 iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
# This stanza and the three after it follow one pattern: build an
# InstObjParams with an empty code string and no flags, then append the
# substituted declaration template to header_output and the substituted
# constructor template to decoder_output. The 'mult' load/store pair uses
# the VMemMult*64 templates, the 'single' pair the VMemSingle*64 templates.
# 'iop' is rebound each time, so only the appended output persists.
468 header_output += VMemMultDeclare64.subst(iop)
469 decoder_output += VMemMultConstructor64.subst(iop)
471 iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
472 header_output += VMemMultDeclare64.subst(iop)
473 decoder_output += VMemMultConstructor64.subst(iop)
475 iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
476 header_output += VMemSingleDeclare64.subst(iop)
477 decoder_output += VMemSingleConstructor64.subst(iop)
479 iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
480 header_output += VMemSingleDeclare64.subst(iop)
481 decoder_output += VMemSingleConstructor64.subst(iop)