4d32412266ac7d83ac9958c53832404e649489e4
[gem5.git] / src / arch / arm / isa / insts / neon64_mem.isa
1 // -*- mode: c++ -*-
2
3 // Copyright (c) 2012-2014 ARM Limited
4 // All rights reserved
5 //
6 // The license below extends only to copyright in the software and shall
7 // not be construed as granting a license to any other intellectual
8 // property including but not limited to intellectual property relating
9 // to a hardware implementation of the functionality of the software
10 // licensed hereunder. You may use the software subject to the license
11 // terms below provided that you ensure that this notice is replicated
12 // unmodified and in its entirety in all distributions of the software,
13 // modified or unmodified, in source code or in binary form.
14 //
15 // Redistribution and use in source and binary forms, with or without
16 // modification, are permitted provided that the following conditions are
17 // met: redistributions of source code must retain the above copyright
18 // notice, this list of conditions and the following disclaimer;
19 // redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution;
22 // neither the name of the copyright holders nor the names of its
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Authors: Mbou Eyole
39 // Giacomo Gabrielli
40
41 let {{
42
# Start all three generated-code accumulators out empty; the generator
# functions in this block append to them.
header_output = decoder_output = exec_output = ''
46
def mkMemAccMicroOp(name):
    '''Generate the load and store flavours of a NEON memory micro-op.

    Two InstObjParams objects are built, with mnemonics name + 'ld' and
    name + 'st' and class names MicroNeonLoad64 / MicroNeonStore64, both
    using MicroNeonMemOp as base class.  The expansions of the
    Neon{Load,Store}{Execute,InitiateAcc,CompleteAcc}64 templates are
    appended to exec_output and the expansions of MicroNeonMemDeclare64 to
    header_output.
    '''
    global header_output, decoder_output, exec_output

    # Transfer buffer: 16 bytes that can also be addressed as four 32-bit
    # words.
    memDecl = '''
        const int MaxNumBytes = 16;
        union MemUnion {
            uint8_t bytes[MaxNumBytes];
            uint32_t floatRegBits[MaxNumBytes / 4];
        };
    '''

    # Effective address computation, preceded by the SIMD-enabled check and
    # the stack pointer alignment check.
    addrCode = simd64EnabledCheckCode + '''
        if (baseIsSP && bits(XURa, 3, 0) &&
            SPAlignmentCheckEnabled(xc->tcBase())) {
            return std::make_shared<SPAlignmentFault>();
        }

        EA = XURa + imm;
    '''

    # Do endian conversion for all the elements
    swapCode = '''
        VReg x = {0, 0};

        x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
            (XReg) memUnion.floatRegBits[0];
        x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
            (XReg) memUnion.floatRegBits[2];

        const unsigned eCount = 16 / (1 << eSize);

        if (isBigEndian64(xc->tcBase())) {
            for (unsigned i = 0; i < eCount; i++) {
                switch (eSize) {
                  case 0x3:  // 64-bit
                    writeVecElem(&x, (XReg) gtobe(
                        (uint64_t) readVecElem(x, i, eSize)), i, eSize);
                    break;
                  case 0x2:  // 32-bit
                    writeVecElem(&x, (XReg) gtobe(
                        (uint32_t) readVecElem(x, i, eSize)), i, eSize);
                    break;
                  case 0x1:  // 16-bit
                    writeVecElem(&x, (XReg) gtobe(
                        (uint16_t) readVecElem(x, i, eSize)), i, eSize);
                    break;
                  default:  // 8-bit
                    break;  // Nothing to do here
                }
            }
        } else {
            for (unsigned i = 0; i < eCount; i++) {
                switch (eSize) {
                  case 0x3:  // 64-bit
                    writeVecElem(&x, (XReg) gtole(
                        (uint64_t) readVecElem(x, i, eSize)), i, eSize);
                    break;
                  case 0x2:  // 32-bit
                    writeVecElem(&x, (XReg) gtole(
                        (uint32_t) readVecElem(x, i, eSize)), i, eSize);
                    break;
                  case 0x1:  // 16-bit
                    writeVecElem(&x, (XReg) gtole(
                        (uint16_t) readVecElem(x, i, eSize)), i, eSize);
                    break;
                  default:  // 8-bit
                    break;  // Nothing to do here
                }
            }
        }

        memUnion.floatRegBits[0] = (uint32_t) x.lo;
        memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
        memUnion.floatRegBits[2] = (uint32_t) x.hi;
        memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
    '''

    # Offload everything into registers (one statement per 32-bit part)
    wordsToRegs = ''.join('''
        AA64FpDestP%(part)d_uw = gtoh(memUnion.floatRegBits[%(part)d]);
    ''' % { 'part' : part } for part in range(4))

    # Pull everything in from registers (one statement per 32-bit part)
    regsToWords = ''.join('''
        memUnion.floatRegBits[%(part)d] = htog(AA64FpDestP%(part)d_uw);
    ''' % { 'part' : part } for part in range(4))

    def buildIop(suffix, className, accessCode, accessFlag):
        # Both directions differ only in mnemonic suffix, class name,
        # memory access code and the IsLoad/IsStore flag.
        snippets = { 'mem_decl' : memDecl,
                     'memacc_code' : accessCode,
                     'ea_code' : addrCode }
        return InstObjParams(name + suffix, className, 'MicroNeonMemOp',
                             snippets,
                             [ 'IsMicroop', 'IsMemRef', accessFlag ])

    # Loads convert after the access and then fill the registers; stores
    # read the registers first and convert before the access.
    loadIop = buildIop('ld', 'MicroNeonLoad64',
                       swapCode + wordsToRegs, 'IsLoad')
    storeIop = buildIop('st', 'MicroNeonStore64',
                        regsToWords + swapCode, 'IsStore')

    for template, iop in ((NeonLoadExecute64, loadIop),
                          (NeonLoadInitiateAcc64, loadIop),
                          (NeonLoadCompleteAcc64, loadIop),
                          (NeonStoreExecute64, storeIop),
                          (NeonStoreInitiateAcc64, storeIop),
                          (NeonStoreCompleteAcc64, storeIop)):
        exec_output += template.subst(iop)

    for iop in (loadIop, storeIop):
        header_output += MicroNeonMemDeclare64.subst(iop)
165
def mkMarshalMicroOp(name, Name, numRegs=4):
    '''Generate one NEON data-marshalling micro-op.

    name    -- micro-op mnemonic; selects the generated code and must be one
               of 'deint_neon_uop', 'int_neon_uop', 'unpack_neon_uop' or
               'pack_neon_uop'.
    Name    -- class name handed to InstObjParams.
    numRegs -- number of input vector registers the generated code copies
               into its local input[] array (default 4).

    The declaration of the new class is appended to header_output and its
    execute method to exec_output.

    Raises ValueError if name is not one of the four mnemonics above; such
    a call used to be silently ignored, generating nothing.
    '''
    global header_output, decoder_output, exec_output

    def perPart(template, vregs):
        # Expand template once per (vector register, 32-bit part) pair,
        # registers outermost.
        return ''.join(template % { 'v' : v, 'p' : p }
                       for v in vregs for p in range(4))

    def inputsForLoad():
        # Copy the source operands used on the load path into input[].
        return perPart('''
            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
                         %(p)d, 0x2);
        ''', range(numRegs))

    def inputsForStore():
        # Copy the source operands used on the store path into input[].
        return perPart('''
            writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
                         %(p)d, 0x2);
        ''', range(numRegs))

    def deintCode():
        # De-interleave: scratch area -> up to two arch. SIMD registers.
        code = '''
            // input data from scratch area
            VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
            VReg output[2];  // output data to arch. SIMD regs
            VReg temp;
            temp.lo = 0;
            temp.hi = 0;
        '''
        # Save the current contents of the second destination register so
        # that they can be written back when it must not be modified.
        code += perPart('''
            writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV%(v)dL_uw, %(p)d, 0x2);
        ''', [1])
        code += inputsForLoad()

        # Note that numRegs is not always the same as numStructElems; in
        # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
        # 1, 2, 3 or 4

        code += '''
            output[0].lo = 0;
            output[0].hi = 0;
            output[1].lo = 0;
            output[1].hi = 0;

            int eCount = dataSize / (8 << eSize);
            int eSizeBytes = 1 << eSize;  // element size in bytes
            int numBytes = step * dataSize / 4;
            int totNumBytes = numRegs * dataSize / 8;

            int structElemNo, pos, a, b;
            XReg data;

            for (int r = 0; r < 2; ++r) {
                for (int i = 0; i < eCount; ++i) {
                    if (numBytes < totNumBytes) {
                        structElemNo = r + (step * 2);
                        if (numStructElems == 1) {
                            pos = (eSizeBytes * i) +
                                (eCount * structElemNo * eSizeBytes);
                        } else {
                            pos = (numStructElems * eSizeBytes * i) +
                                (structElemNo * eSizeBytes);
                        }
                        a = pos / 16;
                        b = (pos % 16) / eSizeBytes;
                        data = (XReg) readVecElem(input[a], (XReg) b,
                                                  eSize);
                        writeVecElem(&output[r], data, i, eSize);
                        numBytes += eSizeBytes;
                    }
                }
            }
        '''
        code += perPart('''
            AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(output[0],
                %(p)d, 0x2);
        ''', [0])
        # The C++ text below contains a literal '%' and therefore must not
        # go through Python string formatting.
        code += '''
            if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
        '''
        code += perPart('''
                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
                    output[1], %(p)d, 0x2);
        ''', [1])
        code += '''
            } else {
        '''
        code += perPart('''
                AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(temp,
                    %(p)d, 0x2);
        ''', [1])
        code += '''
            }
        '''
        return code

    def intCode():
        # Interleave: arch. SIMD registers -> scratch area.
        code = '''
            // input data from arch. SIMD regs
            VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
            VReg output[2];  // output data to scratch area
        '''
        code += inputsForStore()

        # Note that numRegs is not always the same as numStructElems; in
        # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
        # 1, 2, 3 or 4

        code += '''
            int eCount = dataSize / (8 << eSize);
            int eSizeBytes = 1 << eSize;
            int totNumBytes = numRegs * dataSize / 8;
            int numOutputElems = 128 / (8 << eSize);
            int stepOffset = step * 32;

            for (int i = 0; i < 2; ++i) {
                output[i].lo = 0;
                output[i].hi = 0;
            }

            int r = 0, k = 0, i, j;
            XReg data;

            for (int pos = stepOffset; pos < 32 + stepOffset;
                    pos += eSizeBytes) {
                if (pos < totNumBytes) {
                    if (numStructElems == 1) {
                        i = (pos / eSizeBytes) % eCount;
                        j = pos / (eCount * eSizeBytes);
                    } else {
                        i = pos / (numStructElems * eSizeBytes);
                        j = (pos % (numStructElems * eSizeBytes)) /
                            eSizeBytes;
                    }
                    data = (XReg) readVecElem(input[j], (XReg) i, eSize);
                    writeVecElem(&output[r], data, k, eSize);
                    k++;
                    if (k == numOutputElems){
                        k = 0;
                        ++r;
                    }
                }
            }
        '''
        code += perPart('''
            AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
                output[%(v)d], %(p)d, 0x2);
        ''', range(2))
        return code

    def unpackCode():
        # Unpack: scratch area -> single lane (or all lanes when
        # replicating) of up to two arch. SIMD registers.
        code = '''
            //input data from scratch area
            VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
            VReg output[2];  //output data to arch. SIMD regs
        '''
        code += inputsForLoad()

        # Fill output regs with register data initially.  Note that
        # elements in output register outside indexed lanes are left
        # untouched
        code += perPart('''
            writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
                         %(p)d, 0x2);
        ''', range(2))
        code += '''
            int eCount = dataSize / (8 << eSize);
            int eCount128 = 128 / (8 << eSize);
            int eSizeBytes = 1 << eSize;
            int totNumBytes = numStructElems * eSizeBytes;
            int numInputElems = eCount128;
            int stepOffset = step * 2 * eSizeBytes;
            int stepLimit = 2 * eSizeBytes;

            int r = 0, i, j;
            XReg data;

            for (int pos = stepOffset; pos < stepLimit + stepOffset;
                    pos += eSizeBytes) {
                if (pos < totNumBytes) {
                    r = pos / eSizeBytes;
                    j = r / numInputElems;
                    i = r % numInputElems;
                    data = (XReg) readVecElem(input[j], (XReg) i, eSize);

                    if (replicate) {
                        for (int i = 0; i < eCount128; ++i) {
                            if (i < eCount) {
                                writeVecElem(&output[r % 2], data, i,
                                             eSize);
                            } else {  // zero extend if necessary
                                writeVecElem(&output[r % 2], (XReg) 0, i,
                                             eSize);
                            }
                        }
                    } else {
                        writeVecElem(&output[r % 2], data, lane, eSize);
                    }
                }
            }
        '''
        code += perPart('''
            AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
                output[%(v)d], %(p)d, 0x2);
        ''', range(2))
        return code

    def packCode():
        # Pack: one lane of each arch. SIMD register -> scratch area.
        code = '''
            // input data from arch. SIMD regs
            VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
            VReg output[2];  // output data to scratch area
        '''
        code += inputsForStore()
        code += '''
            int eSizeBytes = 1 << eSize;
            int numOutputElems = 128 / (8 << eSize);
            int totNumBytes = numStructElems * eSizeBytes;
            int stepOffset = step * 32;
            int stepLimit = 32;

            int r = 0, i, j;
            XReg data;

            for (int i = 0; i < 2; ++i) {
                output[i].lo = 0;
                output[i].hi = 0;
            }

            for (int pos = stepOffset; pos < stepLimit + stepOffset;
                    pos += eSizeBytes) {
                if (pos < totNumBytes) {
                    r = pos / 16;
                    j = pos / eSizeBytes;
                    i = (pos / eSizeBytes) % numOutputElems;
                    data = (XReg) readVecElem(input[j], lane, eSize);
                    writeVecElem(&output[r % 2], data, i, eSize);
                }
            }
        '''
        code += perPart('''
            AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
                output[%(v)d], %(p)d, 0x2);
        ''', range(2))
        return code

    # mnemonic -> (code builder, True when the micro-op is a lane op)
    builders = {
        'deint_neon_uop' : (deintCode, False),
        'int_neon_uop' : (intCode, False),
        'unpack_neon_uop' : (unpackCode, True),
        'pack_neon_uop' : (packCode, True),
    }
    if name not in builders:
        # Fail at generation time rather than emitting nothing and leaving
        # the mistake to surface later as a missing class.
        raise ValueError("mkMarshalMicroOp: unknown micro-op '%s' "
                         "(expected one of: %s)" %
                         (name, ', '.join(sorted(builders))))

    buildCode, isLaneOp = builders[name]
    eCode = buildCode()

    if isLaneOp:
        iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
                            { 'code' : eCode }, ['IsMicroop'])
        header_output += MicroNeonMixLaneDeclare64.subst(iop)
    else:
        iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
                            { 'code' : eCode, 'op_class' : 'No_OpClass' },
                            ['IsMicroop'])
        header_output += MicroNeonMixDeclare64.subst(iop)
    exec_output += MicroNeonMixExecute64.subst(iop)
449
# Generate instructions: the memory access micro-op first, then one
# (de)interleaving micro-op per register count, then the lane micro-ops.
mkMemAccMicroOp('mem_neon_uop')
for _uopName, _uopClass, _uopRegs in (
        ('deint_neon_uop', 'MicroDeintNeon64_1Reg', 1),
        ('deint_neon_uop', 'MicroDeintNeon64_2Reg', 2),
        ('deint_neon_uop', 'MicroDeintNeon64_3Reg', 3),
        ('deint_neon_uop', 'MicroDeintNeon64_4Reg', 4),
        ('int_neon_uop', 'MicroIntNeon64_1Reg', 1),
        ('int_neon_uop', 'MicroIntNeon64_2Reg', 2),
        ('int_neon_uop', 'MicroIntNeon64_3Reg', 3),
        ('int_neon_uop', 'MicroIntNeon64_4Reg', 4)):
    mkMarshalMicroOp(_uopName, _uopClass, numRegs=_uopRegs)
for _uopName, _uopClass in (
        ('unpack_neon_uop', 'MicroUnpackNeon64'),
        ('pack_neon_uop', 'MicroPackNeon64')):
    mkMarshalMicroOp(_uopName, _uopClass)
462
463 }};
464
465 let {{
466
# Declare the four NEON load/store macro-ops.  Each entry gives the
# mnemonic, class name and base class, followed by the templates whose
# expansions go to header_output and decoder_output respectively.
for _mnem, _cls, _base, _declTmpl, _ctorTmpl in (
        ('vldmult64', 'VldMult64', 'VldMultOp64',
         VMemMultDeclare64, VMemMultConstructor64),
        ('vstmult64', 'VstMult64', 'VstMultOp64',
         VMemMultDeclare64, VMemMultConstructor64),
        ('vldsingle64', 'VldSingle64', 'VldSingleOp64',
         VMemSingleDeclare64, VMemSingleConstructor64),
        ('vstsingle64', 'VstSingle64', 'VstSingleOp64',
         VMemSingleDeclare64, VMemSingleConstructor64)):
    iop = InstObjParams(_mnem, _cls, _base, '', [])
    header_output += _declTmpl.subst(iop)
    decoder_output += _ctorTmpl.subst(iop)
482
483 }};