src/arch/arm/isa/insts/neon64_mem.isa

   1 // -*- mode: c++ -*-
   2
   3 // Copyright (c) 2012-2014 ARM Limited
   4 // All rights reserved
   5 //
   6 // The license below extends only to copyright in the software and shall
   7 // not be construed as granting a license to any other intellectual
   8 // property including but not limited to intellectual property relating
   9 // to a hardware implementation of the functionality of the software
  10 // licensed hereunder.  You may use the software subject to the license
  11 // terms below provided that you ensure that this notice is replicated
  12 // unmodified and in its entirety in all distributions of the software,
  13 // modified or unmodified, in source code or in binary form.
  14 //
  15 // Redistribution and use in source and binary forms, with or without
  16 // modification, are permitted provided that the following conditions are
  17 // met: redistributions of source code must retain the above copyright
  18 // notice, this list of conditions and the following disclaimer;
  19 // redistributions in binary form must reproduce the above copyright
  20 // notice, this list of conditions and the following disclaimer in the
  21 // documentation and/or other materials provided with the distribution;
  22 // neither the name of the copyright holders nor the names of its
  23 // contributors may be used to endorse or promote products derived from
  24 // this software without specific prior written permission.
  25 //
  26 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  27 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  28 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  29 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  30 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  31 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  32 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  33 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  34 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  35 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  36 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  37 //
  38 // Authors: Mbou Eyole
  39 //          Giacomo Gabrielli
  40
  41 let {{
  42
  43     header_output = ''
         # header_output (above), decoder_output and exec_output all start as
         # empty strings; the generator functions defined below in this block
         # extend header_output and exec_output with +=.
  44     decoder_output = ''
  45     exec_output = ''
  46
  47     def mkMemAccMicroOp(name):
             # Build the paired load/store memory-access micro-ops.
             #
             # name: prefix for the two InstObjParams created here; the load
             #       is registered as name + 'ld' (class MicroNeonLoad64) and
             #       the store as name + 'st' (class MicroNeonStore64), both
             #       with base class MicroNeonMemOp.
             #
             # Appends the substituted execute/initiateAcc/completeAcc
             # templates to the global exec_output and the declarations to
             # the global header_output.  decoder_output is declared global
             # but is not modified by this function.  Returns nothing.
  48         global header_output, decoder_output, exec_output
             # C++ snippet: return an SPAlignmentFault when baseIsSP is set,
             # bits(XURa, 3, 0) is non-zero and SPAlignmentCheckEnabled()
             # reports true.
  49         SPAlignmentCheckCodeNeon = '''
  50             if (baseIsSP && bits(XURa, 3, 0) &&
  51                 SPAlignmentCheckEnabled(xc->tcBase())) {
  52                 return std::make_shared<SPAlignmentFault>();
  53             }
  54         '''
             # Effective address: alignment check first, then base + imm.
  55         eaCode = SPAlignmentCheckCodeNeon + '''
  56             EA = XURa + imm;
  57         '''
             # 16-byte transfer buffer, viewable either as raw bytes or as
             # four 32-bit words.
  58         memDecl = '''
  59             const int MaxNumBytes = 16;
  60             union MemUnion {
  61                 uint8_t bytes[MaxNumBytes];
  62                 uint32_t floatRegBits[MaxNumBytes / 4];
  63             };
  64         '''
  65
  66         # Do endian conversion for all the elements
             # The generated code packs the four 32-bit words into a VReg
             # (lo/hi), passes every element of size eSize through gtobe()
             # when isBigEndian64() is true and through gtole() otherwise
             # (8-bit elements are left as they are), then unpacks the VReg
             # back into the four words of memUnion.
  67         convCode = '''
  68             VReg x = {0, 0};
  69
  70             x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) |
  71                 (XReg) memUnion.floatRegBits[0];
  72             x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) |
  73                 (XReg) memUnion.floatRegBits[2];
  74
  75             const unsigned eCount = 16 / (1 << eSize);
  76
  77             if (isBigEndian64(xc->tcBase())) {
  78                 for (unsigned i = 0; i < eCount; i++) {
  79                     switch (eSize) {
  80                       case 0x3:  // 64-bit
  81                         writeVecElem(&x, (XReg) gtobe(
  82                             (uint64_t) readVecElem(x, i, eSize)), i, eSize);
  83                         break;
  84                       case 0x2:  // 32-bit
  85                         writeVecElem(&x, (XReg) gtobe(
  86                             (uint32_t) readVecElem(x, i, eSize)), i, eSize);
  87                         break;
  88                       case 0x1:  // 16-bit
  89                         writeVecElem(&x, (XReg) gtobe(
  90                             (uint16_t) readVecElem(x, i, eSize)), i, eSize);
  91                         break;
  92                       default:  // 8-bit
  93                         break;  // Nothing to do here
  94                     }
  95                 }
  96             } else {
  97                 for (unsigned i = 0; i < eCount; i++) {
  98                     switch (eSize) {
  99                       case 0x3:  // 64-bit
 100                         writeVecElem(&x, (XReg) gtole(
 101                             (uint64_t) readVecElem(x, i, eSize)), i, eSize);
 102                         break;
 103                       case 0x2:  // 32-bit
 104                         writeVecElem(&x, (XReg) gtole(
 105                             (uint32_t) readVecElem(x, i, eSize)), i, eSize);
 106                         break;
 107                       case 0x1:  // 16-bit
 108                         writeVecElem(&x, (XReg) gtole(
 109                             (uint16_t) readVecElem(x, i, eSize)), i, eSize);
 110                         break;
 111                       default:  // 8-bit
 112                         break;  // Nothing to do here
 113                     }
 114                 }
 115             }
 116
 117             memUnion.floatRegBits[0] = (uint32_t) x.lo;
 118             memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32);
 119             memUnion.floatRegBits[2] = (uint32_t) x.hi;
 120             memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32);
 121         '''
 122
 123         # Offload everything into registers
             # One statement per 32-bit word: AA64FpDestP<reg>_uw receives
             # gtoh() of word <reg> of memUnion.
 124         regSetCode = ''
 125         for reg in range(4):
 126             regSetCode += '''
 127             AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]);
 128             ''' % { 'reg' : reg }
 129
 130         # Pull everything in from registers
             # Mirror image of regSetCode: word <reg> of memUnion receives
             # htog() of AA64FpDestP<reg>_uw.
 131         regGetCode = ''
 132         for reg in range(4):
 133             regGetCode += '''
 134             memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw);
 135             ''' % { 'reg' : reg }
 136
             # Loads convert the buffer and then write the registers; stores
             # read the registers and then convert the buffer.
 137         loadMemAccCode = convCode + regSetCode
 138         storeMemAccCode = regGetCode + convCode
 139
             # Both micro-ops share memDecl and the same effective-address
             # code, which is prefixed with simd64EnabledCheckCode (defined
             # outside this block).
 140         loadIop = InstObjParams(name + 'ld',
 141                 'MicroNeonLoad64',
 142                 'MicroNeonMemOp',
 143             {   'mem_decl' : memDecl,
 144                 'memacc_code' : loadMemAccCode,
 145                 'ea_code' : simd64EnabledCheckCode + eaCode,
 146             },
 147             [ 'IsMicroop', 'IsMemRef', 'IsLoad' ])
 148         storeIop = InstObjParams(name + 'st',
 149                 'MicroNeonStore64',
 150                 'MicroNeonMemOp',
 151             {   'mem_decl' : memDecl,
 152                 'memacc_code' : storeMemAccCode,
 153                 'ea_code' : simd64EnabledCheckCode + eaCode,
 154             },
 155             [ 'IsMicroop', 'IsMemRef', 'IsStore' ])
 156
             # Emission order: the three load method templates, then the
             # three store method templates; declarations go to the header.
 157         exec_output += NeonLoadExecute64.subst(loadIop) + \
 158             NeonLoadInitiateAcc64.subst(loadIop) + \
 159             NeonLoadCompleteAcc64.subst(loadIop) + \
 160             NeonStoreExecute64.subst(storeIop) + \
 161             NeonStoreInitiateAcc64.subst(storeIop) + \
 162             NeonStoreCompleteAcc64.subst(storeIop)
 163         header_output += MicroNeonMemDeclare64.subst(loadIop) + \
 164             MicroNeonMemDeclare64.subst(storeIop)
 165
 166     def mkMarshalMicroOp(name, Name, numRegs=4):
             # Generate one marshalling micro-op class.
             #
             # name:    selects which body is generated; recognised values
             #          are 'deint_neon_uop', 'int_neon_uop',
             #          'unpack_neon_uop' and 'pack_neon_uop'.  Any other
             #          value falls through the if/elif chain and emits
             #          nothing.
             # Name:    class name passed to InstObjParams.
             # numRegs: number of input vector registers for which operand
             #          reads are generated (default 4).
             #
             # Appends the declaration to the global header_output and the
             # execute method to the global exec_output; decoder_output is
             # declared global but not modified here.  Returns nothing.
             #
             # NOTE(review): identifiers such as numRegs, step, dataSize,
             # eSize, numStructElems, lane and replicate inside the C++
             # templates are not substituted from Python; presumably they
             # resolve to members of the generated class - confirm against
             # the MicroNeonMix*Declare64 templates.
 167         global header_output, decoder_output, exec_output
 168
             # Operand-read code for input[0..numRegs-1]: each register is
             # filled from four 32-bit pieces (element size code 0x2).  This
             # variant reads operands named AA64FpOp1P<p>V<v>_uw and is used
             # by the deint and unpack bodies below.
 169         getInputCodeOp1L = ''
 170         for v in range(numRegs):
 171             for p in range(4):
 172                 getInputCodeOp1L += '''
 173             writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw,
 174                          %(p)d, 0x2);
 175             ''' % { 'v' : v, 'p' : p }
 176
             # Same as above except that the operands carry an 'S' suffix
             # (AA64FpOp1P<p>V<v>S_uw); used by the int and pack bodies.
 177         getInputCodeOp1S = ''
 178         for v in range(numRegs):
 179             for p in range(4):
 180                 getInputCodeOp1S += '''
 181             writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw,
 182                          %(p)d, 0x2);
 183             ''' % { 'v' : v, 'p' : p }
 184
 185         if name == 'deint_neon_uop':
 186
 187             eCode = '''
 188                 // input data from scratch area
 189                 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
 190                 VReg output[2];  // output data to arch. SIMD regs
 191                 VReg temp;
 192                 temp.lo = 0;
 193                 temp.hi = 0;
 194             '''
                 # Snapshot the current value of the second destination
                 # register (V1L) into temp so that it can be written back
                 # unchanged when output[1] is not used (see the else branch
                 # generated further down).
 195             for p in range(4):
 196                 eCode += '''
 197                 writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2);
 198                 ''' % { 'p' : p }
 199             eCode += getInputCodeOp1L
 200
 201             # Note that numRegs is not always the same as numStructElems; in
 202             # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
 203             # 1, 2, 3 or 4
 204
                 # Main loop of the generated code: for each of the two
                 # output registers and each element slot, compute the byte
                 # position of the source element in input[], copy it, and
                 # stop copying once numBytes reaches totNumBytes.
 205             eCode += '''
 206                 output[0].lo = 0;
 207                 output[0].hi = 0;
 208                 output[1].lo = 0;
 209                 output[1].hi = 0;
 210
 211                 int eCount = dataSize / (8 << eSize);
 212                 int eSizeBytes = 1 << eSize;  // element size in bytes
 213                 int numBytes = step * dataSize / 4;
 214                 int totNumBytes = numRegs * dataSize / 8;
 215
 216                 int structElemNo, pos, a, b;
 217                 XReg data;
 218
 219                 for (int r = 0; r < 2; ++r) {
 220                     for (int i = 0; i < eCount; ++i) {
 221                         if (numBytes < totNumBytes) {
 222                             structElemNo = r + (step * 2);
 223                             if (numStructElems == 1) {
 224                                 pos = (eSizeBytes * i) +
 225                                     (eCount * structElemNo * eSizeBytes);
 226                             } else {
 227                                 pos = (numStructElems * eSizeBytes * i) +
 228                                     (structElemNo * eSizeBytes);
 229                             }
 230                             a = pos / 16;
 231                             b = (pos % 16) / eSizeBytes;
 232                             data = (XReg) readVecElem(input[a], (XReg) b,
 233                                                       eSize);
 234                             writeVecElem(&output[r], data, i, eSize);
 235                             numBytes += eSizeBytes;
 236                         }
 237                     }
 238                 }
 239             '''
                 # First destination register always receives output[0].
 240             for p in range(4):
 241                 eCode += '''
 242                 AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0],
 243                     %(p)d, 0x2);
 244                 ''' % { 'p' : p }
                 # Second destination register receives output[1] only when
                 # numRegs is even, or numRegs == 3 and step == 0; otherwise
                 # the saved temp value is written back.
 245             eCode += '''
 246                 if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) {
 247             '''
 248             for p in range(4):
 249                 eCode += '''
 250                     AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(
 251                         output[1], %(p)d, 0x2);
 252                 ''' % { 'p' : p }
 253             eCode += '''
 254                 } else {
 255             '''
 256             for p in range(4):
 257                 eCode += '''
 258                     AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp,
 259                         %(p)d, 0x2);
 260                 ''' % { 'p' : p }
 261             eCode += '''
 262                 }
 263             '''
 264
 265             iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
 266                                 { 'code' : eCode, 'op_class' : 'No_OpClass' },
 267                                 ['IsMicroop'])
 268             header_output += MicroNeonMixDeclare64.subst(iop)
 269             exec_output += MicroNeonMixExecute64.subst(iop)
 270
 271         elif name == 'int_neon_uop':
 272
 273             eCode = '''
 274                 // input data from arch. SIMD regs
 275                 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
 276                 VReg output[2];  // output data to scratch area
 277             '''
 278
 279             eCode += getInputCodeOp1S
 280
 281             # Note that numRegs is not always the same as numStructElems; in
 282             # particular, for LD1/ST1, numStructElems is 1 but numRegs can be
 283             # 1, 2, 3 or 4
 284
                 # The generated loop walks a 32-byte window of output
                 # positions starting at step * 32, maps each position back
                 # to (register j, element i) of input[], and fills
                 # output[0] and then output[1] in order.
 285             eCode += '''
 286                 int eCount = dataSize / (8 << eSize);
 287                 int eSizeBytes = 1 << eSize;
 288                 int totNumBytes = numRegs * dataSize / 8;
 289                 int numOutputElems = 128 / (8 << eSize);
 290                 int stepOffset = step * 32;
 291
 292                 for (int i = 0; i < 2; ++i) {
 293                     output[i].lo = 0;
 294                     output[i].hi = 0;
 295                 }
 296
 297                 int r = 0, k = 0, i, j;
 298                 XReg data;
 299
 300                 for (int pos = stepOffset; pos < 32 + stepOffset;
 301                         pos += eSizeBytes) {
 302                     if (pos < totNumBytes) {
 303                         if (numStructElems == 1) {
 304                             i = (pos / eSizeBytes) % eCount;
 305                             j = pos / (eCount * eSizeBytes);
 306                         } else {
 307                             i = pos / (numStructElems * eSizeBytes);
 308                             j = (pos % (numStructElems * eSizeBytes)) /
 309                                 eSizeBytes;
 310                         }
 311                         data = (XReg) readVecElem(input[j], (XReg) i, eSize);
 312                         writeVecElem(&output[r], data, k, eSize);
 313                         k++;
 314                         if (k == numOutputElems){
 315                             k = 0;
 316                             ++r;
 317                         }
 318                     }
 319                 }
 320                 '''
                 # Write both output registers to the destination operands
                 # AA64FpDestP<p>V<v>_uw, 32 bits at a time.
 321             for v in range(2):
 322                 for p in range(4):
 323                     eCode += '''
 324                 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
 325                     output[%(v)d], %(p)d, 0x2);
 326                 ''' % { 'v': v, 'p': p}
 327
 328             iop = InstObjParams(name, Name, 'MicroNeonMixOp64',
 329                                 { 'code' : eCode, 'op_class' : 'No_OpClass' },
 330                                 ['IsMicroop'])
 331             header_output += MicroNeonMixDeclare64.subst(iop)
 332             exec_output += MicroNeonMixExecute64.subst(iop)
 333
 334         elif name == 'unpack_neon_uop':
 335
 336             eCode = '''
 337                 //input data from scratch area
 338                 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
 339                 VReg output[2];  //output data to arch. SIMD regs
 340             '''
 341
 342             eCode += getInputCodeOp1L
 343
 344             # Fill output regs with register data initially.  Note that
 345             # elements in output register outside indexed lanes are left
 346             # untouched
 347             for v in range(2):
 348                 for p in range(4):
 349                     eCode += '''
 350                 writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw,
 351                              %(p)d, 0x2);
 352                 ''' % { 'v': v, 'p': p}
                 # The generated loop handles at most two structure elements
                 # per step.  With replicate set, the element is copied into
                 # the first eCount slots of the output register and the
                 # remaining slots are zeroed; otherwise only slot `lane` is
                 # overwritten.
 353             eCode += '''
 354                 int eCount = dataSize / (8 << eSize);
 355                 int eCount128 = 128 / (8 << eSize);
 356                 int eSizeBytes = 1 << eSize;
 357                 int totNumBytes = numStructElems * eSizeBytes;
 358                 int numInputElems = eCount128;
 359                 int stepOffset = step * 2 * eSizeBytes;
 360                 int stepLimit = 2 * eSizeBytes;
 361
 362                 int r = 0, i, j;
 363                 XReg data;
 364
 365                 for (int pos = stepOffset; pos < stepLimit + stepOffset;
 366                         pos += eSizeBytes) {
 367                     if (pos < totNumBytes) {
 368                         r = pos / eSizeBytes;
 369                         j = r / numInputElems;
 370                         i = r % numInputElems;
 371                         data = (XReg) readVecElem(input[j], (XReg) i, eSize);
 372
 373                         if (replicate) {
 374                             for (int i = 0; i < eCount128; ++i) {
 375                                 if (i < eCount) {
 376                                     writeVecElem(&output[r % 2], data, i,
 377                                                  eSize);
 378                                 } else {  // zero extend if necessary
 379                                     writeVecElem(&output[r % 2], (XReg) 0, i,
 380                                                  eSize);
 381                                 }
 382                             }
 383                         } else {
 384                             writeVecElem(&output[r % 2], data, lane, eSize);
 385                         }
 386                     }
 387                 }
 388             '''
                 # Write both output registers back to the destination
                 # operands AA64FpDestP<p>V<v>L_uw.
 389             for v in range(2):
 390                 for p in range(4):
 391                     eCode += '''
 392                 AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem(
 393                     output[%(v)d], %(p)d, 0x2);
 394                 ''' % { 'v' : v, 'p' : p }
 395
                 # Unlike the two branches above, the lane variants use the
                 # MicroNeonMixLaneOp64 base class and its declare template,
                 # and pass no 'op_class' entry.
 396             iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
 397                                 { 'code' : eCode }, ['IsMicroop'])
 398             header_output += MicroNeonMixLaneDeclare64.subst(iop)
 399             exec_output += MicroNeonMixExecute64.subst(iop)
 400
 401         elif name == 'pack_neon_uop':
 402
 403             eCode = '''
 404                 // input data from arch. SIMD regs
 405                 VReg input[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
 406                 VReg output[2];  // output data to scratch area
 407             '''
 408
 409             eCode += getInputCodeOp1S
 410
                 # The generated loop reads element `lane` from successive
                 # input registers and stores the values contiguously into
                 # the zero-initialised output registers, covering a 32-byte
                 # window per step.
 411             eCode += '''
 412                 int eSizeBytes = 1 << eSize;
 413                 int numOutputElems = 128 / (8 << eSize);
 414                 int totNumBytes = numStructElems * eSizeBytes;
 415                 int stepOffset = step * 32;
 416                 int stepLimit = 32;
 417
 418                 int r = 0, i, j;
 419                 XReg data;
 420
 421                 for (int i = 0; i < 2; ++i) {
 422                     output[i].lo = 0;
 423                     output[i].hi = 0;
 424                 }
 425
 426                 for (int pos = stepOffset; pos < stepLimit + stepOffset;
 427                         pos += eSizeBytes) {
 428                     if (pos < totNumBytes) {
 429                         r = pos / 16;
 430                         j = pos / eSizeBytes;
 431                         i = (pos / eSizeBytes) %  numOutputElems;
 432                         data = (XReg) readVecElem(input[j], lane, eSize);
 433                         writeVecElem(&output[r % 2], data, i, eSize);
 434                     }
 435                 }
 436             '''
 437
                 # Write both output registers to the destination operands
                 # AA64FpDestP<p>V<v>_uw.
 438             for v in range(2):
 439                 for p in range(4):
 440                     eCode += '''
 441                 AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem(
 442                     output[%(v)d], %(p)d, 0x2);
 443                 ''' % { 'v' : v, 'p' : p }
 444
 445             iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64',
 446                                 { 'code' : eCode }, ['IsMicroop'])
 447             header_output += MicroNeonMixLaneDeclare64.subst(iop)
 448             exec_output += MicroNeonMixExecute64.subst(iop)
 449
 450     # Generate instructions
         # One load/store memory-access pair first, then one deinterleave and
         # one interleave class per register count (1 to 4), and finally the
         # unpack/pack classes, which rely on the default numRegs=4.  The call
         # order determines the order of the text accumulated in header_output
         # and exec_output.
 451     mkMemAccMicroOp('mem_neon_uop')
 452     mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_1Reg', numRegs=1)
 453     mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_2Reg', numRegs=2)
 454     mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_3Reg', numRegs=3)
 455     mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64_4Reg', numRegs=4)
 456     mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_1Reg', numRegs=1)
 457     mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_2Reg', numRegs=2)
 458     mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_3Reg', numRegs=3)
 459     mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64_4Reg', numRegs=4)
 460     mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64')
 461     mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64')
 462
 463 }};
 464
 465 let {{
 466
 467     iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', [])
         # Each stanza in this block follows the same pattern: build an
         # InstObjParams with an empty code string and an empty flag list,
         # then append the declaration template to header_output and the
         # constructor template to decoder_output.  `iop` is rebound for
         # every instruction and nothing is added to exec_output here.
         # The two *mult64 instructions use the VMemMult* templates.
 468     header_output += VMemMultDeclare64.subst(iop)
 469     decoder_output += VMemMultConstructor64.subst(iop)
 470
 471     iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', [])
 472     header_output += VMemMultDeclare64.subst(iop)
 473     decoder_output += VMemMultConstructor64.subst(iop)
 474
         # The two *single64 instructions use the VMemSingle* templates.
 475     iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', [])
 476     header_output += VMemSingleDeclare64.subst(iop)
 477     decoder_output += VMemSingleConstructor64.subst(iop)
 478
 479     iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', [])
 480     header_output += VMemSingleDeclare64.subst(iop)
 481     decoder_output += VMemSingleConstructor64.subst(iop)
 482
 483 }};