// src/gpu-compute/gpu_dyn_inst.cc
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_dyn_inst.hh"

#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
      (Addr)0), numScalarReqs(0), isSaveRestore(false),
      _staticInst(static_inst), _seqNum(instSeqNum)
{
    statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
    a_data = new uint8_t[computeUnit()->wfSize() * 8];
    x_data = new uint8_t[computeUnit()->wfSize() * 8];
    // scalar loads can read up to 16 Dwords of data (see publicly
    // available GCN3 ISA manual)
    scalar_data = new uint8_t[16 * sizeof(uint32_t)];
    for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
        scalar_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
        a_data[i] = 0;
        x_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
        d_data[i] = 0;
    }
    time = 0;

    cu_id = _cu->cu_id;
    if (_wf) {
        simdId = _wf->simdId;
        wfDynId = _wf->wfDynId;
        kern_id = _wf->kernId;
        wg_id = _wf->wgId;
        wfSlotId = _wf->wfSlotId;
    } else {
        simdId = -1;
        wfDynId = -1;
        kern_id = -1;
        wg_id = -1;
        wfSlotId = -1;
    }
}

GPUDynInst::~GPUDynInst()
{
    delete[] d_data;
    delete[] a_data;
    delete[] x_data;
    delete[] scalar_data;
    delete _staticInst;
}

void
GPUDynInst::execute(GPUDynInstPtr gpuDynInst)
{
    _staticInst->execute(gpuDynInst);
}

int
GPUDynInst::numSrcRegOperands()
{
    return _staticInst->numSrcRegOperands();
}

int
GPUDynInst::numDstRegOperands()
{
    return _staticInst->numDstRegOperands();
}

int
GPUDynInst::numSrcVecOperands()
{
    return _staticInst->numSrcVecOperands();
}

int
GPUDynInst::numDstVecOperands()
{
    return _staticInst->numDstVecOperands();
}

int
GPUDynInst::numSrcVecDWORDs()
{
    return _staticInst->numSrcVecDWORDs();
}

int
GPUDynInst::numDstVecDWORDs()
{
    return _staticInst->numDstVecDWORDs();
}

int
GPUDynInst::numOpdDWORDs(int operandIdx)
{
    return _staticInst->numOpdDWORDs(operandIdx);
}

int
GPUDynInst::getNumOperands()
{
    return _staticInst->getNumOperands();
}

bool
GPUDynInst::isVectorRegister(int operandIdx)
{
    return _staticInst->isVectorRegister(operandIdx);
}

bool
GPUDynInst::isScalarRegister(int operandIdx)
{
    return _staticInst->isScalarRegister(operandIdx);
}

int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
    return _staticInst->getRegisterIndex(operandIdx, gpuDynInst);
}

int
GPUDynInst::getOperandSize(int operandIdx)
{
    return _staticInst->getOperandSize(operandIdx);
}

bool
GPUDynInst::isDstOperand(int operandIdx)
{
    return _staticInst->isDstOperand(operandIdx);
}

bool
GPUDynInst::isSrcOperand(int operandIdx)
{
    return _staticInst->isSrcOperand(operandIdx);
}

bool
GPUDynInst::hasSourceSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasSourceVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasDestinationSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::srcIsVgpr(int index) const
{
    assert(index >= 0 && index < _staticInst->getNumOperands());
    if (_staticInst->isVectorRegister(index) &&
        _staticInst->isSrcOperand(index)) {
        return true;
    }
    return false;
}

bool
GPUDynInst::hasDestinationVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
           _staticInst->opcode().find(extStr) != std::string::npos;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos;
}

const std::string&
GPUDynInst::disassemble() const
{
    return _staticInst->disassemble();
}

InstSeqNum
GPUDynInst::seqNum() const
{
    return _seqNum;
}

Enums::StorageClassType
GPUDynInst::executedAs()
{
    return _staticInst->executed_as;
}

bool
GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isVectorRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isVectorRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

bool
GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isScalarRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isScalarRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->initiateAcc(gpuDynInst);
}

void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x"
            " complete\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->completeAcc(gpuDynInst);
}

/**
 * accessor methods for the attributes of
 * the underlying GPU static instruction
 */
bool
GPUDynInst::isALU() const
{
    return _staticInst->isALU();
}

bool
GPUDynInst::isBranch() const
{
    return _staticInst->isBranch();
}

bool
GPUDynInst::isCondBranch() const
{
    return _staticInst->isCondBranch();
}

bool
GPUDynInst::isNop() const
{
    return _staticInst->isNop();
}

bool
GPUDynInst::isEndOfKernel() const
{
    return _staticInst->isEndOfKernel();
}

bool
GPUDynInst::isKernelLaunch() const
{
    return _staticInst->isKernelLaunch();
}

bool
GPUDynInst::isSDWAInst() const
{
    return _staticInst->isSDWAInst();
}

bool
GPUDynInst::isDPPInst() const
{
    return _staticInst->isDPPInst();
}

bool
GPUDynInst::isReturn() const
{
    return _staticInst->isReturn();
}

bool
GPUDynInst::isUnconditionalJump() const
{
    return _staticInst->isUnconditionalJump();
}

bool
GPUDynInst::isSpecialOp() const
{
    return _staticInst->isSpecialOp();
}

bool
GPUDynInst::isWaitcnt() const
{
    return _staticInst->isWaitcnt();
}

bool
GPUDynInst::isBarrier() const
{
    return _staticInst->isBarrier();
}

bool
GPUDynInst::isMemSync() const
{
    return _staticInst->isMemSync();
}

bool
GPUDynInst::isMemRef() const
{
    return _staticInst->isMemRef();
}

bool
GPUDynInst::isFlat() const
{
    return _staticInst->isFlat();
}

bool
GPUDynInst::isLoad() const
{
    return _staticInst->isLoad();
}

bool
GPUDynInst::isStore() const
{
    return _staticInst->isStore();
}

bool
GPUDynInst::isAtomic() const
{
    return _staticInst->isAtomic();
}

bool
GPUDynInst::isAtomicNoRet() const
{
    return _staticInst->isAtomicNoRet();
}

bool
GPUDynInst::isAtomicRet() const
{
    return _staticInst->isAtomicRet();
}

bool
GPUDynInst::isVector() const
{
    return !_staticInst->isScalar();
}

bool
GPUDynInst::isScalar() const
{
    return _staticInst->isScalar();
}

bool
GPUDynInst::readsSCC() const
{
    return _staticInst->readsSCC();
}

bool
GPUDynInst::writesSCC() const
{
    return _staticInst->writesSCC();
}

bool
GPUDynInst::readsVCC() const
{
    return _staticInst->readsVCC();
}

bool
GPUDynInst::writesVCC() const
{
    return _staticInst->writesVCC();
}

bool
GPUDynInst::readsMode() const
{
    return _staticInst->readsMode();
}

bool
GPUDynInst::writesMode() const
{
    return _staticInst->writesMode();
}

bool
GPUDynInst::readsEXEC() const
{
    return _staticInst->readsEXEC();
}

bool
GPUDynInst::writesEXEC() const
{
    return _staticInst->writesEXEC();
}

bool
GPUDynInst::ignoreExec() const
{
    return _staticInst->ignoreExec();
}

bool
GPUDynInst::writesExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isDstOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::readsExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isSrcOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::writesFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::readsFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::isAtomicAnd() const
{
    return _staticInst->isAtomicAnd();
}

bool
GPUDynInst::isAtomicOr() const
{
    return _staticInst->isAtomicOr();
}

bool
GPUDynInst::isAtomicXor() const
{
    return _staticInst->isAtomicXor();
}

bool
GPUDynInst::isAtomicCAS() const
{
    return _staticInst->isAtomicCAS();
}

bool
GPUDynInst::isAtomicExch() const
{
    return _staticInst->isAtomicExch();
}

bool
GPUDynInst::isAtomicAdd() const
{
    return _staticInst->isAtomicAdd();
}

bool
GPUDynInst::isAtomicSub() const
{
    return _staticInst->isAtomicSub();
}

bool
GPUDynInst::isAtomicInc() const
{
    return _staticInst->isAtomicInc();
}

bool
GPUDynInst::isAtomicDec() const
{
    return _staticInst->isAtomicDec();
}

bool
GPUDynInst::isAtomicMax() const
{
    return _staticInst->isAtomicMax();
}

bool
GPUDynInst::isAtomicMin() const
{
    return _staticInst->isAtomicMin();
}

bool
GPUDynInst::isArgLoad() const
{
    return _staticInst->isArgLoad();
}

bool
GPUDynInst::isGlobalMem() const
{
    return _staticInst->isGlobalMem();
}

bool
GPUDynInst::isLocalMem() const
{
    return _staticInst->isLocalMem();
}

bool
GPUDynInst::isArgSeg() const
{
    return _staticInst->isArgSeg();
}

bool
GPUDynInst::isGlobalSeg() const
{
    return _staticInst->isGlobalSeg();
}

bool
GPUDynInst::isGroupSeg() const
{
    return _staticInst->isGroupSeg();
}

bool
GPUDynInst::isKernArgSeg() const
{
    return _staticInst->isKernArgSeg();
}

bool
GPUDynInst::isPrivateSeg() const
{
    return _staticInst->isPrivateSeg();
}

bool
GPUDynInst::isReadOnlySeg() const
{
    return _staticInst->isReadOnlySeg();
}

bool
GPUDynInst::isSpillSeg() const
{
    return _staticInst->isSpillSeg();
}

bool
GPUDynInst::isGloballyCoherent() const
{
    return _staticInst->isGloballyCoherent();
}

bool
GPUDynInst::isSystemCoherent() const
{
    return _staticInst->isSystemCoherent();
}

bool
GPUDynInst::isF16() const
{
    return _staticInst->isF16();
}

bool
GPUDynInst::isF32() const
{
    return _staticInst->isF32();
}

bool
GPUDynInst::isF64() const
{
    return _staticInst->isF64();
}

bool
GPUDynInst::isFMA() const
{
    return _staticInst->isFMA();
}

bool
GPUDynInst::isMAC() const
{
    return _staticInst->isMAC();
}

bool
GPUDynInst::isMAD() const
{
    return _staticInst->isMAD();
}

void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());
    // find the segment of the first active address; after that,
    // we check that all other active addresses also
    // fall within the same APE
    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
        if (mask[lane]) {
            if (computeUnit()->shader->isLdsApe(addr[lane])) {
                // group segment
                staticInstruction()->executed_as = Enums::SC_GROUP;
                break;
            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
                // private segment
                staticInstruction()->executed_as = Enums::SC_PRIVATE;
                break;
            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
                // we won't support GPUVM
                fatal("flat access is in GPUVM APE\n");
            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                       bits(addr[lane], 63, 47)) {
                // bits 63:47 are neither all 0s nor all 1s, so the address
                // is non-canonical, i.e., it falls in the "hole"; this is
                // a memory violation
                fatal("flat access at addr %#x has a memory violation\n",
                      addr[lane]);
            } else {
                // global memory segment
                staticInstruction()->executed_as = Enums::SC_GLOBAL;
                break;
            }
        }
    }

    // we should have found the segment
    assert(executedAs() != Enums::SC_NONE);

    // flat accesses should not straddle multiple APEs so we
    // must check that all addresses fall within the same APE
    if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was LDS,
                // all the rest should be
                assert(computeUnit()->shader->isLdsApe(addr[lane]));
            }
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was private,
                // all the rest should be
                assert(computeUnit()->shader->isScratchApe(addr[lane]));
            }
        }
    } else {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was global,
                // all the rest should be. Because we don't have an
                // explicit range for the global segment, we just make
                // sure that the address falls in no other APE and that
                // it is not a memory violation
                assert(!computeUnit()->shader->isLdsApe(addr[lane]));
                assert(!computeUnit()->shader->isScratchApe(addr[lane]));
                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                        && bits(addr[lane], 63, 47)));
            }
        }
    }
}

void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
    doApertureCheck(mask);


    // Now that we know the aperture, do the following:
    // 1. Transform the flat address to its segmented equivalent.
    // 2. Set the execUnitId based on the aperture check.
    // 3. Decrement any extra resources that were reserved. Other
    //    resources are released as normal, below.
    if (executedAs() == Enums::SC_GLOBAL) {
        // no transformation for the global segment
        wavefront()->execUnitId = wavefront()->flatGmUnitId;
        if (isLoad()) {
            wavefront()->rdLmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrLmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->wrLmReqsInPipe--;
            wavefront()->rdLmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // flat address calculation goes here.
                // addr[lane] = segmented address
                panic("Flat group memory operation is unimplemented!\n");
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        wavefront()->decVMemInstsIssued();
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        /**
         * Flat instructions may resolve to the private segment (scratch),
         * which is backed by main memory and provides per-lane scratch
         * memory. Flat addressing uses apertures - registers that specify
         * the address range in the VA space where LDS/private memory is
         * mapped; their values are set by the kernel-mode driver. These
         * apertures use addresses that are not used by x86 CPUs. When the
         * address of a Flat operation falls into one of the apertures, the
         * Flat operation is redirected to either LDS or to the private
         * memory segment.
         *
         * For private memory, the SW runtime will allocate some space in
         * the VA space for each AQL queue; the base address is stored in
         * scalar registers per the AMD GPU ABI. The amd_queue_t field
         * scratch_backing_memory_location provides the base address in
         * memory for the queue's private segment. Various other fields
         * loaded into register state during kernel launch specify per-WF
         * and per-work-item offsets so that individual lanes may access
         * their private segment allocation.
         *
         * For more details about flat addressing see:
         * http://rocm-documentation.readthedocs.io/en/latest/
         * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
         *
         * https://github.com/ROCm-Developer-Tools/
         * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
         * #flat-addressing
         */
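        //
        // A minimal sketch of the address swizzle applied below, assuming
        // (per the SGPR reads that follow) that s[maxSgprs - 3] holds the
        // wave's scratch offset and s[maxSgprs - 4] holds the per-lane
        // scratch size:
        //
        //   addr[lane] = flatAddr + lane * size + offset
        //                + hiddenPrivateBase - scratchBase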

        uint32_t numSgprs = wavefront()->maxSgprs;
        uint32_t physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 3);
        uint32_t offset =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 4);
        uint32_t size =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                addr[lane] = addr[lane] + lane * size + offset +
                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
                    wavefront()->computeUnit->shader->getScratchBase();
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        wavefront()->decLGKMInstsIssued();
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                panic("flat addr %#llx maps to bad segment %d\n",
                      addr[lane], executedAs());
            }
        }
    }
}

TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
    return _staticInst->srcLiteral();
}

void
GPUDynInst::updateStats()
{
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->stats.dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
        cu->stats.dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory

        // update PageDivergence histogram
        int number_pages_touched = cu->pagesTouched.size();
        assert(number_pages_touched);
        cu->stats.pageDivergenceDist.sample(number_pages_touched);

        std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;

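        // pagesTouched is assumed to map a page address to the number of
        // lane accesses this instruction made to that page; fold those
        // counts into the CU-wide pageAccesses table below.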
        for (auto it : cu->pagesTouched) {
            // See if this page has been touched before; if not, the
            // insert also adds the page to the table.
            ret = cu->pageAccesses
                .insert(ComputeUnit::pageDataStruct::value_type(it.first,
                        std::make_pair(1, it.second)));

            // if it has, update the stats
            if (!ret.second) {
                ret.first->second.first++;
                ret.first->second.second += it.second;
            }
        }

        cu->pagesTouched.clear();

        // total number of dynamic memory instructions; atomics are
        // counted as a single memory instruction. This is counted per
        // wavefront, not per work-item.
        cu->stats.dynamicGMemInstrCnt++;
    }
}

void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Only take the first measurement in the case of coalescing
    if (roundTripTime.size() > hopId)
        return;

    roundTripTime.push_back(currentTime);
}

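// Record a timestamp per cache-line address for each hop of a memory
// request. As with profileRoundTripTime() above, only the first measurement
// for a given hop is kept when requests coalesce, and hop 0 creates the
// entry for a new line address.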
void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    if (lineAddressTime.count(addr)) {
        if (lineAddressTime[addr].size() > hopId) {
            return;
        }

        lineAddressTime[addr].push_back(currentTime);
    } else if (hopId == 0) {
        auto addressTimeVec = std::vector<Tick> { currentTime };
        lineAddressTime.insert(std::make_pair(addr, addressTimeVec));
    }
}