src/gpu-compute/wavefront.cc

   1 /*
   2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its contributors
  18  * may be used to endorse or promote products derived from this software
  19  * without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Author: Lisa Hsu
  34  */
  35
  36 #include "gpu-compute/wavefront.hh"
  37
  38 #include "debug/GPUExec.hh"
  39 #include "debug/WavefrontStack.hh"
  40 #include "gpu-compute/compute_unit.hh"
  41 #include "gpu-compute/gpu_dyn_inst.hh"
  42 #include "gpu-compute/shader.hh"
  43 #include "gpu-compute/vector_register_file.hh"
  44
  45 Wavefront*
  46 WavefrontParams::create()
  47 {
  48     return new Wavefront(this);
  49 }
  50
  51 Wavefront::Wavefront(const Params *p)
  52   : SimObject(p), callArgMem(nullptr), _gpuISA()
  53 {
  54     lastTrace = 0;
  55     simdId = p->simdId;
  56     wfSlotId = p->wf_slot_id;
  57     status = S_STOPPED;
  58     reservedVectorRegs = 0;
  59     startVgprIndex = 0;
  60     outstandingReqs = 0;
  61     memReqsInPipe = 0;
  62     outstandingReqsWrGm = 0;
  63     outstandingReqsWrLm = 0;
  64     outstandingReqsRdGm = 0;
  65     outstandingReqsRdLm = 0;
  66     rdLmReqsInPipe = 0;
  67     rdGmReqsInPipe = 0;
  68     wrLmReqsInPipe = 0;
  69     wrGmReqsInPipe = 0;
  70
  71     barrierCnt = 0;
  72     oldBarrierCnt = 0;
  73     stalledAtBarrier = false;
  74
  75     memTraceBusy = 0;
  76     oldVgprTcnt = 0xffffffffffffffffll;
  77     oldDgprTcnt = 0xffffffffffffffffll;
  78     oldVgpr.resize(p->wfSize);
  79
  80     pendingFetch = false;
  81     dropFetch = false;
  82     condRegState = new ConditionRegisterState();
  83     maxSpVgprs = 0;
  84     maxDpVgprs = 0;
  85     lastAddr.resize(p->wfSize);
  86     workItemFlatId.resize(p->wfSize);
  87     oldDgpr.resize(p->wfSize);
  88     barCnt.resize(p->wfSize);
  89     for (int i = 0; i < 3; ++i) {
  90         workItemId[i].resize(p->wfSize);
  91     }
  92 }
  93
  94 void
  95 Wavefront::regStats()
  96 {
  97     SimObject::regStats();
  98
  99     srcRegOpDist
 100         .init(0, 4, 2)
 101         .name(name() + ".src_reg_operand_dist")
 102         .desc("number of executed instructions with N source register operands")
 103         ;
 104
 105     dstRegOpDist
 106         .init(0, 3, 2)
 107         .name(name() + ".dst_reg_operand_dist")
 108         .desc("number of executed instructions with N destination register "
 109               "operands")
 110         ;
 111
 112     // FIXME: the name of the WF needs to be unique
 113     numTimesBlockedDueWAXDependencies
 114         .name(name() + ".timesBlockedDueWAXDependencies")
 115         .desc("number of times the wf's instructions are blocked due to WAW "
 116               "or WAR dependencies")
 117         ;
 118
 119     // FIXME: the name of the WF needs to be unique
 120     numTimesBlockedDueRAWDependencies
 121         .name(name() + ".timesBlockedDueRAWDependencies")
 122         .desc("number of times the wf's instructions are blocked due to RAW "
 123               "dependencies")
 124         ;
 125
 126     // FIXME: the name of the WF needs to be unique
 127     numTimesBlockedDueVrfPortAvail
 128         .name(name() + ".timesBlockedDueVrfPortAvail")
 129         .desc("number of times instructions are blocked due to VRF port "
 130               "availability")
 131         ;
 132 }
 133
 134 void
 135 Wavefront::init()
 136 {
 137     reservedVectorRegs = 0;
 138     startVgprIndex = 0;
 139 }
 140
 141 void
 142 Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
 143 {
 144     condRegState->init(num_cregs);
 145     maxSpVgprs = num_sregs;
 146     maxDpVgprs = num_dregs;
 147 }
 148
 149 Wavefront::~Wavefront()
 150 {
 151     if (callArgMem)
 152         delete callArgMem;
 153     delete condRegState;
 154 }
 155
 156 void
 157 Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
 158 {
 159     wfDynId = _wf_dyn_id;
 160     basePtr = _base_ptr;
 161     status = S_RUNNING;
 162 }
 163
 164 bool
 165 Wavefront::isGmInstruction(GPUDynInstPtr ii)
 166 {
 167     if (ii->isGlobalMem() || ii->isFlat())
 168         return true;
 169
 170     return false;
 171 }
 172
 173 bool
 174 Wavefront::isLmInstruction(GPUDynInstPtr ii)
 175 {
 176     if (ii->isLocalMem()) {
 177         return true;
 178     }
 179
 180     return false;
 181 }
 182
 183 bool
 184 Wavefront::isOldestInstALU()
 185 {
 186     assert(!instructionBuffer.empty());
 187     GPUDynInstPtr ii = instructionBuffer.front();
 188
 189     if (status != S_STOPPED && (ii->isNop() ||
 190         ii->isReturn() || ii->isBranch() ||
 191         ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
 192         return true;
 193     }
 194
 195     return false;
 196 }
 197
 198 bool
 199 Wavefront::isOldestInstBarrier()
 200 {
 201     assert(!instructionBuffer.empty());
 202     GPUDynInstPtr ii = instructionBuffer.front();
 203
 204     if (status != S_STOPPED && ii->isBarrier()) {
 205         return true;
 206     }
 207
 208     return false;
 209 }
 210
 211 bool
 212 Wavefront::isOldestInstGMem()
 213 {
 214     assert(!instructionBuffer.empty());
 215     GPUDynInstPtr ii = instructionBuffer.front();
 216
 217     if (status != S_STOPPED && ii->isGlobalMem()) {
 218         return true;
 219     }
 220
 221     return false;
 222 }
 223
 224 bool
 225 Wavefront::isOldestInstLMem()
 226 {
 227     assert(!instructionBuffer.empty());
 228     GPUDynInstPtr ii = instructionBuffer.front();
 229
 230     if (status != S_STOPPED && ii->isLocalMem()) {
 231         return true;
 232     }
 233
 234     return false;
 235 }
 236
 237 bool
 238 Wavefront::isOldestInstPrivMem()
 239 {
 240     assert(!instructionBuffer.empty());
 241     GPUDynInstPtr ii = instructionBuffer.front();
 242
 243     if (status != S_STOPPED && ii->isPrivateSeg()) {
 244         return true;
 245     }
 246
 247     return false;
 248 }
 249
 250 bool
 251 Wavefront::isOldestInstFlatMem()
 252 {
 253     assert(!instructionBuffer.empty());
 254     GPUDynInstPtr ii = instructionBuffer.front();
 255
 256     if (status != S_STOPPED && ii->isFlat()) {
 257         return true;
 258     }
 259
 260     return false;
 261 }
 262
 263 // Return true if the Wavefront's instruction
 264 // buffer has branch instruction.
 265 bool
 266 Wavefront::instructionBufferHasBranch()
 267 {
 268     for (auto it : instructionBuffer) {
 269         GPUDynInstPtr ii = it;
 270
 271         if (ii->isReturn() || ii->isBranch()) {
 272             return true;
 273         }
 274     }
 275
 276     return false;
 277 }
 278
 279 // Remap HSAIL register to physical VGPR.
 280 // HSAIL register = virtual register assigned to an operand by HLC compiler
 281 uint32_t
 282 Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
 283 {
 284     assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
 285     // add the offset from where the VGPRs of the wavefront have been assigned
 286     uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
 287     // HSAIL double precision (DP) register: calculate the physical VGPR index
 288     // assuming that DP registers are placed after SP ones in the VRF. The DP
 289     // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
 290     // the DP VGPR index before mapping it to the physical VRF address space
 291     if (mode == 1 && size > 4) {
 292         physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
 293     }
 294
 295     assert((startVgprIndex <= physicalVgprIndex) &&
 296            (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
 297
 298     // calculate absolute physical VGPR index
 299     return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
 300 }
 301
/**
 * Check whether the oldest instruction in the instruction buffer can be
 * issued to an execution resource of the requested type.
 *
 * Side effects: when the wavefront was stalled at a barrier and
 * AllAtBarrier() reports that the barrier is complete, stalledAtBarrier
 * is cleared and oldBarrierCnt is updated. Panics if the oldest
 * instruction matches none of the known instruction categories.
 *
 * @param type kind of execution resource being considered (I_ALU,
 *             I_GLOBAL, I_SHARED, I_FLAT, I_PRIVATE)
 * @return 1 if the oldest instruction matches 'type' and every resource
 *         and dependency check passes, 0 otherwise
 */
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
                        computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        // barrier satisfied: remember the count and release the stall
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    // Global memory side: ready if ANY unit's bus / issue slot is pre-ready.
    // NOTE(review): glbMemBusRdy/glbMemIssueRdy are also computed for
    // I_PRIVATE, but no branch below handles type == I_PRIVATE, so such a
    // query always falls through to "return 0" -- confirm this is intended.
    // NOTE(review): wfWait is indexed here by the loop index j, whereas
    // updateResources()/exec() index it with GlbMemUnitId()/ShrMemUnitId()
    // for memory instructions -- confirm j is the intended index.
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    // Local memory side: same "any unit" test (see the wfWait note above)
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually.  In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        // are all the operands ready?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here Global memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding local memory write requests?
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding local memory read requests?
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        // Flat instruction: both the global and the local memory resources
        // must be available before it is considered ready.
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            // Is the local memory issue slot free?
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR dependencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // the requested type does not match the oldest instruction
        return 0;
    }

    // every path that reaches this point has set ready_inst
    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}
 540
/**
 * Pre-set the resources that the oldest buffered instruction will use and
 * account for any memory request it puts into the pipeline.
 *
 * The branch structure mirrors the resource update at the end of exec(),
 * but calls preset() where exec() calls set(), and additionally increments
 * the *ReqsInPipe counters.
 * NOTE(review): presumably preset() is a tentative reservation that exec()
 * later commits with set() -- confirm against the wait-class
 * implementation.
 */
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    // let the vector register file update its own resources first
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
                                            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        // barriers only occupy the SIMD issue slot
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
                                           ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        // flat load: the segment it executes as must already be resolved
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        // NOTE(review): counted as a global read even when the access
        // executes as SC_SHARED -- confirm this is intended
        rdGmReqsInPipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        // flat store: same routing as a flat load, bus held for 8 ticks
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        // NOTE(review): counted as a global write even when the access
        // executes as SC_SHARED -- confirm this is intended
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        // global load
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        // global store
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        // global atomic / fence: counted as both a read and a write
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        // local load
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        // local store
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        // local atomic / fence: counted as both a read and a write
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
 640
/**
 * Execute the oldest instruction in the instruction buffer.
 *
 * Steps, in order: execute the instruction and update the compute unit
 * and wavefront statistics; advance the PC (popping the reconvergence
 * stack when the reconvergence PC is reached, or discarding fetched
 * instructions when control flow changed); sample the divergence
 * statistics in SIMT mode; and finally set the pipeline, bus and issue
 * slot resources the instruction occupies.
 *
 * Does nothing when the wavefront is stopped or returning, or when the
 * instruction buffer is empty.
 */
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    // remember the PC so we can tell whether the instruction changed it
    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);

    // execute the instruction, then update the instruction stats in the CU

    ii->execute(ii);
    computeUnit->updateInstStats(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    // sample the number of cycles since this SIMD last executed
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point: drop this stack entry and
            // throw away whatever was fetched for it
            popFromReconvergenceStack();
            discardFetch();
        } else {
            // retire only the instruction that was just executed
            instructionBuffer.pop_front();
        }
    } else {
        // the instruction changed the PC itself, so the buffered
        // instructions are discarded
        discardFetch();
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        // lane-utilization statistics, based on the current execution mask
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
                                         ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        // barriers only occupy the SIMD issue slot
        computeUnit->wfWait[simdId].set(computeUnit->shader->
                                        ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        // flat load: routed by the segment the access executed as
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        // flat store: same routing as a flat load, bus held for 8 ticks
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        // global load
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        // global store
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        // global atomic / fence
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        // local load
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        // local store
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        // local atomic / fence
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
 775
 776 bool
 777 Wavefront::waitingAtBarrier(int lane)
 778 {
 779     return barCnt[lane] < maxBarCnt;
 780 }
 781
 782 void
 783 Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
 784                                     const VectorMask& mask)
 785 {
 786     assert(mask.count());
 787     reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
 788 }
 789
 790 void
 791 Wavefront::popFromReconvergenceStack()
 792 {
 793     assert(!reconvergenceStack.empty());
 794
 795     DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
 796             computeUnit->cu_id, simdId, wfSlotId, wfDynId,
 797             execMask().to_string<char, std::string::traits_type,
 798             std::string::allocator_type>().c_str(), pc());
 799
 800     reconvergenceStack.pop_back();
 801
 802     DPRINTF(WavefrontStack, "%3i %s\n", pc(),
 803             execMask().to_string<char, std::string::traits_type,
 804             std::string::allocator_type>().c_str());
 805
 806 }
 807
 808 void
 809 Wavefront::discardFetch()
 810 {
 811     instructionBuffer.clear();
 812     dropFetch |=pendingFetch;
 813 }
 814
 815 uint32_t
 816 Wavefront::pc() const
 817 {
 818     return reconvergenceStack.back()->pc;
 819 }
 820
 821 uint32_t
 822 Wavefront::rpc() const
 823 {
 824     return reconvergenceStack.back()->rpc;
 825 }
 826
 827 VectorMask
 828 Wavefront::execMask() const
 829 {
 830     return reconvergenceStack.back()->execMask;
 831 }
 832
 833 bool
 834 Wavefront::execMask(int lane) const
 835 {
 836     return reconvergenceStack.back()->execMask[lane];
 837 }
 838
 839
 840 void
 841 Wavefront::pc(uint32_t new_pc)
 842 {
 843     reconvergenceStack.back()->pc = new_pc;
 844 }
 845
 846 uint32_t
 847 Wavefront::getStaticContextSize() const
 848 {
 849     return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
 850            sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
 851            sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
 852            sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
 853            computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
 854 }
 855
 856 void
 857 Wavefront::getContext(const void *out)
 858 {
 859     uint8_t *iter = (uint8_t *)out;
 860     for (int i = 0; i < barCnt.size(); i++) {
 861         *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
 862     }
 863     *(int *)iter = wfId; iter += sizeof(wfId);
 864     *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
 865     *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
 866     *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
 867     *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
 868     *(uint32_t *)iter = wgId; iter += sizeof(wgId);
 869     *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
 870     *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
 871     *(Addr *)iter = privBase; iter += sizeof(privBase);
 872     *(Addr *)iter = spillBase; iter += sizeof(spillBase);
 873
 874     int stackSize = reconvergenceStack.size();
 875     ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
 876                                     std::numeric_limits<uint32_t>::max(),
 877                                     std::numeric_limits<uint64_t>::max()};
 878     for (int i = 0; i < workItemId[0].size(); i++) {
 879         if (i < stackSize) {
 880             *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
 881             iter += sizeof(ReconvergenceStackEntry);
 882             reconvergenceStack.pop_back();
 883         } else {
 884             *(ReconvergenceStackEntry *)iter = empty;
 885             iter += sizeof(ReconvergenceStackEntry);
 886         }
 887     }
 888
 889     int wf_size = computeUnit->wfSize();
 890     for (int i = 0; i < maxSpVgprs; i++) {
 891         uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
 892         for (int lane = 0; lane < wf_size; lane++) {
 893             uint32_t regVal = computeUnit->vrf[simdId]->
 894                             read<uint32_t>(vgprIdx,lane);
 895             *(uint32_t *)iter = regVal; iter += sizeof(regVal);
 896         }
 897     }
 898
 899     for (int i = 0; i < maxDpVgprs; i++) {
 900         uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
 901         for (int lane = 0; lane < wf_size; lane++) {
 902             uint64_t regVal = computeUnit->vrf[simdId]->
 903                             read<uint64_t>(vgprIdx,lane);
 904             *(uint64_t *)iter = regVal; iter += sizeof(regVal);
 905         }
 906     }
 907
 908     for (int i = 0; i < condRegState->numRegs(); i++) {
 909         for (int lane = 0; lane < wf_size; lane++) {
 910             uint64_t regVal = condRegState->read<uint64_t>(i, lane);
 911             *(uint64_t *)iter = regVal; iter += sizeof(regVal);
 912         }
 913     }
 914
 915     /* saving LDS content */
 916     if (ldsChunk)
 917         for (int i = 0; i < ldsChunk->size(); i++) {
 918             char val = ldsChunk->read<char>(i);
 919             *(char *) iter = val; iter += sizeof(val);
 920         }
 921 }
 922
 923 void
 924 Wavefront::setContext(const void *in)
 925 {
 926     uint8_t *iter = (uint8_t *)in;
 927     for (int i = 0; i < barCnt.size(); i++) {
 928         barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
 929     }
 930     wfId = *(int *)iter; iter += sizeof(wfId);
 931     maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
 932     oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
 933     barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
 934     computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
 935     wgId = *(uint32_t *)iter; iter += sizeof(wgId);
 936     barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
 937     initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
 938     privBase = *(Addr *)iter; iter += sizeof(privBase);
 939     spillBase = *(Addr *)iter; iter += sizeof(spillBase);
 940
 941     for (int i = 0; i < workItemId[0].size(); i++) {
 942         ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
 943         iter += sizeof(ReconvergenceStackEntry);
 944         if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
 945             pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
 946                                      newEntry.execMask);
 947         }
 948     }
 949     int wf_size = computeUnit->wfSize();
 950
 951     for (int i = 0; i < maxSpVgprs; i++) {
 952         uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
 953         for (int lane = 0; lane < wf_size; lane++) {
 954             uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
 955             computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
 956         }
 957     }
 958
 959     for (int i = 0; i < maxDpVgprs; i++) {
 960         uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
 961         for (int lane = 0; lane < wf_size; lane++) {
 962             uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
 963             computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
 964         }
 965     }
 966
 967     for (int i = 0; i < condRegState->numRegs(); i++) {
 968         for (int lane = 0; lane < wf_size; lane++) {
 969             uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
 970             condRegState->write<uint64_t>(i, lane, regVal);
 971         }
 972     }
 973     /** Restoring LDS contents */
 974     if (ldsChunk)
 975         for (int i = 0; i < ldsChunk->size(); i++) {
 976             char val = *(char *) iter; iter += sizeof(val);
 977             ldsChunk->write<char>(i, val);
 978         }
 979 }
 980
 981 void
 982 Wavefront::computeActualWgSz(NDRange *ndr)
 983 {
 984     actualWgSzTotal = 1;
 985     for (int d = 0; d < 3; ++d) {
 986         actualWgSz[d] = std::min(workGroupSz[d],
 987                                  gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
 988         actualWgSzTotal *= actualWgSz[d];
 989     }
 990 }