2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
36 #include "gpu-compute/wavefront.hh"
38 #include "debug/GPUExec.hh"
39 #include "debug/WavefrontStack.hh"
40 #include "gpu-compute/compute_unit.hh"
41 #include "gpu-compute/gpu_dyn_inst.hh"
42 #include "gpu-compute/shader.hh"
43 #include "gpu-compute/vector_register_file.hh"
46 WavefrontParams::create()
48 return new Wavefront(this);
51 Wavefront::Wavefront(const Params
*p
)
52 : SimObject(p
), callArgMem(nullptr), _gpuISA()
56 wfSlotId
= p
->wf_slot_id
;
58 reservedVectorRegs
= 0;
62 outstandingReqsWrGm
= 0;
63 outstandingReqsWrLm
= 0;
64 outstandingReqsRdGm
= 0;
65 outstandingReqsRdLm
= 0;
73 stalledAtBarrier
= false;
76 oldVgprTcnt
= 0xffffffffffffffffll
;
77 oldDgprTcnt
= 0xffffffffffffffffll
;
78 oldVgpr
.resize(p
->wfSize
);
82 condRegState
= new ConditionRegisterState();
85 lastAddr
.resize(p
->wfSize
);
86 workItemFlatId
.resize(p
->wfSize
);
87 oldDgpr
.resize(p
->wfSize
);
88 barCnt
.resize(p
->wfSize
);
89 for (int i
= 0; i
< 3; ++i
) {
90 workItemId
[i
].resize(p
->wfSize
);
97 SimObject::regStats();
101 .name(name() + ".src_reg_operand_dist")
102 .desc("number of executed instructions with N source register operands")
107 .name(name() + ".dst_reg_operand_dist")
108 .desc("number of executed instructions with N destination register "
112 // FIXME: the name of the WF needs to be unique
113 numTimesBlockedDueWAXDependencies
114 .name(name() + ".timesBlockedDueWAXDependencies")
115 .desc("number of times the wf's instructions are blocked due to WAW "
116 "or WAR dependencies")
119 // FIXME: the name of the WF needs to be unique
120 numTimesBlockedDueRAWDependencies
121 .name(name() + ".timesBlockedDueRAWDependencies")
122 .desc("number of times the wf's instructions are blocked due to RAW "
126 // FIXME: the name of the WF needs to be unique
127 numTimesBlockedDueVrfPortAvail
128 .name(name() + ".timesBlockedDueVrfPortAvail")
129 .desc("number of times instructions are blocked due to VRF port "
137 reservedVectorRegs
= 0;
142 Wavefront::resizeRegFiles(int num_cregs
, int num_sregs
, int num_dregs
)
144 condRegState
->init(num_cregs
);
145 maxSpVgprs
= num_sregs
;
146 maxDpVgprs
= num_dregs
;
149 Wavefront::~Wavefront()
157 Wavefront::start(uint64_t _wf_dyn_id
,uint64_t _base_ptr
)
159 wfDynId
= _wf_dyn_id
;
165 Wavefront::isGmInstruction(GPUDynInstPtr ii
)
167 if (ii
->isGlobalMem() || ii
->isFlat())
174 Wavefront::isLmInstruction(GPUDynInstPtr ii
)
176 if (ii
->isLocalMem()) {
184 Wavefront::isOldestInstALU()
186 assert(!instructionBuffer
.empty());
187 GPUDynInstPtr ii
= instructionBuffer
.front();
189 if (status
!= S_STOPPED
&& (ii
->isNop() ||
190 ii
->isReturn() || ii
->isBranch() ||
191 ii
->isALU() || (ii
->isKernArgSeg() && ii
->isLoad()))) {
199 Wavefront::isOldestInstBarrier()
201 assert(!instructionBuffer
.empty());
202 GPUDynInstPtr ii
= instructionBuffer
.front();
204 if (status
!= S_STOPPED
&& ii
->isBarrier()) {
212 Wavefront::isOldestInstGMem()
214 assert(!instructionBuffer
.empty());
215 GPUDynInstPtr ii
= instructionBuffer
.front();
217 if (status
!= S_STOPPED
&& ii
->isGlobalMem()) {
225 Wavefront::isOldestInstLMem()
227 assert(!instructionBuffer
.empty());
228 GPUDynInstPtr ii
= instructionBuffer
.front();
230 if (status
!= S_STOPPED
&& ii
->isLocalMem()) {
238 Wavefront::isOldestInstPrivMem()
240 assert(!instructionBuffer
.empty());
241 GPUDynInstPtr ii
= instructionBuffer
.front();
243 if (status
!= S_STOPPED
&& ii
->isPrivateSeg()) {
251 Wavefront::isOldestInstFlatMem()
253 assert(!instructionBuffer
.empty());
254 GPUDynInstPtr ii
= instructionBuffer
.front();
256 if (status
!= S_STOPPED
&& ii
->isFlat()) {
263 // Return true if the Wavefront's instruction
264 // buffer has branch instruction.
266 Wavefront::instructionBufferHasBranch()
268 for (auto it
: instructionBuffer
) {
269 GPUDynInstPtr ii
= it
;
271 if (ii
->isReturn() || ii
->isBranch()) {
279 // Remap HSAIL register to physical VGPR.
280 // HSAIL register = virtual register assigned to an operand by HLC compiler
282 Wavefront::remap(uint32_t vgprIndex
, uint32_t size
, uint8_t mode
)
284 assert((vgprIndex
< reservedVectorRegs
) && (reservedVectorRegs
> 0));
285 // add the offset from where the VGPRs of the wavefront have been assigned
286 uint32_t physicalVgprIndex
= startVgprIndex
+ vgprIndex
;
287 // HSAIL double precision (DP) register: calculate the physical VGPR index
288 // assuming that DP registers are placed after SP ones in the VRF. The DP
289 // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
290 // the DP VGPR index before mapping it to the physical VRF address space
291 if (mode
== 1 && size
> 4) {
292 physicalVgprIndex
= startVgprIndex
+ maxSpVgprs
+ (2 * vgprIndex
);
295 assert((startVgprIndex
<= physicalVgprIndex
) &&
296 (startVgprIndex
+ reservedVectorRegs
- 1) >= physicalVgprIndex
);
298 // calculate absolute physical VGPR index
299 return physicalVgprIndex
% computeUnit
->vrf
[simdId
]->numRegs();
302 // Return true if this wavefront is ready
303 // to execute an instruction of the specified type.
305 Wavefront::ready(itype_e type
)
307 // Check to make sure wave is running
308 if (status
== S_STOPPED
|| status
== S_RETURNING
||
309 instructionBuffer
.empty()) {
313 // Is the wave waiting at a barrier
314 if (stalledAtBarrier
) {
315 if (!computeUnit
->AllAtBarrier(barrierId
,barrierCnt
,
316 computeUnit
->getRefCounter(dispatchId
, wgId
))) {
317 // Are all threads at barrier?
320 oldBarrierCnt
= barrierCnt
;
321 stalledAtBarrier
= false;
325 GPUDynInstPtr ii
= instructionBuffer
.front();
327 bool ready_inst M5_VAR_USED
= false;
328 bool glbMemBusRdy
= false;
329 bool glbMemIssueRdy
= false;
330 if (type
== I_GLOBAL
|| type
== I_FLAT
|| type
== I_PRIVATE
) {
331 for (int j
=0; j
< computeUnit
->numGlbMemUnits
; ++j
) {
332 if (computeUnit
->vrfToGlobalMemPipeBus
[j
].prerdy())
334 if (computeUnit
->wfWait
[j
].prerdy())
335 glbMemIssueRdy
= true;
338 bool locMemBusRdy
= false;
339 bool locMemIssueRdy
= false;
340 if (type
== I_SHARED
|| type
== I_FLAT
) {
341 for (int j
=0; j
< computeUnit
->numLocMemUnits
; ++j
) {
342 if (computeUnit
->vrfToLocalMemPipeBus
[j
].prerdy())
344 if (computeUnit
->wfWait
[j
].prerdy())
345 locMemIssueRdy
= true;
349 // The following code is very error prone and the entire process for
350 // checking readiness will be fixed eventually. In the meantime, let's
351 // make sure that we do not silently let an instruction type slip
352 // through this logic and always return not ready.
353 if (!(ii
->isBarrier() || ii
->isNop() || ii
->isReturn() || ii
->isBranch() ||
354 ii
->isALU() || ii
->isLoad() || ii
->isStore() || ii
->isAtomic() ||
355 ii
->isMemFence() || ii
->isFlat())) {
356 panic("next instruction: %s is of unknown type\n", ii
->disassemble());
359 DPRINTF(GPUExec
, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
360 computeUnit
->cu_id
, simdId
, wfSlotId
, ii
->disassemble());
362 if (type
== I_ALU
&& ii
->isBarrier()) {
363 // Here for ALU instruction (barrier)
364 if (!computeUnit
->wfWait
[simdId
].prerdy()) {
365 // Is wave slot free?
369 // Are there in pipe or outstanding memory requests?
370 if ((outstandingReqs
+ memReqsInPipe
) > 0) {
375 } else if (type
== I_ALU
&& ii
->isNop()) {
376 // Here for ALU instruction (nop)
377 if (!computeUnit
->wfWait
[simdId
].prerdy()) {
378 // Is wave slot free?
383 } else if (type
== I_ALU
&& ii
->isReturn()) {
384 // Here for ALU instruction (return)
385 if (!computeUnit
->wfWait
[simdId
].prerdy()) {
386 // Is wave slot free?
390 // Are there in pipe or outstanding memory requests?
391 if ((outstandingReqs
+ memReqsInPipe
) > 0) {
396 } else if (type
== I_ALU
&& (ii
->isBranch() ||
398 (ii
->isKernArgSeg() && ii
->isLoad()) ||
400 // Here for ALU instruction (all others)
401 if (!computeUnit
->wfWait
[simdId
].prerdy()) {
405 if (!computeUnit
->vrf
[simdId
]->vrfOperandAccessReady(this, ii
,
406 VrfAccessType::RD_WR
)) {
410 if (!computeUnit
->vrf
[simdId
]->operandsReady(this, ii
)) {
414 } else if (type
== I_GLOBAL
&& ii
->isGlobalMem()) {
415 // Here Global memory instruction
416 if (ii
->isLoad() || ii
->isAtomic() || ii
->isMemFence()) {
417 // Are there in pipe or outstanding global memory write requests?
418 if ((outstandingReqsWrGm
+ wrGmReqsInPipe
) > 0) {
423 if (ii
->isStore() || ii
->isAtomic() || ii
->isMemFence()) {
424 // Are there in pipe or outstanding global memory read requests?
425 if ((outstandingReqsRdGm
+ rdGmReqsInPipe
) > 0)
429 if (!glbMemIssueRdy
) {
430 // Is WV issue slot free?
435 // Is there an available VRF->Global memory read bus?
439 if (!computeUnit
->globalMemoryPipe
.
440 isGMReqFIFOWrRdy(rdGmReqsInPipe
+ wrGmReqsInPipe
)) {
441 // Can we insert a new request to the Global Mem Request FIFO?
444 // can we schedule source & destination operands on the VRF?
445 if (!computeUnit
->vrf
[simdId
]->vrfOperandAccessReady(this, ii
,
446 VrfAccessType::RD_WR
)) {
449 if (!computeUnit
->vrf
[simdId
]->operandsReady(this, ii
)) {
453 } else if (type
== I_SHARED
&& ii
->isLocalMem()) {
454 // Here for Shared memory instruction
455 if (ii
->isLoad() || ii
->isAtomic() || ii
->isMemFence()) {
456 if ((outstandingReqsWrLm
+ wrLmReqsInPipe
) > 0) {
461 if (ii
->isStore() || ii
->isAtomic() || ii
->isMemFence()) {
462 if ((outstandingReqsRdLm
+ rdLmReqsInPipe
) > 0) {
468 // Is there an available VRF->LDS read bus?
471 if (!locMemIssueRdy
) {
472 // Is wave slot free?
476 if (!computeUnit
->localMemoryPipe
.
477 isLMReqFIFOWrRdy(rdLmReqsInPipe
+ wrLmReqsInPipe
)) {
478 // Can we insert a new request to the LDS Request FIFO?
481 // can we schedule source & destination operands on the VRF?
482 if (!computeUnit
->vrf
[simdId
]->vrfOperandAccessReady(this, ii
,
483 VrfAccessType::RD_WR
)) {
486 if (!computeUnit
->vrf
[simdId
]->operandsReady(this, ii
)) {
490 } else if (type
== I_FLAT
&& ii
->isFlat()) {
492 // Is there an available VRF->Global memory read bus?
497 // Is there an available VRF->LDS read bus?
501 if (!glbMemIssueRdy
) {
502 // Is wave slot free?
506 if (!locMemIssueRdy
) {
509 if (!computeUnit
->globalMemoryPipe
.
510 isGMReqFIFOWrRdy(rdGmReqsInPipe
+ wrGmReqsInPipe
)) {
511 // Can we insert a new request to the Global Mem Request FIFO?
515 if (!computeUnit
->localMemoryPipe
.
516 isLMReqFIFOWrRdy(rdLmReqsInPipe
+ wrLmReqsInPipe
)) {
517 // Can we insert a new request to the LDS Request FIFO?
520 // can we schedule source & destination operands on the VRF?
521 if (!computeUnit
->vrf
[simdId
]->vrfOperandAccessReady(this, ii
,
522 VrfAccessType::RD_WR
)) {
525 // are all the operands ready? (RAW, WAW and WAR depedencies met?)
526 if (!computeUnit
->vrf
[simdId
]->operandsReady(this, ii
)) {
536 DPRINTF(GPUExec
, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit
->cu_id
,
537 simdId
, wfSlotId
, ii
->disassemble());
542 Wavefront::updateResources()
544 // Get current instruction
545 GPUDynInstPtr ii
= instructionBuffer
.front();
547 computeUnit
->vrf
[simdId
]->updateResources(this, ii
);
548 // Single precision ALU or Branch or Return or Special instruction
549 if (ii
->isALU() || ii
->isSpecialOp() ||
551 // FIXME: Kernel argument loads are currently treated as ALU operations
552 // since we don't send memory packets at execution. If we fix that then
553 // we should map them to one of the memory pipelines
554 (ii
->isKernArgSeg() && ii
->isLoad()) || ii
->isArgSeg() ||
556 computeUnit
->aluPipe
[simdId
].preset(computeUnit
->shader
->
557 ticks(computeUnit
->spBypassLength()));
558 // this is to enforce a fixed number of cycles per issue slot per SIMD
559 computeUnit
->wfWait
[simdId
].preset(computeUnit
->shader
->
560 ticks(computeUnit
->issuePeriod
));
561 } else if (ii
->isBarrier()) {
562 computeUnit
->wfWait
[simdId
].preset(computeUnit
->shader
->
563 ticks(computeUnit
->issuePeriod
));
564 } else if (ii
->isLoad() && ii
->isFlat()) {
565 assert(Enums::SC_NONE
!= ii
->executedAs());
568 if ( Enums::SC_SHARED
== ii
->executedAs() ) {
569 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
570 preset(computeUnit
->shader
->ticks(4));
571 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
572 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
574 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
575 preset(computeUnit
->shader
->ticks(4));
576 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
577 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
579 } else if (ii
->isStore() && ii
->isFlat()) {
580 assert(Enums::SC_NONE
!= ii
->executedAs());
583 if (Enums::SC_SHARED
== ii
->executedAs()) {
584 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
585 preset(computeUnit
->shader
->ticks(8));
586 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
587 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
589 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
590 preset(computeUnit
->shader
->ticks(8));
591 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
592 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
594 } else if (ii
->isLoad() && ii
->isGlobalMem()) {
597 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
598 preset(computeUnit
->shader
->ticks(4));
599 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
600 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
601 } else if (ii
->isStore() && ii
->isGlobalMem()) {
604 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
605 preset(computeUnit
->shader
->ticks(8));
606 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
607 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
608 } else if ((ii
->isAtomic() || ii
->isMemFence()) && ii
->isGlobalMem()) {
612 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
613 preset(computeUnit
->shader
->ticks(8));
614 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
615 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
616 } else if (ii
->isLoad() && ii
->isLocalMem()) {
619 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
620 preset(computeUnit
->shader
->ticks(4));
621 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
622 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
623 } else if (ii
->isStore() && ii
->isLocalMem()) {
626 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
627 preset(computeUnit
->shader
->ticks(8));
628 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
629 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
630 } else if ((ii
->isAtomic() || ii
->isMemFence()) && ii
->isLocalMem()) {
634 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
635 preset(computeUnit
->shader
->ticks(8));
636 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
637 preset(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
644 // ---- Exit if wavefront is inactive ----------------------------- //
646 if (status
== S_STOPPED
|| status
== S_RETURNING
||
647 instructionBuffer
.empty()) {
651 // Get current instruction
653 GPUDynInstPtr ii
= instructionBuffer
.front();
655 const uint32_t old_pc
= pc();
656 DPRINTF(GPUExec
, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
657 "(pc: %i)\n", computeUnit
->cu_id
, simdId
, wfSlotId
, wfDynId
,
658 ii
->disassemble(), old_pc
);
660 // update the instruction stats in the CU
663 computeUnit
->updateInstStats(ii
);
665 computeUnit
->vrf
[simdId
]->exec(ii
, this);
666 srcRegOpDist
.sample(ii
->numSrcRegOperands());
667 dstRegOpDist
.sample(ii
->numDstRegOperands());
668 computeUnit
->numInstrExecuted
++;
669 computeUnit
->execRateDist
.sample(computeUnit
->totalCycles
.value() -
670 computeUnit
->lastExecCycle
[simdId
]);
671 computeUnit
->lastExecCycle
[simdId
] = computeUnit
->totalCycles
.value();
672 if (pc() == old_pc
) {
673 uint32_t new_pc
= _gpuISA
.advancePC(old_pc
, ii
);
674 // PC not modified by instruction, proceed to next or pop frame
676 if (new_pc
== rpc()) {
677 popFromReconvergenceStack();
680 instructionBuffer
.pop_front();
686 if (computeUnit
->shader
->hsail_mode
==Shader::SIMT
) {
687 const int num_active_lanes
= execMask().count();
688 computeUnit
->controlFlowDivergenceDist
.sample(num_active_lanes
);
689 computeUnit
->numVecOpsExecuted
+= num_active_lanes
;
690 if (isGmInstruction(ii
)) {
691 computeUnit
->activeLanesPerGMemInstrDist
.sample(num_active_lanes
);
692 } else if (isLmInstruction(ii
)) {
693 computeUnit
->activeLanesPerLMemInstrDist
.sample(num_active_lanes
);
697 // ---- Update Vector ALU pipeline and other resources ------------------ //
698 // Single precision ALU or Branch or Return or Special instruction
699 if (ii
->isALU() || ii
->isSpecialOp() ||
701 // FIXME: Kernel argument loads are currently treated as ALU operations
702 // since we don't send memory packets at execution. If we fix that then
703 // we should map them to one of the memory pipelines
704 (ii
->isKernArgSeg() && ii
->isLoad()) ||
707 computeUnit
->aluPipe
[simdId
].set(computeUnit
->shader
->
708 ticks(computeUnit
->spBypassLength()));
710 // this is to enforce a fixed number of cycles per issue slot per SIMD
711 computeUnit
->wfWait
[simdId
].set(computeUnit
->shader
->
712 ticks(computeUnit
->issuePeriod
));
713 } else if (ii
->isBarrier()) {
714 computeUnit
->wfWait
[simdId
].set(computeUnit
->shader
->
715 ticks(computeUnit
->issuePeriod
));
716 } else if (ii
->isLoad() && ii
->isFlat()) {
717 assert(Enums::SC_NONE
!= ii
->executedAs());
719 if (Enums::SC_SHARED
== ii
->executedAs()) {
720 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
721 set(computeUnit
->shader
->ticks(4));
722 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
723 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
725 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
726 set(computeUnit
->shader
->ticks(4));
727 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
728 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
730 } else if (ii
->isStore() && ii
->isFlat()) {
731 assert(Enums::SC_NONE
!= ii
->executedAs());
732 if (Enums::SC_SHARED
== ii
->executedAs()) {
733 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
734 set(computeUnit
->shader
->ticks(8));
735 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
736 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
738 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
739 set(computeUnit
->shader
->ticks(8));
740 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
741 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
743 } else if (ii
->isLoad() && ii
->isGlobalMem()) {
744 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
745 set(computeUnit
->shader
->ticks(4));
746 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
747 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
748 } else if (ii
->isStore() && ii
->isGlobalMem()) {
749 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
750 set(computeUnit
->shader
->ticks(8));
751 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
752 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
753 } else if ((ii
->isAtomic() || ii
->isMemFence()) && ii
->isGlobalMem()) {
754 computeUnit
->vrfToGlobalMemPipeBus
[computeUnit
->nextGlbRdBus()].
755 set(computeUnit
->shader
->ticks(8));
756 computeUnit
->wfWait
[computeUnit
->GlbMemUnitId()].
757 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
758 } else if (ii
->isLoad() && ii
->isLocalMem()) {
759 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
760 set(computeUnit
->shader
->ticks(4));
761 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
762 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
763 } else if (ii
->isStore() && ii
->isLocalMem()) {
764 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
765 set(computeUnit
->shader
->ticks(8));
766 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
767 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
768 } else if ((ii
->isAtomic() || ii
->isMemFence()) && ii
->isLocalMem()) {
769 computeUnit
->vrfToLocalMemPipeBus
[computeUnit
->nextLocRdBus()].
770 set(computeUnit
->shader
->ticks(8));
771 computeUnit
->wfWait
[computeUnit
->ShrMemUnitId()].
772 set(computeUnit
->shader
->ticks(computeUnit
->issuePeriod
));
777 Wavefront::waitingAtBarrier(int lane
)
779 return barCnt
[lane
] < maxBarCnt
;
783 Wavefront::pushToReconvergenceStack(uint32_t pc
, uint32_t rpc
,
784 const VectorMask
& mask
)
786 assert(mask
.count());
787 reconvergenceStack
.emplace_back(new ReconvergenceStackEntry
{pc
, rpc
, mask
});
791 Wavefront::popFromReconvergenceStack()
793 assert(!reconvergenceStack
.empty());
795 DPRINTF(WavefrontStack
, "[%2d, %2d, %2d, %2d] %s %3i => ",
796 computeUnit
->cu_id
, simdId
, wfSlotId
, wfDynId
,
797 execMask().to_string
<char, std::string::traits_type
,
798 std::string::allocator_type
>().c_str(), pc());
800 reconvergenceStack
.pop_back();
802 DPRINTF(WavefrontStack
, "%3i %s\n", pc(),
803 execMask().to_string
<char, std::string::traits_type
,
804 std::string::allocator_type
>().c_str());
809 Wavefront::discardFetch()
811 instructionBuffer
.clear();
812 dropFetch
|=pendingFetch
;
816 Wavefront::pc() const
818 return reconvergenceStack
.back()->pc
;
822 Wavefront::rpc() const
824 return reconvergenceStack
.back()->rpc
;
828 Wavefront::execMask() const
830 return reconvergenceStack
.back()->execMask
;
834 Wavefront::execMask(int lane
) const
836 return reconvergenceStack
.back()->execMask
[lane
];
841 Wavefront::pc(uint32_t new_pc
)
843 reconvergenceStack
.back()->pc
= new_pc
;
847 Wavefront::getStaticContextSize() const
849 return barCnt
.size() * sizeof(int) + sizeof(wfId
) + sizeof(maxBarCnt
) +
850 sizeof(oldBarrierCnt
) + sizeof(barrierCnt
) + sizeof(wgId
) +
851 sizeof(computeUnit
->cu_id
) + sizeof(barrierId
) + sizeof(initMask
) +
852 sizeof(privBase
) + sizeof(spillBase
) + sizeof(ldsChunk
) +
853 computeUnit
->wfSize() * sizeof(ReconvergenceStackEntry
);
857 Wavefront::getContext(const void *out
)
859 uint8_t *iter
= (uint8_t *)out
;
860 for (int i
= 0; i
< barCnt
.size(); i
++) {
861 *(int *)iter
= barCnt
[i
]; iter
+= sizeof(barCnt
[i
]);
863 *(int *)iter
= wfId
; iter
+= sizeof(wfId
);
864 *(int *)iter
= maxBarCnt
; iter
+= sizeof(maxBarCnt
);
865 *(int *)iter
= oldBarrierCnt
; iter
+= sizeof(oldBarrierCnt
);
866 *(int *)iter
= barrierCnt
; iter
+= sizeof(barrierCnt
);
867 *(int *)iter
= computeUnit
->cu_id
; iter
+= sizeof(computeUnit
->cu_id
);
868 *(uint32_t *)iter
= wgId
; iter
+= sizeof(wgId
);
869 *(uint32_t *)iter
= barrierId
; iter
+= sizeof(barrierId
);
870 *(uint64_t *)iter
= initMask
.to_ullong(); iter
+= sizeof(initMask
.to_ullong());
871 *(Addr
*)iter
= privBase
; iter
+= sizeof(privBase
);
872 *(Addr
*)iter
= spillBase
; iter
+= sizeof(spillBase
);
874 int stackSize
= reconvergenceStack
.size();
875 ReconvergenceStackEntry empty
= {std::numeric_limits
<uint32_t>::max(),
876 std::numeric_limits
<uint32_t>::max(),
877 std::numeric_limits
<uint64_t>::max()};
878 for (int i
= 0; i
< workItemId
[0].size(); i
++) {
880 *(ReconvergenceStackEntry
*)iter
= *reconvergenceStack
.back();
881 iter
+= sizeof(ReconvergenceStackEntry
);
882 reconvergenceStack
.pop_back();
884 *(ReconvergenceStackEntry
*)iter
= empty
;
885 iter
+= sizeof(ReconvergenceStackEntry
);
889 int wf_size
= computeUnit
->wfSize();
890 for (int i
= 0; i
< maxSpVgprs
; i
++) {
891 uint32_t vgprIdx
= remap(i
, sizeof(uint32_t), 1);
892 for (int lane
= 0; lane
< wf_size
; lane
++) {
893 uint32_t regVal
= computeUnit
->vrf
[simdId
]->
894 read
<uint32_t>(vgprIdx
,lane
);
895 *(uint32_t *)iter
= regVal
; iter
+= sizeof(regVal
);
899 for (int i
= 0; i
< maxDpVgprs
; i
++) {
900 uint32_t vgprIdx
= remap(i
, sizeof(uint64_t), 1);
901 for (int lane
= 0; lane
< wf_size
; lane
++) {
902 uint64_t regVal
= computeUnit
->vrf
[simdId
]->
903 read
<uint64_t>(vgprIdx
,lane
);
904 *(uint64_t *)iter
= regVal
; iter
+= sizeof(regVal
);
908 for (int i
= 0; i
< condRegState
->numRegs(); i
++) {
909 for (int lane
= 0; lane
< wf_size
; lane
++) {
910 uint64_t regVal
= condRegState
->read
<uint64_t>(i
, lane
);
911 *(uint64_t *)iter
= regVal
; iter
+= sizeof(regVal
);
915 /* saving LDS content */
917 for (int i
= 0; i
< ldsChunk
->size(); i
++) {
918 char val
= ldsChunk
->read
<char>(i
);
919 *(char *) iter
= val
; iter
+= sizeof(val
);
924 Wavefront::setContext(const void *in
)
926 uint8_t *iter
= (uint8_t *)in
;
927 for (int i
= 0; i
< barCnt
.size(); i
++) {
928 barCnt
[i
] = *(int *)iter
; iter
+= sizeof(barCnt
[i
]);
930 wfId
= *(int *)iter
; iter
+= sizeof(wfId
);
931 maxBarCnt
= *(int *)iter
; iter
+= sizeof(maxBarCnt
);
932 oldBarrierCnt
= *(int *)iter
; iter
+= sizeof(oldBarrierCnt
);
933 barrierCnt
= *(int *)iter
; iter
+= sizeof(barrierCnt
);
934 computeUnit
->cu_id
= *(int *)iter
; iter
+= sizeof(computeUnit
->cu_id
);
935 wgId
= *(uint32_t *)iter
; iter
+= sizeof(wgId
);
936 barrierId
= *(uint32_t *)iter
; iter
+= sizeof(barrierId
);
937 initMask
= VectorMask(*(uint64_t *)iter
); iter
+= sizeof(initMask
);
938 privBase
= *(Addr
*)iter
; iter
+= sizeof(privBase
);
939 spillBase
= *(Addr
*)iter
; iter
+= sizeof(spillBase
);
941 for (int i
= 0; i
< workItemId
[0].size(); i
++) {
942 ReconvergenceStackEntry newEntry
= *(ReconvergenceStackEntry
*)iter
;
943 iter
+= sizeof(ReconvergenceStackEntry
);
944 if (newEntry
.pc
!= std::numeric_limits
<uint32_t>::max()) {
945 pushToReconvergenceStack(newEntry
.pc
, newEntry
.rpc
,
949 int wf_size
= computeUnit
->wfSize();
951 for (int i
= 0; i
< maxSpVgprs
; i
++) {
952 uint32_t vgprIdx
= remap(i
, sizeof(uint32_t), 1);
953 for (int lane
= 0; lane
< wf_size
; lane
++) {
954 uint32_t regVal
= *(uint32_t *)iter
; iter
+= sizeof(regVal
);
955 computeUnit
->vrf
[simdId
]->write
<uint32_t>(vgprIdx
, regVal
, lane
);
959 for (int i
= 0; i
< maxDpVgprs
; i
++) {
960 uint32_t vgprIdx
= remap(i
, sizeof(uint64_t), 1);
961 for (int lane
= 0; lane
< wf_size
; lane
++) {
962 uint64_t regVal
= *(uint64_t *)iter
; iter
+= sizeof(regVal
);
963 computeUnit
->vrf
[simdId
]->write
<uint64_t>(vgprIdx
, regVal
, lane
);
967 for (int i
= 0; i
< condRegState
->numRegs(); i
++) {
968 for (int lane
= 0; lane
< wf_size
; lane
++) {
969 uint64_t regVal
= *(uint64_t *)iter
; iter
+= sizeof(regVal
);
970 condRegState
->write
<uint64_t>(i
, lane
, regVal
);
973 /** Restoring LDS contents */
975 for (int i
= 0; i
< ldsChunk
->size(); i
++) {
976 char val
= *(char *) iter
; iter
+= sizeof(val
);
977 ldsChunk
->write
<char>(i
, val
);
982 Wavefront::computeActualWgSz(NDRange
*ndr
)
985 for (int d
= 0; d
< 3; ++d
) {
986 actualWgSz
[d
] = std::min(workGroupSz
[d
],
987 gridSz
[d
] - ndr
->wgId
[d
] * workGroupSz
[d
]);
988 actualWgSzTotal
*= actualWgSz
[d
];