2 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
34 #include "gpu-compute/wavefront.hh"
36 #include "base/bitfield.hh"
37 #include "debug/GPUExec.hh"
38 #include "debug/GPUInitAbi.hh"
39 #include "debug/WavefrontStack.hh"
40 #include "gpu-compute/compute_unit.hh"
41 #include "gpu-compute/gpu_dyn_inst.hh"
42 #include "gpu-compute/scalar_register_file.hh"
43 #include "gpu-compute/shader.hh"
44 #include "gpu-compute/simple_pool_manager.hh"
45 #include "gpu-compute/vector_register_file.hh"
// Constructor. Forwards p to SimObject and copies the slot id, SIMD id and
// maximum instruction-buffer size out of the params. The three wait counters
// start at -1, the three issued-instruction counters at 0, and barId at
// WFBarrier::InvalidID.
// The visible body then zeroes the reserved register counts and the
// outstanding / in-pipe request counters, and sizes the per-lane vectors
// (oldVgpr, lastAddr, workItemFlatId, oldDgpr and the three workItemId
// dimensions) to p.wf_size entries.
// NOTE(review): this listing is missing lines (braces and some member
// initialisations), so only the visible statements are described here.
47 Wavefront::Wavefront(const Params
&p
)
48 : SimObject(p
), wfSlotId(p
.wf_slot_id
), simdId(p
.simdId
),
49 maxIbSize(p
.max_ib_size
), _gpuISA(*this),
50 vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
51 vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
52 barId(WFBarrier::InvalidID
)
57 reservedVectorRegs
= 0;
58 reservedScalarRegs
= 0;
62 outstandingReqsWrGm
= 0;
63 outstandingReqsWrLm
= 0;
64 outstandingReqsRdGm
= 0;
65 outstandingReqsRdLm
= 0;
70 scalarRdGmReqsInPipe
= 0;
71 scalarWrGmReqsInPipe
= 0;
72 scalarOutstandingReqsRdGm
= 0;
73 scalarOutstandingReqsWrGm
= 0;
// Both counters are seeded with the all-ones 64-bit pattern.
78 oldVgprTcnt
= 0xffffffffffffffffll
;
79 oldDgprTcnt
= 0xffffffffffffffffll
;
80 oldVgpr
.resize(p
.wf_size
);
87 lastAddr
.resize(p
.wf_size
);
88 workItemFlatId
.resize(p
.wf_size
);
89 oldDgpr
.resize(p
.wf_size
);
// One work-item id vector per dimension (indices 0, 1 and 2).
90 for (int i
= 0; i
< 3; ++i
) {
91 workItemId
[i
].resize(p
.wf_size
);
// Registers the statistics of this wavefront. Calls SimObject::regStats()
// first, then gives each statistic a name built as name() plus a fixed
// suffix, together with a human-readable description.
// NOTE(review): for most statistics the line naming the stat object itself
// is missing from this listing; only the .name()/.desc() calls are visible.
101 Wavefront::regStats()
103 SimObject::regStats();
105 // FIXME: the name of the WF needs to be unique
106 numTimesBlockedDueWAXDependencies
107 .name(name() + ".timesBlockedDueWAXDependencies")
108 .desc("number of times the wf's instructions are blocked due to WAW "
109 "or WAR dependencies")
112 // FIXME: the name of the WF needs to be unique
113 numTimesBlockedDueRAWDependencies
114 .name(name() + ".timesBlockedDueRAWDependencies")
115 .desc("number of times the wf's instructions are blocked due to RAW "
120 .name(name() + ".num_instr_executed")
121 .desc("number of instructions executed by this WF slot")
125 .name(name() + ".sch_cycles")
126 .desc("number of cycles spent in schedule stage")
130 .name(name() + ".sch_stalls")
131 .desc("number of cycles WF is stalled in SCH stage")
135 .name(name() + ".sch_rf_access_stalls")
136 .desc("number of cycles wave selected in SCH but RF denied adding "
141 .name(name() + ".sch_resource_stalls")
142 .desc("number of cycles stalled in sch by resource not available")
146 .name(name() + ".sch_opd_nrdy_stalls")
147 .desc("number of cycles stalled in sch waiting for RF reads to "
152 .name(name() + ".sch_lds_arb_stalls")
153 .desc("number of cycles wave stalled due to LDS-VRF arbitration")
158 .name(name() + ".vec_raw_distance")
159 .desc("Count of RAW distance in dynamic instructions for this WF")
164 .name(name() + ".vec_reads_per_write")
165 .desc("Count of Vector reads per write for this WF")
172 reservedVectorRegs
= 0;
173 reservedScalarRegs
= 0;
177 scalarAlu
= computeUnit
->mapWaveToScalarAlu(this);
178 scalarAluGlobalIdx
= computeUnit
->mapWaveToScalarAluGlobalIdx(this);
179 globalMem
= computeUnit
->mapWaveToGlobalMem(this);
180 localMem
= computeUnit
->mapWaveToLocalMem(this);
181 scalarMem
= computeUnit
->mapWaveToScalarMem(this);
185 Wavefront::initRegState(HSAQueueEntry
*task
, int wgSizeInWorkItems
)
189 // iterate over all the init fields and check which
191 for (int en_bit
= 0; en_bit
< NumScalarInitFields
; ++en_bit
) {
193 if (task
->sgprBitEnabled(en_bit
)) {
195 uint32_t wiCount
= 0;
196 uint32_t firstWave
= 0;
197 int orderedAppendTerm
= 0;
199 uint32_t finalValue
= 0;
200 Addr host_disp_pkt_addr
= task
->hostDispPktAddr();
201 Addr kernarg_addr
= task
->kernargAddr();
202 Addr
hidden_priv_base(0);
207 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
208 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
209 task
->amdQueue
.scratch_resource_descriptor
[0]);
211 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
212 "Setting PrivateSegBuffer: s[%d] = %x\n",
213 computeUnit
->cu_id
, simdId
,
214 wfSlotId
, wfDynId
, physSgprIdx
,
215 task
->amdQueue
.scratch_resource_descriptor
[0]);
218 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
219 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
220 task
->amdQueue
.scratch_resource_descriptor
[1]);
222 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
223 "Setting PrivateSegBuffer: s[%d] = %x\n",
224 computeUnit
->cu_id
, simdId
,
225 wfSlotId
, wfDynId
, physSgprIdx
,
226 task
->amdQueue
.scratch_resource_descriptor
[1]);
229 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
230 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
231 task
->amdQueue
.scratch_resource_descriptor
[2]);
233 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
234 "Setting PrivateSegBuffer: s[%d] = %x\n",
235 computeUnit
->cu_id
, simdId
,
236 wfSlotId
, wfDynId
, physSgprIdx
,
237 task
->amdQueue
.scratch_resource_descriptor
[2]);
240 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
241 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
242 task
->amdQueue
.scratch_resource_descriptor
[3]);
245 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
246 "Setting PrivateSegBuffer: s[%d] = %x\n",
247 computeUnit
->cu_id
, simdId
,
248 wfSlotId
, wfDynId
, physSgprIdx
,
249 task
->amdQueue
.scratch_resource_descriptor
[3]);
253 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
254 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
255 bits(host_disp_pkt_addr
, 31, 0));
257 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
258 "Setting DispatchPtr: s[%d] = %x\n",
259 computeUnit
->cu_id
, simdId
,
260 wfSlotId
, wfDynId
, physSgprIdx
,
261 bits(host_disp_pkt_addr
, 31, 0));
264 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
265 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
266 bits(host_disp_pkt_addr
, 63, 32));
267 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
268 "Setting DispatchPtr: s[%d] = %x\n",
269 computeUnit
->cu_id
, simdId
,
270 wfSlotId
, wfDynId
, physSgprIdx
,
271 bits(host_disp_pkt_addr
, 63, 32));
277 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
278 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
279 bits(task
->hostAMDQueueAddr
, 31, 0));
281 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
282 "Setting QueuePtr: s[%d] = %x\n",
283 computeUnit
->cu_id
, simdId
,
284 wfSlotId
, wfDynId
, physSgprIdx
,
285 bits(task
->hostAMDQueueAddr
, 31, 0));
288 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
289 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
290 bits(task
->hostAMDQueueAddr
, 63, 32));
291 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
292 "Setting QueuePtr: s[%d] = %x\n",
293 computeUnit
->cu_id
, simdId
,
294 wfSlotId
, wfDynId
, physSgprIdx
,
295 bits(task
->hostAMDQueueAddr
, 63, 32));
301 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
302 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
303 bits(kernarg_addr
, 31, 0));
305 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
306 "Setting KernargSegPtr: s[%d] = %x\n",
307 computeUnit
->cu_id
, simdId
,
308 wfSlotId
, wfDynId
, physSgprIdx
,
309 bits(kernarg_addr
, 31, 0));
312 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
313 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
314 bits(kernarg_addr
, 63, 32));
315 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
316 "Setting KernargSegPtr: s[%d] = %x\n",
317 computeUnit
->cu_id
, simdId
,
318 wfSlotId
, wfDynId
, physSgprIdx
,
319 bits(kernarg_addr
, 63, 32));
323 case FlatScratchInit
:
325 = computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
326 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
327 (TheGpuISA::ScalarRegU32
)(task
->amdQueue
328 .scratch_backing_memory_location
& 0xffffffff));
330 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
331 "Setting FlatScratch Addr: s[%d] = %x\n",
332 computeUnit
->cu_id
, simdId
,
333 wfSlotId
, wfDynId
, physSgprIdx
,
334 (TheGpuISA::ScalarRegU32
)(task
->amdQueue
335 .scratch_backing_memory_location
& 0xffffffff));
338 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
339 // This value should be sizeof(DWORD) aligned, that is
341 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
342 task
->amdQueue
.scratch_workitem_byte_size
);
344 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
345 "Setting FlatScratch size: s[%d] = %x\n",
346 computeUnit
->cu_id
, simdId
,
347 wfSlotId
, wfDynId
, physSgprIdx
,
348 task
->amdQueue
.scratch_workitem_byte_size
);
350 * Since flat scratch init is needed for this kernel, this
351 * kernel is going to have flat memory instructions and we
352 * need to initialize the hidden private base for this queue.
353 * scratch_resource_descriptor[0] has this queue's scratch
354 * base address. scratch_backing_memory_location has the
355 * offset to this queue's scratch base address from the
356 * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
357 * queue's scratch base address for address calculation
358 * (stored in scratch_resource_descriptor[0]). But that
359 * address calculation should be done by first finding the
360 * queue's scratch base address using the calculation
361 * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
362 * SH_HIDDEN_PRIVATE_BASE_VMID.
364 * For more details see:
365 * http://rocm-documentation.readthedocs.io/en/latest/
366 * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
368 * https://github.com/ROCm-Developer-Tools/
369 * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
373 (uint64_t)task
->amdQueue
.scratch_resource_descriptor
[0] |
374 (((uint64_t)task
->amdQueue
.scratch_resource_descriptor
[1]
375 & 0x000000000000ffff) << 32);
376 computeUnit
->shader
->initShHiddenPrivateBase(
378 task
->amdQueue
.scratch_backing_memory_location
);
380 case GridWorkgroupCountX
:
382 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
383 wiCount
= ((task
->gridSize(0) +
384 task
->wgSize(0) - 1) /
386 computeUnit
->srf
[simdId
]->write(physSgprIdx
, wiCount
);
389 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
390 "Setting num WG X: s[%d] = %x\n",
391 computeUnit
->cu_id
, simdId
,
392 wfSlotId
, wfDynId
, physSgprIdx
, wiCount
);
394 case GridWorkgroupCountY
:
396 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
397 wiCount
= ((task
->gridSize(1) +
398 task
->wgSize(1) - 1) /
400 computeUnit
->srf
[simdId
]->write(physSgprIdx
, wiCount
);
403 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
404 "Setting num WG Y: s[%d] = %x\n",
405 computeUnit
->cu_id
, simdId
,
406 wfSlotId
, wfDynId
, physSgprIdx
, wiCount
);
408 case GridWorkgroupCountZ
:
410 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
411 wiCount
= ((task
->gridSize(2) +
412 task
->wgSize(2) - 1) /
414 computeUnit
->srf
[simdId
]->write(physSgprIdx
, wiCount
);
417 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
418 "Setting num WG Z: s[%d] = %x\n",
419 computeUnit
->cu_id
, simdId
,
420 wfSlotId
, wfDynId
, physSgprIdx
, wiCount
);
424 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
425 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
429 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
430 "Setting WG ID X: s[%d] = %x\n",
431 computeUnit
->cu_id
, simdId
,
432 wfSlotId
, wfDynId
, physSgprIdx
, workGroupId
[0]);
436 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
437 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
441 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
442 "Setting WG ID Y: s[%d] = %x\n",
443 computeUnit
->cu_id
, simdId
,
444 wfSlotId
, wfDynId
, physSgprIdx
, workGroupId
[1]);
448 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
449 computeUnit
->srf
[simdId
]->write(physSgprIdx
,
453 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
454 "Setting WG ID Z: s[%d] = %x\n",
455 computeUnit
->cu_id
, simdId
,
456 wfSlotId
, wfDynId
, physSgprIdx
, workGroupId
[2]);
458 case PrivSegWaveByteOffset
:
460 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
462 * the compute_tmpring_size_wavesize specifies the number of
463 * kB allocated per wavefront, hence the multiplication by
466 * to get the per wavefront offset into the scratch
467 * memory, we also multiply this by the wfId. the wfId stored
468 * in the Wavefront class, however, is the wave ID within the
469 * WG, whereas here we need the global WFID because the
470 * scratch space will be divided amongst all waves in the
471 * kernel. to get the global ID we multiply the WGID by
472 * the WG size, then add the WFID of the wave within its WG.
474 computeUnit
->srf
[simdId
]->write(physSgprIdx
, 1024 *
475 (wgId
* (wgSz
/ 64) + wfId
) *
476 task
->amdQueue
.compute_tmpring_size_wavesize
);
479 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
480 "Setting Private Seg Offset: s[%d] = %x\n",
481 computeUnit
->cu_id
, simdId
,
482 wfSlotId
, wfDynId
, physSgprIdx
,
483 1024 * (wgId
* (wgSz
/ 64) + wfId
) *
484 task
->amdQueue
.compute_tmpring_size_wavesize
);
487 firstWave
= (wfId
== 0) ? 1 : 0;
488 numWfsInWg
= divCeil(wgSizeInWorkItems
,
489 computeUnit
->wfSize());
490 finalValue
= firstWave
<< ((sizeof(uint32_t) * 8) - 1);
491 finalValue
|= (orderedAppendTerm
<< 6);
492 finalValue
|= numWfsInWg
;
494 computeUnit
->registerManager
->mapSgpr(this, regInitIdx
);
495 computeUnit
->srf
[simdId
]->
496 write(physSgprIdx
, finalValue
);
499 DPRINTF(GPUInitAbi
, "CU%d: WF[%d][%d]: wave[%d] "
500 "Setting WG Info: s[%d] = %x\n",
501 computeUnit
->cu_id
, simdId
,
502 wfSlotId
, wfDynId
, physSgprIdx
, finalValue
);
505 fatal("SGPR enable bit %i not supported\n", en_bit
);
513 // iterate over all the init fields and check which
515 for (int en_bit
= 0; en_bit
< NumVectorInitFields
; ++en_bit
) {
516 if (task
->vgprBitEnabled(en_bit
)) {
517 uint32_t physVgprIdx
= 0;
518 TheGpuISA::VecRegContainerU32 raw_vgpr
;
523 physVgprIdx
= computeUnit
->registerManager
524 ->mapVgpr(this, regInitIdx
);
525 TheGpuISA::VecRegU32 vgpr_x
526 = raw_vgpr
.as
<TheGpuISA::VecElemU32
>();
528 for (int lane
= 0; lane
< workItemId
[0].size(); ++lane
) {
529 vgpr_x
[lane
] = workItemId
[0][lane
];
532 computeUnit
->vrf
[simdId
]->write(physVgprIdx
, raw_vgpr
);
533 rawDist
[regInitIdx
] = 0;
539 physVgprIdx
= computeUnit
->registerManager
540 ->mapVgpr(this, regInitIdx
);
541 TheGpuISA::VecRegU32 vgpr_y
542 = raw_vgpr
.as
<TheGpuISA::VecElemU32
>();
544 for (int lane
= 0; lane
< workItemId
[1].size(); ++lane
) {
545 vgpr_y
[lane
] = workItemId
[1][lane
];
548 computeUnit
->vrf
[simdId
]->write(physVgprIdx
, raw_vgpr
);
549 rawDist
[regInitIdx
] = 0;
555 physVgprIdx
= computeUnit
->registerManager
->
556 mapVgpr(this, regInitIdx
);
557 TheGpuISA::VecRegU32 vgpr_z
558 = raw_vgpr
.as
<TheGpuISA::VecElemU32
>();
560 for (int lane
= 0; lane
< workItemId
[2].size(); ++lane
) {
561 vgpr_z
[lane
] = workItemId
[2][lane
];
564 computeUnit
->vrf
[simdId
]->write(physVgprIdx
, raw_vgpr
);
565 rawDist
[regInitIdx
] = 0;
// Records the register budget of this wavefront: num_vregs is stored in
// maxVgprs and num_sregs in maxSgprs.
// NOTE(review): the return type and braces are missing from this listing;
// only the two visible assignments are described.
575 Wavefront::resizeRegFiles(int num_vregs
, int num_sregs
)
577 maxVgprs
= num_vregs
;
578 maxSgprs
= num_sregs
;
581 Wavefront::~Wavefront()
// Status change hook that maintains the idle-wavefront bookkeeping of the
// compute unit. The bookkeeping only runs when computeUnit->idleCUTimeout
// is greater than zero.
//
// Four states are treated as idle here: S_STOPPED, S_STALLED, S_WAITCNT and
// S_BARRIER.
//  - Changing to an idle state increments computeUnit->idleWfs, asserts the
//    count does not exceed shader->n_wf * numVectorALUs, and stamps
//    lastNonIdleTick with curTick() once the count reaches that product.
//  - Otherwise, changing away from an idle state checks (when the count
//    equals that product) that the idle period has not reached
//    idleCUTimeout, panicking if it has; idleWfs is decremented and asserted
//    to be non-negative.
//
// newStatus: the status the wavefront is moving to.
// NOTE(review): braces and the statement that stores newStatus into status
// are missing from this listing, so exact nesting cannot be confirmed here.
586 Wavefront::setStatus(status_e newStatus
)
588 if (computeUnit
->idleCUTimeout
> 0) {
589 // Wavefront's status transitions to stalled or stopped
// NOTE(review): this test does not require the previous status to be an
// active one, so an idle-to-idle change (for example S_STALLED to
// S_WAITCNT) also increments idleWfs without a matching decrement --
// confirm that this is intended.
590 if ((newStatus
== S_STOPPED
|| newStatus
== S_STALLED
||
591 newStatus
== S_WAITCNT
|| newStatus
== S_BARRIER
) &&
592 (status
!= newStatus
)) {
593 computeUnit
->idleWfs
++;
594 assert(computeUnit
->idleWfs
<=
595 (computeUnit
->shader
->n_wf
* computeUnit
->numVectorALUs
));
596 if (computeUnit
->idleWfs
==
597 (computeUnit
->shader
->n_wf
* computeUnit
->numVectorALUs
)) {
598 lastNonIdleTick
= curTick();
600 // Wavefront's status transitions to an active state (from
601 // a stopped or stalled state)
602 } else if ((status
== S_STOPPED
|| status
== S_STALLED
||
603 status
== S_WAITCNT
|| status
== S_BARRIER
) &&
604 (status
!= newStatus
)) {
605 // if all WFs in the CU were idle then check if the idleness
606 // period exceeded the timeout threshold
607 if (computeUnit
->idleWfs
==
608 (computeUnit
->shader
->n_wf
* computeUnit
->numVectorALUs
)) {
609 panic_if((curTick() - lastNonIdleTick
) >=
610 computeUnit
->idleCUTimeout
,
611 "CU%d has been idle for %d ticks at tick %d",
612 computeUnit
->cu_id
, computeUnit
->idleCUTimeout
,
615 computeUnit
->idleWfs
--;
616 assert(computeUnit
->idleWfs
>= 0);
// Starts a wavefront. The visible statements store _wf_dyn_id in wfDynId and
// resize vecReads to maxVgprs entries, filling new entries with 0.
// NOTE(review): init_pc is not used on any visible line; the lines that
// consume it are presumably among those missing from this listing.
623 Wavefront::start(uint64_t _wf_dyn_id
, Addr init_pc
)
625 wfDynId
= _wf_dyn_id
;
630 vecReads
.resize(maxVgprs
, 0);
// Tests whether ii targets global memory: either ii->isGlobalMem() holds, or
// ii is a flat instruction whose executedAs() equals Enums::SC_GLOBAL.
// NOTE(review): the return type and return statements are missing from this
// listing; presumably the result of this test is what is returned.
634 Wavefront::isGmInstruction(GPUDynInstPtr ii
)
636 if (ii
->isGlobalMem() ||
637 (ii
->isFlat() && ii
->executedAs() == Enums::SC_GLOBAL
)) {
// Tests whether ii targets local memory: either ii->isLocalMem() holds, or
// ii is a flat instruction whose executedAs() equals Enums::SC_GROUP.
// NOTE(review): the return type and return statements are missing from this
// listing; presumably the result of this test is what is returned.
645 Wavefront::isLmInstruction(GPUDynInstPtr ii
)
647 if (ii
->isLocalMem() ||
648 (ii
->isFlat() && ii
->executedAs() == Enums::SC_GROUP
)) {
// Checks whether the instruction at the front of instructionBuffer is a
// waitcnt. Unlike the sibling isOldestInst* checks, this one tests for an
// empty buffer instead of asserting on it. A waitcnt instruction is asserted
// to be scalar.
// NOTE(review): the statements taken for the empty-buffer case and the
// return statements are missing from this listing.
656 Wavefront::isOldestInstWaitcnt()
658 if (instructionBuffer
.empty())
661 GPUDynInstPtr ii
= instructionBuffer
.front();
663 if (ii
->isWaitcnt()) {
664 // waitcnt is a scalar
665 assert(ii
->isScalar());
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) should be handled as a scalar ALU operation. The visible test
// holds when status is not S_STOPPED, the instruction is scalar, and it is a
// nop, return, end-of-kernel, branch, ALU op, or a load from the kernarg
// segment.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
673 Wavefront::isOldestInstScalarALU()
675 assert(!instructionBuffer
.empty());
676 GPUDynInstPtr ii
= instructionBuffer
.front();
678 if (status
!= S_STOPPED
&& ii
->isScalar() && (ii
->isNop() || ii
->isReturn()
679 || ii
->isEndOfKernel() || ii
->isBranch() || ii
->isALU() ||
680 (ii
->isKernArgSeg() && ii
->isLoad()))) {
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) should be handled as a vector ALU operation. The visible test
// holds when status is not S_STOPPED, the instruction is NOT scalar, and it
// is a nop, return, branch, ALU op, end-of-kernel, or a load from the
// kernarg segment.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
688 Wavefront::isOldestInstVectorALU()
690 assert(!instructionBuffer
.empty());
691 GPUDynInstPtr ii
= instructionBuffer
.front();
693 if (status
!= S_STOPPED
&& !ii
->isScalar() && (ii
->isNop() ||
694 ii
->isReturn() || ii
->isBranch() || ii
->isALU() || ii
->isEndOfKernel()
695 || (ii
->isKernArgSeg() && ii
->isLoad()))) {
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) is a barrier while status is not S_STOPPED.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
703 Wavefront::isOldestInstBarrier()
705 assert(!instructionBuffer
.empty());
706 GPUDynInstPtr ii
= instructionBuffer
.front();
708 if (status
!= S_STOPPED
&& ii
->isBarrier()) {
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) is a non-scalar global-memory instruction while status is not
// S_STOPPED.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
716 Wavefront::isOldestInstGMem()
718 assert(!instructionBuffer
.empty());
719 GPUDynInstPtr ii
= instructionBuffer
.front();
721 if (status
!= S_STOPPED
&& !ii
->isScalar() && ii
->isGlobalMem()) {
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) is a scalar global-memory instruction while status is not
// S_STOPPED.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
729 Wavefront::isOldestInstScalarMem()
731 assert(!instructionBuffer
.empty());
732 GPUDynInstPtr ii
= instructionBuffer
.front();
734 if (status
!= S_STOPPED
&& ii
->isScalar() && ii
->isGlobalMem()) {
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) is a local-memory instruction while status is not S_STOPPED.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
742 Wavefront::isOldestInstLMem()
744 assert(!instructionBuffer
.empty());
745 GPUDynInstPtr ii
= instructionBuffer
.front();
747 if (status
!= S_STOPPED
&& ii
->isLocalMem()) {
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) accesses the private segment while status is not S_STOPPED.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
755 Wavefront::isOldestInstPrivMem()
757 assert(!instructionBuffer
.empty());
758 GPUDynInstPtr ii
= instructionBuffer
.front();
760 if (status
!= S_STOPPED
&& ii
->isPrivateSeg()) {
// Checks whether the instruction at the front of instructionBuffer (asserted
// non-empty) is a flat instruction while status is not S_STOPPED.
// NOTE(review): the return statements are missing from this listing;
// presumably the result of this test is what is returned.
768 Wavefront::isOldestInstFlatMem()
770 assert(!instructionBuffer
.empty());
771 GPUDynInstPtr ii
= instructionBuffer
.front();
773 if (status
!= S_STOPPED
&& ii
->isFlat()) {
// Scans every instruction currently in instructionBuffer and tests whether
// it is a return, a branch, or an end-of-kernel instruction.
// NOTE(review): the statements run on a match and the final result are
// missing from this listing; presumably a match means fetch should stop.
// NOTE(review): the loop takes each element by value and then copies it
// again into ii; if GPUDynInstPtr is a reference-counted pointer, iterating
// by const reference would avoid the copies -- confirm the type.
781 Wavefront::stopFetch()
783 for (auto it
: instructionBuffer
) {
784 GPUDynInstPtr ii
= it
;
785 if (ii
->isReturn() || ii
->isBranch() ||
786 ii
->isEndOfKernel()) {
795 Wavefront::freeResources()
// Sanity check on the request counters: panics when any of the in-pipe
// global or local memory read/write counters is negative. The panic message
// reports the dynamic wavefront id, the slot, the SIMD, the four in-pipe
// counters and outstandingReqs.
// NOTE(review): one line of the condition is missing from this listing, so
// at least one more operand is tested than is visible here.
800 void Wavefront::validateRequestCounters()
802 panic_if(wrGmReqsInPipe
< 0 || rdGmReqsInPipe
< 0 ||
803 wrLmReqsInPipe
< 0 || rdLmReqsInPipe
< 0 ||
805 "Negative requests in pipe for WF%d for slot%d"
806 " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
807 " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
808 " Outstanding Reqs=%d\n",
809 wfDynId
, wfSlotId
, simdId
, rdGmReqsInPipe
, wrGmReqsInPipe
,
810 rdLmReqsInPipe
, wrLmReqsInPipe
, outstandingReqs
);
// Reserves the global-memory path for instruction ii and selects the
// execution unit. Non-scalar instructions end with execUnitId = globalMem;
// the other path ends with execUnitId = scalarMem.
// On the scalar path the visible counter updates are: a store increments
// scalarWrGmReqsInPipe, and an atomic or memory-sync increments both
// scalarWrGmReqsInPipe and scalarRdGmReqsInPipe. Anything that matches no
// branch panics with an invalid-operation message.
// NOTE(review): the first test of each if-chain (presumably the load case)
// and the counter updates of the non-scalar path are missing from this
// listing.
814 Wavefront::reserveGmResource(GPUDynInstPtr ii
)
816 if (!ii
->isScalar()) {
819 } else if (ii
->isStore()) {
821 } else if (ii
->isAtomic() || ii
->isMemSync()) {
825 panic("Invalid memory operation!\n");
827 execUnitId
= globalMem
;
830 scalarRdGmReqsInPipe
++;
831 } else if (ii
->isStore()) {
832 scalarWrGmReqsInPipe
++;
833 } else if (ii
->isAtomic() || ii
->isMemSync()) {
834 scalarWrGmReqsInPipe
++;
835 scalarRdGmReqsInPipe
++;
837 panic("Invalid memory operation!\n");
839 execUnitId
= scalarMem
;
// Reserves the local-memory path for instruction ii and sets execUnitId to
// localMem. Scalar instructions are rejected with fatal_if. The instruction
// is then classified by an if-chain (store, atomic or memory-sync visible),
// and anything that matches no branch panics with an invalid-operation
// message.
// NOTE(review): the first test of the if-chain (presumably the load case)
// and the counter updates inside each branch are missing from this listing.
844 Wavefront::reserveLmResource(GPUDynInstPtr ii
)
846 fatal_if(ii
->isScalar(),
847 "Scalar instructions can not access Shared memory!!!");
850 } else if (ii
->isStore()) {
852 } else if (ii
->isAtomic() || ii
->isMemSync()) {
856 panic("Invalid memory operation!\n");
858 execUnitId
= localMem
;
// Reserves the execution resources needed by the instruction at the front of
// instructionBuffer and collects the ids of the execution units involved in
// execUnitIds, which is asserted to be non-empty at the end.
//
// Dispatch, in the order tested:
//  - ALU, special op, branch, nop, kernarg load, arg segment, return or
//    end-of-kernel: scalar/vector ALU selection (only the assignment of
//    scalarAluGlobalIdx is visible).
//  - barrier: scalarAluGlobalIdx when scalar, otherwise simdId.
//  - flat (asserted non-scalar): reserves BOTH the local and the global
//    memory paths, recording the unit chosen by each in flatLmUnitId and
//    flatGmUnitId and pushing both onto execUnitIds.
//  - global memory: reserveGmResource.
//  - local memory: reserveLmResource.
//  - private segment: rejected with fatal_if when scalar, otherwise
//    reserveGmResource.
//  - anything else: panic.
// NOTE(review): several lines (braces, the vector branch of the ALU case,
// and the return statement) are missing from this listing.
862 Wavefront::reserveResources()
864 // vector of execution unit IDs to return to schedule stage
865 // this return is only used for debugging and an assertion...
866 std::vector
<int> execUnitIds
;
868 // Get current instruction
869 GPUDynInstPtr ii
= instructionBuffer
.front();
872 // Single precision ALU or Branch or Return or Special instruction
873 if (ii
->isALU() || ii
->isSpecialOp() ||
874 ii
->isBranch() || ii
->isNop() ||
875 (ii
->isKernArgSeg() && ii
->isLoad()) || ii
->isArgSeg() ||
876 ii
->isReturn() || ii
->isEndOfKernel()) {
877 if (!ii
->isScalar()) {
880 execUnitId
= scalarAluGlobalIdx
;
882 // this is to enforce a fixed number of cycles per issue slot per SIMD
883 } else if (ii
->isBarrier()) {
884 execUnitId
= ii
->isScalar() ? scalarAluGlobalIdx
: simdId
;
885 } else if (ii
->isFlat()) {
886 assert(!ii
->isScalar());
887 reserveLmResource(ii
);
888 // add execUnitId, reserved by reserveLmResource, to the list before it is
889 // overwritten by reserveGmResource
890 execUnitIds
.push_back(execUnitId
);
891 flatLmUnitId
= execUnitId
;
892 reserveGmResource(ii
);
893 flatGmUnitId
= execUnitId
;
894 execUnitIds
.push_back(flatGmUnitId
);
896 } else if (ii
->isGlobalMem()) {
897 reserveGmResource(ii
);
898 } else if (ii
->isLocalMem()) {
899 reserveLmResource(ii
);
900 } else if (ii
->isPrivateSeg()) {
901 fatal_if(ii
->isScalar(),
902 "Scalar instructions can not access Private memory!!!");
903 reserveGmResource(ii
);
905 panic("reserveResources -> Couldn't process op!\n");
// Every path except the flat one adds its single execution unit here.
908 if (execUnitId
!= -1) {
909 execUnitIds
.push_back(execUnitId
);
911 assert(execUnitIds
.size());
918 // ---- Exit if wavefront is inactive ----------------------------- //
920 if (status
== S_STOPPED
|| status
== S_RETURNING
||
921 status
==S_STALLED
|| instructionBuffer
.empty()) {
925 if (status
== S_WAITCNT
) {
927 * if this wave is in S_WAITCNT state, then
928 * it should enter exec() precisely one time
929 * before the waitcnts are satisfied, in order
930 * to execute the waitcnt instruction itself
931 * thus we assert that the waitcnt is the
932 * oldest instruction. if we enter exec() with
933 * active waitcnts, and we're not executing
934 * the waitcnt instruction, something must be
937 assert(isOldestInstWaitcnt());
940 // Get current instruction
942 GPUDynInstPtr ii
= instructionBuffer
.front();
944 const Addr old_pc
= pc();
945 DPRINTF(GPUExec
, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
946 "(pc: %#x; seqNum: %d)\n", computeUnit
->cu_id
, simdId
, wfSlotId
,
947 wfDynId
, ii
->disassemble(), old_pc
, ii
->seqNum());
950 // delete the dynamic instruction from the pipeline map
951 computeUnit
->deleteFromPipeMap(this);
952 // update the instruction stats in the CU
953 computeUnit
->updateInstStats(ii
);
955 // inform VRF of instruction execution to schedule write-back
956 // and scoreboard ready for registers
957 if (!ii
->isScalar()) {
958 computeUnit
->vrf
[simdId
]->waveExecuteInst(this, ii
);
960 computeUnit
->srf
[simdId
]->waveExecuteInst(this, ii
);
962 computeUnit
->shader
->vectorInstSrcOperand
[ii
->numSrcVecOperands()]++;
963 computeUnit
->shader
->vectorInstDstOperand
[ii
->numDstVecOperands()]++;
964 computeUnit
->numInstrExecuted
++;
966 computeUnit
->instExecPerSimd
[simdId
]++;
967 computeUnit
->execRateDist
.sample(computeUnit
->totalCycles
.value() -
968 computeUnit
->lastExecCycle
[simdId
]);
969 computeUnit
->lastExecCycle
[simdId
] = computeUnit
->totalCycles
.value();
972 computeUnit
->instInterleave
[simdId
].
973 sample(computeUnit
->instExecPerSimd
[simdId
] - lastInstExec
);
975 lastInstExec
= computeUnit
->instExecPerSimd
[simdId
];
978 // number of reads that occur per value written
980 // vector RAW dependency tracking
981 for (int i
= 0; i
< ii
->getNumOperands(); i
++) {
982 if (ii
->isVectorRegister(i
)) {
983 int vgpr
= ii
->getRegisterIndex(i
, ii
);
984 int nReg
= ii
->getOperandSize(i
) <= 4 ? 1 :
985 ii
->getOperandSize(i
) / 4;
986 for (int n
= 0; n
< nReg
; n
++) {
987 if (ii
->isSrcOperand(i
)) {
988 // This check should never fail, but to be safe we check
989 if (rawDist
.find(vgpr
+n
) != rawDist
.end()) {
991 sample(numInstrExecuted
.value() - rawDist
[vgpr
+n
]);
993 // increment number of reads to this register
995 } else if (ii
->isDstOperand(i
)) {
996 // rawDist is set on writes, but will not be set
997 // for the first write to each physical register
998 if (rawDist
.find(vgpr
+n
) != rawDist
.end()) {
999 // sample the number of reads that were performed
1000 readsPerWrite
.sample(vecReads
[vgpr
+n
]);
1002 // on a write, reset count of reads to 0
1003 vecReads
[vgpr
+n
] = 0;
1005 rawDist
[vgpr
+n
] = numInstrExecuted
.value();
1011 if (pc() == old_pc
) {
1012 // PC not modified by instruction, proceed to next
1013 _gpuISA
.advancePC(ii
);
1014 instructionBuffer
.pop_front();
1016 DPRINTF(GPUExec
, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
1017 computeUnit
->cu_id
, simdId
, wfSlotId
, wfDynId
,
1021 DPRINTF(GPUExec
, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
1022 computeUnit
->cu_id
, simdId
, wfSlotId
, wfDynId
, pc());
1024 if (computeUnit
->shader
->hsail_mode
==Shader::SIMT
) {
1025 const int num_active_lanes
= execMask().count();
1026 computeUnit
->controlFlowDivergenceDist
.sample(num_active_lanes
);
1027 computeUnit
->numVecOpsExecuted
+= num_active_lanes
;
1029 if (ii
->isF16() && ii
->isALU()) {
1030 if (ii
->isF32() || ii
->isF64()) {
1031 fatal("Instruction is tagged as both (1) F16, and (2)"
1032 "either F32 or F64.");
1034 computeUnit
->numVecOpsExecutedF16
+= num_active_lanes
;
1036 computeUnit
->numVecOpsExecutedFMA16
+= num_active_lanes
;
1037 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1039 else if (ii
->isMAC()) {
1040 computeUnit
->numVecOpsExecutedMAC16
+= num_active_lanes
;
1041 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1043 else if (ii
->isMAD()) {
1044 computeUnit
->numVecOpsExecutedMAD16
+= num_active_lanes
;
1045 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1048 if (ii
->isF32() && ii
->isALU()) {
1049 if (ii
->isF16() || ii
->isF64()) {
1050 fatal("Instruction is tagged as both (1) F32, and (2)"
1051 "either F16 or F64.");
1053 computeUnit
->numVecOpsExecutedF32
+= num_active_lanes
;
1055 computeUnit
->numVecOpsExecutedFMA32
+= num_active_lanes
;
1056 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1058 else if (ii
->isMAC()) {
1059 computeUnit
->numVecOpsExecutedMAC32
+= num_active_lanes
;
1060 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1062 else if (ii
->isMAD()) {
1063 computeUnit
->numVecOpsExecutedMAD32
+= num_active_lanes
;
1064 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1067 if (ii
->isF64() && ii
->isALU()) {
1068 if (ii
->isF16() || ii
->isF32()) {
1069 fatal("Instruction is tagged as both (1) F64, and (2)"
1070 "either F16 or F32.");
1072 computeUnit
->numVecOpsExecutedF64
+= num_active_lanes
;
1074 computeUnit
->numVecOpsExecutedFMA64
+= num_active_lanes
;
1075 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1077 else if (ii
->isMAC()) {
1078 computeUnit
->numVecOpsExecutedMAC64
+= num_active_lanes
;
1079 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1081 else if (ii
->isMAD()) {
1082 computeUnit
->numVecOpsExecutedMAD64
+= num_active_lanes
;
1083 computeUnit
->numVecOpsExecutedTwoOpFP
+= num_active_lanes
;
1086 if (isGmInstruction(ii
)) {
1087 computeUnit
->activeLanesPerGMemInstrDist
.sample(num_active_lanes
);
1088 } else if (isLmInstruction(ii
)) {
1089 computeUnit
->activeLanesPerLMemInstrDist
.sample(num_active_lanes
);
1094 * we return here to avoid spurious errors related to flat insts
1095 * and their address segment resolution.
1097 if (execMask().none() && ii
->isFlat()) {
1098 computeUnit
->getTokenManager()->recvTokens(1);
1102 // Update Vector ALU pipeline and other resources
1103 bool flat_as_gm
= false;
1104 bool flat_as_lm
= false;
1106 flat_as_gm
= (ii
->executedAs() == Enums::SC_GLOBAL
) ||
1107 (ii
->executedAs() == Enums::SC_PRIVATE
);
1108 flat_as_lm
= (ii
->executedAs() == Enums::SC_GROUP
);
1111 // Single precision ALU or Branch or Return or Special instruction
1112 // Note, we use the same timing regardless of SP or DP ALU operation.
1113 if (ii
->isALU() || ii
->isSpecialOp() ||
1114 ii
->isBranch() || ii
->isNop() ||
1115 (ii
->isKernArgSeg() && ii
->isLoad()) ||
1116 ii
->isArgSeg() || ii
->isEndOfKernel() || ii
->isReturn()) {
1117 // this is to enforce a fixed number of cycles per issue slot per SIMD
1118 if (!ii
->isScalar()) {
1119 computeUnit
->vectorALUs
[simdId
].set(computeUnit
->
1120 cyclesToTicks(computeUnit
->issuePeriod
));
1122 computeUnit
->scalarALUs
[scalarAlu
].set(computeUnit
->
1123 cyclesToTicks(computeUnit
->issuePeriod
));
1125 // Barrier on Scalar ALU
1126 } else if (ii
->isBarrier()) {
1127 computeUnit
->scalarALUs
[scalarAlu
].set(computeUnit
->
1128 cyclesToTicks(computeUnit
->issuePeriod
));
1129 // GM or Flat as GM Load
1130 } else if (ii
->isLoad() && (ii
->isGlobalMem() || flat_as_gm
)) {
1131 if (!ii
->isScalar()) {
1132 computeUnit
->vrfToGlobalMemPipeBus
.set(
1133 computeUnit
->cyclesToTicks(computeUnit
->vrf_gm_bus_latency
));
1134 computeUnit
->vectorGlobalMemUnit
.
1135 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1136 computeUnit
->instCyclesVMemPerSimd
[simdId
] +=
1137 computeUnit
->vrf_gm_bus_latency
;
1139 computeUnit
->srfToScalarMemPipeBus
.set(computeUnit
->
1140 cyclesToTicks(computeUnit
->srf_scm_bus_latency
));
1141 computeUnit
->scalarMemUnit
.
1142 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1143 computeUnit
->instCyclesScMemPerSimd
[simdId
] +=
1144 computeUnit
->srf_scm_bus_latency
;
1146 // GM or Flat as GM Store
1147 } else if (ii
->isStore() && (ii
->isGlobalMem() || flat_as_gm
)) {
1148 if (!ii
->isScalar()) {
1149 computeUnit
->vrfToGlobalMemPipeBus
.set(computeUnit
->
1150 cyclesToTicks(Cycles(2 * computeUnit
->vrf_gm_bus_latency
)));
1151 computeUnit
->vectorGlobalMemUnit
.
1152 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1153 computeUnit
->instCyclesVMemPerSimd
[simdId
] +=
1154 (2 * computeUnit
->vrf_gm_bus_latency
);
1156 computeUnit
->srfToScalarMemPipeBus
.set(computeUnit
->
1157 cyclesToTicks(Cycles(2 * computeUnit
->srf_scm_bus_latency
)));
1158 computeUnit
->scalarMemUnit
.
1159 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1160 computeUnit
->instCyclesScMemPerSimd
[simdId
] +=
1161 (2 * computeUnit
->srf_scm_bus_latency
);
1163 } else if ((ii
->isAtomic() || ii
->isMemSync()) &&
1164 (ii
->isGlobalMem() || flat_as_gm
)) {
1165 if (!ii
->isScalar()) {
1166 computeUnit
->vrfToGlobalMemPipeBus
.set(computeUnit
->
1167 cyclesToTicks(Cycles(2 * computeUnit
->vrf_gm_bus_latency
)));
1168 computeUnit
->vectorGlobalMemUnit
.
1169 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1170 computeUnit
->instCyclesVMemPerSimd
[simdId
] +=
1171 (2 * computeUnit
->vrf_gm_bus_latency
);
1173 computeUnit
->srfToScalarMemPipeBus
.set(computeUnit
->
1174 cyclesToTicks(Cycles(2 * computeUnit
->srf_scm_bus_latency
)));
1175 computeUnit
->scalarMemUnit
.
1176 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1177 computeUnit
->instCyclesScMemPerSimd
[simdId
] +=
1178 (2 * computeUnit
->srf_scm_bus_latency
);
1180 // LM or Flat as LM Load
1181 } else if (ii
->isLoad() && (ii
->isLocalMem() || flat_as_lm
)) {
1182 computeUnit
->vrfToLocalMemPipeBus
.set(computeUnit
->
1183 cyclesToTicks(computeUnit
->vrf_lm_bus_latency
));
1184 computeUnit
->vectorSharedMemUnit
.
1185 set(computeUnit
->shader
->cyclesToTicks(computeUnit
->issuePeriod
));
1186 computeUnit
->instCyclesLdsPerSimd
[simdId
] +=
1187 computeUnit
->vrf_lm_bus_latency
;
1188 // LM or Flat as LM Store
1189 } else if (ii
->isStore() && (ii
->isLocalMem() || flat_as_lm
)) {
1190 computeUnit
->vrfToLocalMemPipeBus
.set(computeUnit
->
1191 cyclesToTicks(Cycles(2 * computeUnit
->vrf_lm_bus_latency
)));
1192 computeUnit
->vectorSharedMemUnit
.
1193 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1194 computeUnit
->instCyclesLdsPerSimd
[simdId
] +=
1195 (2 * computeUnit
->vrf_lm_bus_latency
);
1196 // LM or Flat as LM, Atomic or MemFence
1197 } else if ((ii
->isAtomic() || ii
->isMemSync()) &&
1198 (ii
->isLocalMem() || flat_as_lm
)) {
1199 computeUnit
->vrfToLocalMemPipeBus
.set(computeUnit
->
1200 cyclesToTicks(Cycles(2 * computeUnit
->vrf_lm_bus_latency
)));
1201 computeUnit
->vectorSharedMemUnit
.
1202 set(computeUnit
->cyclesToTicks(computeUnit
->issuePeriod
));
1203 computeUnit
->instCyclesLdsPerSimd
[simdId
] +=
1204 (2 * computeUnit
->vrf_lm_bus_latency
);
1206 panic("Bad instruction type!\n");
1211 Wavefront::nextInstr()
// Starts from the head of instructionBuffer and, when that head's
// seqNum() is already a key in computeUnit->pipeMap, moves on to the
// second buffered entry (provided the buffer holds more than one).
1213 // Read next instruction from instruction buffer
1214 GPUDynInstPtr ii
= instructionBuffer
.front();
1215 // if the WF has been dispatched in the schedule stage then
1216 // check the next oldest instruction for readiness
// (pipeMap is probed with the head instruction's seqNum())
1217 if (computeUnit
->pipeMap
.find(ii
->seqNum()) !=
1218 computeUnit
->pipeMap
.end()) {
// a second entry only exists when more than one instruction is buffered
1219 if (instructionBuffer
.size() > 1) {
// step past the head to the entry right behind it
1220 auto it
= instructionBuffer
.begin() + 1;
1222 } else { // No new instructions to check
1230 Wavefront::discardFetch()
1232 instructionBuffer
.clear();
1233 dropFetch
|= pendingFetch
;
1236 * clear the fetch buffer for this wave in order to
1237 * remove any stale inst data
1239 computeUnit
->fetchStage
.fetchUnit(simdId
).flushBuf(wfSlotId
);
1243 Wavefront::waitCntsSatisfied()
// Compares every wait counter that is in use (i.e. not -1) with the
// matching *InstsIssued count of this wavefront.
1245 // vmWaitCnt, expWaitCnt && lgkmWaitCnt all uninitialized (-1) means
1246 // waitCnt instruction has been dispatched but not executed yet: next
1247 // instruction should be blocked until waitCnt is executed.
1248 if (vmWaitCnt
== -1 && expWaitCnt
== -1 && lgkmWaitCnt
== -1) {
1253 * If we reach here, that means an s_waitcnt instruction was executed
1254 * and the waitcnts are set by the execute method. Check if waitcnts
1257 if (vmWaitCnt
!= -1) {
// unsatisfied while vmemInstsIssued still exceeds vmWaitCnt
1258 if (vmemInstsIssued
> vmWaitCnt
) {
1259 // vmWaitCnt not satisfied
1264 if (expWaitCnt
!= -1) {
// unsatisfied while expInstsIssued still exceeds expWaitCnt
1265 if (expInstsIssued
> expWaitCnt
) {
1266 // expWaitCnt not satisfied
1271 if (lgkmWaitCnt
!= -1) {
// unsatisfied while lgkmInstsIssued still exceeds lgkmWaitCnt
1272 if (lgkmInstsIssued
> lgkmWaitCnt
) {
1273 // lgkmWaitCnt not satisfied
1278 // if we get here all outstanding waitcnts must
1279 // be satisfied, so we resume normal operation
1286 Wavefront::setWaitCnts(int vm_wait_cnt
, int exp_wait_cnt
, int lgkm_wait_cnt
)
1288 // the scoreboard should have set the status
1289 // to S_WAITCNT once a waitcnt instruction
1290 // was marked as ready
1291 assert(status
== S_WAITCNT
);
1293 // waitcnt instruction shouldn't be sending
1295 assert(vm_wait_cnt
>= 0);
1296 assert(exp_wait_cnt
>= 0);
1297 assert(lgkm_wait_cnt
>= 0);
1298 // waitcnts are a max of 15 because we have
1299 // only 1 nibble (4 bits) to set the counts
1300 assert(vm_wait_cnt
<= 0xf);
1301 assert(exp_wait_cnt
<= 0x7);
1302 assert(lgkm_wait_cnt
<= 0x1f);
1305 * prior waitcnts should be satisfied,
1306 * at which time the WF resets them
1307 * back to -1, indicating they are no
1310 assert(vmWaitCnt
== -1);
1311 assert(expWaitCnt
== -1);
1312 assert(lgkmWaitCnt
== -1);
1315 * if the instruction encoding
1316 * indicates a waitcnt of 0xf,
1317 * that means the waitcnt is
1320 if (vm_wait_cnt
!= 0xf)
1321 vmWaitCnt
= vm_wait_cnt
;
1323 if (exp_wait_cnt
!= 0x7)
1324 expWaitCnt
= exp_wait_cnt
;
1326 if (lgkm_wait_cnt
!= 0x1f)
1327 lgkmWaitCnt
= lgkm_wait_cnt
;
1331 Wavefront::clearWaitCnts()
1333 // reset the waitcnts back to
1334 // -1, indicating they are no
1340 // resume running normally
1345 Wavefront::incVMemInstsIssued()
1351 Wavefront::incExpInstsIssued()
1357 Wavefront::incLGKMInstsIssued()
1363 Wavefront::decVMemInstsIssued()
1369 Wavefront::decExpInstsIssued()
1375 Wavefront::decLGKMInstsIssued()
1381 Wavefront::pc() const
1387 Wavefront::pc(Addr new_pc
)
1393 Wavefront::execMask()
1399 Wavefront::execMask(int lane
) const
1401 return _execMask
[lane
];
1405 Wavefront::freeRegisterFile()
1407 /* clear busy registers */
1408 for (int i
=0; i
< maxVgprs
; i
++) {
1409 int vgprIdx
= computeUnit
->registerManager
->mapVgpr(this, i
);
1410 computeUnit
->vrf
[simdId
]->markReg(vgprIdx
, false);
1413 /* Free registers used by this wavefront */
1414 uint32_t endIndex
= (startVgprIndex
+ reservedVectorRegs
- 1) %
1415 computeUnit
->vrf
[simdId
]->numRegs();
1416 computeUnit
->registerManager
->vrfPoolMgrs
[simdId
]->
1417 freeRegion(startVgprIndex
, endIndex
);
1421 Wavefront::computeActualWgSz(HSAQueueEntry
*task
)
1423 actualWgSzTotal
= 1;
1424 for (int d
= 0; d
< HSAQueueEntry::MAX_DIM
; ++d
) {
1425 actualWgSz
[d
] = std::min(workGroupSz
[d
], gridSz
[d
]
1426 - task
->wgId(d
) * workGroupSz
[d
]);
1427 actualWgSzTotal
*= actualWgSz
[d
];
1432 Wavefront::barrierId(int bar_id
)
// bar_id is accepted when it lies in
// [WFBarrier::InvalidID, computeUnit->numBarrierSlots()); note that
// InvalidID itself passes the lower-bound check.
1434 assert(bar_id
>= WFBarrier::InvalidID
);
1435 assert(bar_id
< computeUnit
->numBarrierSlots());
1440 Wavefront::barrierId() const
1446 Wavefront::hasBarrier() const
1448 return barId
> WFBarrier::InvalidID
;
1452 Wavefront::releaseBarrier()
1454 barId
= WFBarrier::InvalidID
;