misc: Delete the now unnecessary create methods.
[gem5.git] / src / gpu-compute / wavefront.cc
/*
 * Copyright (c) 2011-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/wavefront.hh"

#include "base/bitfield.hh"
#include "debug/GPUExec.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/simple_pool_manager.hh"
#include "gpu-compute/vector_register_file.hh"

Wavefront::Wavefront(const Params &p)
    : SimObject(p), wfSlotId(p.wf_slot_id), simdId(p.simdId),
      maxIbSize(p.max_ib_size), _gpuISA(*this),
      vmWaitCnt(-1), expWaitCnt(-1), lgkmWaitCnt(-1),
      vmemInstsIssued(0), expInstsIssued(0), lgkmInstsIssued(0),
      barId(WFBarrier::InvalidID)
{
    lastTrace = 0;
    execUnitId = -1;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;
    outstandingReqs = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;
    scalarRdGmReqsInPipe = 0;
    scalarWrGmReqsInPipe = 0;
    scalarOutstandingReqsRdGm = 0;
    scalarOutstandingReqsWrGm = 0;
    lastNonIdleTick = 0;
    ldsChunk = nullptr;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p.wf_size);

    pendingFetch = false;
    dropFetch = false;
    maxVgprs = 0;
    maxSgprs = 0;

    lastAddr.resize(p.wf_size);
    workItemFlatId.resize(p.wf_size);
    oldDgpr.resize(p.wf_size);
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p.wf_size);
    }

    _execMask.set();
    rawDist.clear();
    lastInstExec = 0;
    vecReads.clear();
}

void
Wavefront::regStats()
{
    SimObject::regStats();

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    numInstrExecuted
        .name(name() + ".num_instr_executed")
        .desc("number of instructions executed by this WF slot")
        ;

    schCycles
        .name(name() + ".sch_cycles")
        .desc("number of cycles spent in schedule stage")
        ;

    schStalls
        .name(name() + ".sch_stalls")
        .desc("number of cycles WF is stalled in SCH stage")
        ;

    schRfAccessStalls
        .name(name() + ".sch_rf_access_stalls")
        .desc("number of cycles wave selected in SCH but RF denied adding "
              "instruction")
        ;

    schResourceStalls
        .name(name() + ".sch_resource_stalls")
        .desc("number of cycles stalled in SCH due to unavailable resources")
        ;

    schOpdNrdyStalls
        .name(name() + ".sch_opd_nrdy_stalls")
        .desc("number of cycles stalled in SCH waiting for RF reads to "
              "complete")
        ;

    schLdsArbStalls
        .name(name() + ".sch_lds_arb_stalls")
        .desc("number of cycles wave stalled due to LDS-VRF arbitration")
        ;

    vecRawDistance
        .init(0, 20, 1)
        .name(name() + ".vec_raw_distance")
        .desc("Count of RAW distance in dynamic instructions for this WF")
        ;

    readsPerWrite
        .init(0, 4, 1)
        .name(name() + ".vec_reads_per_write")
        .desc("Count of vector reads per write for this WF")
        ;
}

void
Wavefront::init()
{
    reservedVectorRegs = 0;
    reservedScalarRegs = 0;
    startVgprIndex = 0;
    startSgprIndex = 0;

    scalarAlu = computeUnit->mapWaveToScalarAlu(this);
    scalarAluGlobalIdx = computeUnit->mapWaveToScalarAluGlobalIdx(this);
    globalMem = computeUnit->mapWaveToGlobalMem(this);
    localMem = computeUnit->mapWaveToLocalMem(this);
    scalarMem = computeUnit->mapWaveToScalarMem(this);
}

void
Wavefront::initRegState(HSAQueueEntry *task, int wgSizeInWorkItems)
{
    int regInitIdx = 0;

    // iterate over all the init fields and check which
    // bits are enabled
    for (int en_bit = 0; en_bit < NumScalarInitFields; ++en_bit) {

        if (task->sgprBitEnabled(en_bit)) {
            int physSgprIdx = 0;
            uint32_t wiCount = 0;
            uint32_t firstWave = 0;
            int orderedAppendTerm = 0;
            int numWfsInWg = 0;
            uint32_t finalValue = 0;
            Addr host_disp_pkt_addr = task->hostDispPktAddr();
            Addr kernarg_addr = task->kernargAddr();
            Addr hidden_priv_base(0);

            switch (en_bit) {
              case PrivateSegBuf:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[0]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[1]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[2]);

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting PrivateSegBuffer: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_resource_descriptor[3]);
                break;
              case DispatchPtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting DispatchPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(host_disp_pkt_addr, 63, 32));

                ++regInitIdx;
                break;
              case QueuePtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting QueuePtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(task->hostAMDQueueAddr, 63, 32));

                ++regInitIdx;
                break;
              case KernargSegPtr:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 31, 0));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 31, 0));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                        bits(kernarg_addr, 63, 32));
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting KernargSegPtr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        bits(kernarg_addr, 63, 32));

                ++regInitIdx;
                break;
              case FlatScratchInit:
                physSgprIdx
                    = computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                    (TheGpuISA::ScalarRegU32)(task->amdQueue
                        .scratch_backing_memory_location & 0xffffffff));
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch Addr: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        (TheGpuISA::ScalarRegU32)(task->amdQueue
                            .scratch_backing_memory_location & 0xffffffff));

                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                // This value must be DWORD (4-byte) aligned.
                computeUnit->srf[simdId]->write(physSgprIdx,
                    task->amdQueue.scratch_workitem_byte_size);
                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting FlatScratch size: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        task->amdQueue.scratch_workitem_byte_size);
                /**
                 * Since flat scratch init is needed for this kernel, this
                 * kernel is going to have flat memory instructions and we
                 * need to initialize the hidden private base for this queue.
                 * scratch_resource_descriptor[0] has this queue's scratch
                 * base address. scratch_backing_memory_location has the
                 * offset to this queue's scratch base address from the
                 * SH_HIDDEN_PRIVATE_BASE_VMID. Ideally, we only require this
                 * queue's scratch base address for address calculation
                 * (stored in scratch_resource_descriptor[0]). But that
                 * address calculation should be done by first finding the
                 * queue's scratch base address using the calculation
                 * "SH_HIDDEN_PRIVATE_BASE_VMID + offset". So, we initialize
                 * SH_HIDDEN_PRIVATE_BASE_VMID.
                 *
                 * For more details see:
                 * http://rocm-documentation.readthedocs.io/en/latest/
                 * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
                 *
                 * https://github.com/ROCm-Developer-Tools/
                 * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
                 * #flat-addressing
                 */
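                // Assemble the 48-bit scratch base address: bits [31:0]
                // come from word 0 of the scratch resource descriptor,
                // and bits [47:32] from the low 16 bits of word 1.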
                hidden_priv_base =
                    (uint64_t)task->amdQueue.scratch_resource_descriptor[0] |
                    (((uint64_t)task->amdQueue.scratch_resource_descriptor[1]
                    & 0x000000000000ffff) << 32);
                computeUnit->shader->initShHiddenPrivateBase(
                        hidden_priv_base,
                        task->amdQueue.scratch_backing_memory_location);
                break;
              case GridWorkgroupCountX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
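                // the SGPR receives the workgroup count in this dimension,
                // i.e. ceil(gridSize / wgSize); the expression below
                // performs the ceiling division in integer arithmetic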
                wiCount = ((task->gridSize(0) +
                            task->wgSize(0) - 1) /
                           task->wgSize(0));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(1) +
                            task->wgSize(1) - 1) /
                           task->wgSize(1));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case GridWorkgroupCountZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                wiCount = ((task->gridSize(2) +
                            task->wgSize(2) - 1) /
                           task->wgSize(2));
                computeUnit->srf[simdId]->write(physSgprIdx, wiCount);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting num WG Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, wiCount);
                break;
              case WorkgroupIdX:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[0]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID X: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[0]);
                break;
              case WorkgroupIdY:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[1]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Y: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[1]);
                break;
              case WorkgroupIdZ:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->write(physSgprIdx,
                                                workGroupId[2]);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG ID Z: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, workGroupId[2]);
                break;
              case PrivSegWaveByteOffset:
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                /**
                 * the compute_tmpring_size_wavesize specifies the number of
                 * kB allocated per wavefront, hence the multiplication by
                 * 1024.
                 *
                 * to get the per wavefront offset into the scratch
                 * memory, we also multiply this by the wfId. the wfId stored
                 * in the Wavefront class, however, is the wave ID within the
                 * WG, whereas here we need the global WFID because the
                 * scratch space will be divided amongst all waves in the
                 * kernel. to get the global ID we multiply the WGID by
                 * the number of waves in a WG (wgSz / 64), then add the
                 * WFID of the wave within its WG.
                 */
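                // for example (assuming 64-lane waves), a 256-work-item WG
                // has wgSz / 64 = 4 waves; the wave with wfId 2 in WG 1 has
                // global wave ID 1 * 4 + 2 = 6, so its byte offset is
                // 6 * 1024 * compute_tmpring_size_wavesize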
                computeUnit->srf[simdId]->write(physSgprIdx, 1024 *
                    (wgId * (wgSz / 64) + wfId) *
                    task->amdQueue.compute_tmpring_size_wavesize);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting Private Seg Offset: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx,
                        1024 * (wgId * (wgSz / 64) + wfId) *
                        task->amdQueue.compute_tmpring_size_wavesize);
                break;
              case WorkgroupInfo:
                firstWave = (wfId == 0) ? 1 : 0;
                numWfsInWg = divCeil(wgSizeInWorkItems,
                                     computeUnit->wfSize());
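                // pack the workgroup info word: bit 31 flags the first
                // wave of the WG, the ordered append term starts at bit 6,
                // and the low bits hold the number of WFs in the WG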
                finalValue = firstWave << ((sizeof(uint32_t) * 8) - 1);
                finalValue |= (orderedAppendTerm << 6);
                finalValue |= numWfsInWg;
                physSgprIdx =
                    computeUnit->registerManager->mapSgpr(this, regInitIdx);
                computeUnit->srf[simdId]->
                    write(physSgprIdx, finalValue);

                ++regInitIdx;
                DPRINTF(GPUInitAbi, "CU%d: WF[%d][%d]: wave[%d] "
                        "Setting WG Info: s[%d] = %x\n",
                        computeUnit->cu_id, simdId,
                        wfSlotId, wfDynId, physSgprIdx, finalValue);
                break;
              default:
                fatal("SGPR enable bit %i not supported\n", en_bit);
                break;
            }
        }
    }

    regInitIdx = 0;

    // iterate over all the init fields and check which
    // bits are enabled
    for (int en_bit = 0; en_bit < NumVectorInitFields; ++en_bit) {
        if (task->vgprBitEnabled(en_bit)) {
            uint32_t physVgprIdx = 0;
            TheGpuISA::VecRegContainerU32 raw_vgpr;

            switch (en_bit) {
              case WorkitemIdX:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecRegU32 vgpr_x
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[0].size(); ++lane) {
                        vgpr_x[lane] = workItemId[0][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdY:
                {
                    physVgprIdx = computeUnit->registerManager
                        ->mapVgpr(this, regInitIdx);
                    TheGpuISA::VecRegU32 vgpr_y
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[1].size(); ++lane) {
                        vgpr_y[lane] = workItemId[1][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
              case WorkitemIdZ:
                {
                    physVgprIdx = computeUnit->registerManager->
                        mapVgpr(this, regInitIdx);
                    TheGpuISA::VecRegU32 vgpr_z
                        = raw_vgpr.as<TheGpuISA::VecElemU32>();

                    for (int lane = 0; lane < workItemId[2].size(); ++lane) {
                        vgpr_z[lane] = workItemId[2][lane];
                    }

                    computeUnit->vrf[simdId]->write(physVgprIdx, raw_vgpr);
                    rawDist[regInitIdx] = 0;
                    ++regInitIdx;
                }
                break;
            }
        }
    }
}

void
Wavefront::resizeRegFiles(int num_vregs, int num_sregs)
{
    maxVgprs = num_vregs;
    maxSgprs = num_sregs;
}

Wavefront::~Wavefront()
{
}

void
Wavefront::setStatus(status_e newStatus)
{
    if (computeUnit->idleCUTimeout > 0) {
        // Wavefront's status transitions to stalled or stopped
        if ((newStatus == S_STOPPED || newStatus == S_STALLED ||
             newStatus == S_WAITCNT || newStatus == S_BARRIER) &&
            (status != newStatus)) {
            computeUnit->idleWfs++;
            assert(computeUnit->idleWfs <=
                   (computeUnit->shader->n_wf * computeUnit->numVectorALUs));
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                lastNonIdleTick = curTick();
            }
            // Wavefront's status transitions to an active state (from
            // a stopped or stalled state)
        } else if ((status == S_STOPPED || status == S_STALLED ||
                    status == S_WAITCNT || status == S_BARRIER) &&
                   (status != newStatus)) {
            // if all WFs in the CU were idle then check if the idleness
            // period exceeded the timeout threshold
            if (computeUnit->idleWfs ==
                (computeUnit->shader->n_wf * computeUnit->numVectorALUs)) {
                panic_if((curTick() - lastNonIdleTick) >=
                         computeUnit->idleCUTimeout,
                         "CU%d has been idle for %d ticks at tick %d",
                         computeUnit->cu_id, computeUnit->idleCUTimeout,
                         curTick());
            }
            computeUnit->idleWfs--;
            assert(computeUnit->idleWfs >= 0);
        }
    }
    status = newStatus;
}

void
Wavefront::start(uint64_t _wf_dyn_id, Addr init_pc)
{
    wfDynId = _wf_dyn_id;
    _pc = init_pc;

    status = S_RUNNING;

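    // one read counter per architected VGPR; consumed by the
    // readsPerWrite statistic in exec()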
    vecReads.resize(maxVgprs, 0);
}

bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    if (ii->isGlobalMem() ||
        (ii->isFlat() && ii->executedAs() == Enums::SC_GLOBAL)) {
        return true;
    }

    return false;
}

bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    if (ii->isLocalMem() ||
        (ii->isFlat() && ii->executedAs() == Enums::SC_GROUP)) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstWaitcnt()
{
    if (instructionBuffer.empty())
        return false;

    GPUDynInstPtr ii = instructionBuffer.front();

    if (ii->isWaitcnt()) {
        // waitcnt is a scalar
        assert(ii->isScalar());
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstScalarALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isScalar() && (ii->isNop() || ii->isReturn()
        || ii->isEndOfKernel() || ii->isBranch() || ii->isALU() ||
        (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstVectorALU()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && !ii->isScalar() && (ii->isNop() ||
        ii->isReturn() || ii->isBranch() || ii->isALU() || ii->isEndOfKernel()
        || (ii->isKernArgSeg() && ii->isLoad()))) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstBarrier()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isBarrier()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstGMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && !ii->isScalar() && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstScalarMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isScalar() && ii->isGlobalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstLMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isLocalMem()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstPrivMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isPrivateSeg()) {
        return true;
    }

    return false;
}

bool
Wavefront::isOldestInstFlatMem()
{
    assert(!instructionBuffer.empty());
    GPUDynInstPtr ii = instructionBuffer.front();

    if (status != S_STOPPED && ii->isFlat()) {
        return true;
    }

    return false;
}

bool
Wavefront::stopFetch()
{
    for (auto it : instructionBuffer) {
        GPUDynInstPtr ii = it;
        if (ii->isReturn() || ii->isBranch() ||
            ii->isEndOfKernel()) {
            return true;
        }
    }

    return false;
}

void
Wavefront::freeResources()
{
    execUnitId = -1;
}

void
Wavefront::validateRequestCounters()
{
    panic_if(wrGmReqsInPipe < 0 || rdGmReqsInPipe < 0 ||
             wrLmReqsInPipe < 0 || rdLmReqsInPipe < 0 ||
             outstandingReqs < 0,
             "Negative requests in pipe for WF%d for slot%d"
             " and SIMD%d: Rd GlobalMem Reqs=%d, Wr GlobalMem Reqs=%d,"
             " Rd LocalMem Reqs=%d, Wr LocalMem Reqs=%d,"
             " Outstanding Reqs=%d\n",
             wfDynId, wfSlotId, simdId, rdGmReqsInPipe, wrGmReqsInPipe,
             rdLmReqsInPipe, wrLmReqsInPipe, outstandingReqs);
}

void
Wavefront::reserveGmResource(GPUDynInstPtr ii)
{
    if (!ii->isScalar()) {
        if (ii->isLoad()) {
            rdGmReqsInPipe++;
        } else if (ii->isStore()) {
            wrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            rdGmReqsInPipe++;
            wrGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = globalMem;
    } else {
        if (ii->isLoad()) {
            scalarRdGmReqsInPipe++;
        } else if (ii->isStore()) {
            scalarWrGmReqsInPipe++;
        } else if (ii->isAtomic() || ii->isMemSync()) {
            scalarWrGmReqsInPipe++;
            scalarRdGmReqsInPipe++;
        } else {
            panic("Invalid memory operation!\n");
        }
        execUnitId = scalarMem;
    }
}

void
Wavefront::reserveLmResource(GPUDynInstPtr ii)
{
    fatal_if(ii->isScalar(),
             "Scalar instructions cannot access shared (LDS) memory!");
    if (ii->isLoad()) {
        rdLmReqsInPipe++;
    } else if (ii->isStore()) {
        wrLmReqsInPipe++;
    } else if (ii->isAtomic() || ii->isMemSync()) {
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
    } else {
        panic("Invalid memory operation!\n");
    }
    execUnitId = localMem;
}

std::vector<int>
Wavefront::reserveResources()
{
    // vector of execution unit IDs to return to the schedule stage;
    // this return value is only used for debugging and an assertion
    std::vector<int> execUnitIds;

    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);

    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn() || ii->isEndOfKernel()) {
        if (!ii->isScalar()) {
            execUnitId = simdId;
        } else {
            execUnitId = scalarAluGlobalIdx;
        }
        // this is to enforce a fixed number of cycles per issue slot per SIMD
    } else if (ii->isBarrier()) {
        execUnitId = ii->isScalar() ? scalarAluGlobalIdx : simdId;
    } else if (ii->isFlat()) {
        assert(!ii->isScalar());
        reserveLmResource(ii);
        // add the execUnitId reserved by reserveLmResource to the list
        // before it is overwritten by reserveGmResource
        execUnitIds.push_back(execUnitId);
        flatLmUnitId = execUnitId;
        reserveGmResource(ii);
        flatGmUnitId = execUnitId;
        execUnitIds.push_back(flatGmUnitId);
        execUnitId = -1;
    } else if (ii->isGlobalMem()) {
        reserveGmResource(ii);
    } else if (ii->isLocalMem()) {
        reserveLmResource(ii);
    } else if (ii->isPrivateSeg()) {
        fatal_if(ii->isScalar(),
                 "Scalar instructions cannot access private memory!");
        reserveGmResource(ii);
    } else {
        panic("reserveResources -> Couldn't process op!\n");
    }

    if (execUnitId != -1) {
        execUnitIds.push_back(execUnitId);
    }
    assert(execUnitIds.size());
    return execUnitIds;
}

void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        status == S_STALLED || instructionBuffer.empty()) {
        return;
    }

    if (status == S_WAITCNT) {
        /**
         * if this wave is in S_WAITCNT state, then
         * it should enter exec() precisely one time
         * before the waitcnts are satisfied, in order
         * to execute the waitcnt instruction itself
         * thus we assert that the waitcnt is the
         * oldest instruction. if we enter exec() with
         * active waitcnts, and we're not executing
         * the waitcnt instruction, something must be
         * wrong
         */
        assert(isOldestInstWaitcnt());
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    const Addr old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %#x; seqNum: %d)\n", computeUnit->cu_id, simdId, wfSlotId,
            wfDynId, ii->disassemble(), old_pc, ii->seqNum());

    ii->execute(ii);
    // delete the dynamic instruction from the pipeline map
    computeUnit->deleteFromPipeMap(this);
    // update the instruction stats in the CU
    computeUnit->updateInstStats(ii);

    // inform VRF of instruction execution to schedule write-back
    // and scoreboard ready for registers
    if (!ii->isScalar()) {
        computeUnit->vrf[simdId]->waveExecuteInst(this, ii);
    }
    computeUnit->srf[simdId]->waveExecuteInst(this, ii);

    computeUnit->shader->vectorInstSrcOperand[ii->numSrcVecOperands()]++;
    computeUnit->shader->vectorInstDstOperand[ii->numDstVecOperands()]++;
    computeUnit->numInstrExecuted++;
    numInstrExecuted++;
    computeUnit->instExecPerSimd[simdId]++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();

    if (lastInstExec) {
        computeUnit->instInterleave[simdId].
            sample(computeUnit->instExecPerSimd[simdId] - lastInstExec);
    }
    lastInstExec = computeUnit->instExecPerSimd[simdId];

    // want to track:
    // number of reads that occur per value written

    // vector RAW dependency tracking
    for (int i = 0; i < ii->getNumOperands(); i++) {
        if (ii->isVectorRegister(i)) {
            int vgpr = ii->getRegisterIndex(i, ii);
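            // operands wider than 4 bytes occupy multiple consecutive
            // VGPRs, one register per DWORD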
            int nReg = ii->getOperandSize(i) <= 4 ? 1 :
                ii->getOperandSize(i) / 4;
            for (int n = 0; n < nReg; n++) {
                if (ii->isSrcOperand(i)) {
                    // This check should never fail, but to be safe we check
                    if (rawDist.find(vgpr+n) != rawDist.end()) {
                        vecRawDistance.
                            sample(numInstrExecuted.value() - rawDist[vgpr+n]);
                    }
                    // increment number of reads to this register
                    vecReads[vgpr+n]++;
                } else if (ii->isDstOperand(i)) {
                    // rawDist is set on writes, but will not be set
                    // for the first write to each physical register
                    if (rawDist.find(vgpr+n) != rawDist.end()) {
                        // sample the number of reads that were performed
                        readsPerWrite.sample(vecReads[vgpr+n]);
                    }
                    // on a write, reset count of reads to 0
                    vecReads[vgpr+n] = 0;

                    rawDist[vgpr+n] = numInstrExecuted.value();
                }
            }
        }
    }

    if (pc() == old_pc) {
        // PC not modified by instruction, proceed to next
        _gpuISA.advancePC(ii);
        instructionBuffer.pop_front();
    } else {
        DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave%d %s taken branch\n",
                computeUnit->cu_id, simdId, wfSlotId, wfDynId,
                ii->disassemble());
        discardFetch();
    }
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] (pc: %#x)\n",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId, pc());

    if (computeUnit->shader->hsail_mode == Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;

        if (ii->isF16() && ii->isALU()) {
            if (ii->isF32() || ii->isF64()) {
                fatal("Instruction is tagged as both (1) F16, and (2) "
                      "either F32 or F64.");
            }
            computeUnit->numVecOpsExecutedF16 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->numVecOpsExecutedFMA16 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAC()) {
                computeUnit->numVecOpsExecutedMAC16 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAD()) {
                computeUnit->numVecOpsExecutedMAD16 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            }
        }
        if (ii->isF32() && ii->isALU()) {
            if (ii->isF16() || ii->isF64()) {
                fatal("Instruction is tagged as both (1) F32, and (2) "
                      "either F16 or F64.");
            }
            computeUnit->numVecOpsExecutedF32 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->numVecOpsExecutedFMA32 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAC()) {
                computeUnit->numVecOpsExecutedMAC32 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAD()) {
                computeUnit->numVecOpsExecutedMAD32 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            }
        }
        if (ii->isF64() && ii->isALU()) {
            if (ii->isF16() || ii->isF32()) {
                fatal("Instruction is tagged as both (1) F64, and (2) "
                      "either F16 or F32.");
            }
            computeUnit->numVecOpsExecutedF64 += num_active_lanes;
            if (ii->isFMA()) {
                computeUnit->numVecOpsExecutedFMA64 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAC()) {
                computeUnit->numVecOpsExecutedMAC64 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            } else if (ii->isMAD()) {
                computeUnit->numVecOpsExecutedMAD64 += num_active_lanes;
                computeUnit->numVecOpsExecutedTwoOpFP += num_active_lanes;
            }
        }
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    /**
     * we return here to avoid spurious errors related to flat insts
     * and their address segment resolution.
     */
    if (execMask().none() && ii->isFlat()) {
        computeUnit->getTokenManager()->recvTokens(1);
        return;
    }

    // Update Vector ALU pipeline and other resources
    bool flat_as_gm = false;
    bool flat_as_lm = false;
    if (ii->isFlat()) {
        flat_as_gm = (ii->executedAs() == Enums::SC_GLOBAL) ||
                     (ii->executedAs() == Enums::SC_PRIVATE);
        flat_as_lm = (ii->executedAs() == Enums::SC_GROUP);
    }

    // Single precision ALU or Branch or Return or Special instruction
    // Note: we use the same timing regardless of SP or DP ALU operation.
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() || ii->isNop() ||
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() || ii->isEndOfKernel() || ii->isReturn()) {
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        if (!ii->isScalar()) {
            computeUnit->vectorALUs[simdId].set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
        } else {
            computeUnit->scalarALUs[scalarAlu].set(computeUnit->
                cyclesToTicks(computeUnit->issuePeriod));
        }
        // Barrier on Scalar ALU
    } else if (ii->isBarrier()) {
        computeUnit->scalarALUs[scalarAlu].set(computeUnit->
            cyclesToTicks(computeUnit->issuePeriod));
        // GM or Flat as GM Load
    } else if (ii->isLoad() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(
                computeUnit->cyclesToTicks(computeUnit->vrf_gm_bus_latency));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesVMemPerSimd[simdId] +=
                computeUnit->vrf_gm_bus_latency;
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(computeUnit->srf_scm_bus_latency));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesScMemPerSimd[simdId] +=
                computeUnit->srf_scm_bus_latency;
        }
        // GM or Flat as GM Store
    } else if (ii->isStore() && (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
        // GM or Flat as GM, Atomic or MemFence
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isGlobalMem() || flat_as_gm)) {
        if (!ii->isScalar()) {
            computeUnit->vrfToGlobalMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->vrf_gm_bus_latency)));
            computeUnit->vectorGlobalMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesVMemPerSimd[simdId] +=
                (2 * computeUnit->vrf_gm_bus_latency);
        } else {
            computeUnit->srfToScalarMemPipeBus.set(computeUnit->
                cyclesToTicks(Cycles(2 * computeUnit->srf_scm_bus_latency)));
            computeUnit->scalarMemUnit.
                set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
            computeUnit->instCyclesScMemPerSimd[simdId] +=
                (2 * computeUnit->srf_scm_bus_latency);
        }
        // LM or Flat as LM Load
    } else if (ii->isLoad() && (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(computeUnit->vrf_lm_bus_latency));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->shader->cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->instCyclesLdsPerSimd[simdId] +=
            computeUnit->vrf_lm_bus_latency;
        // LM or Flat as LM Store
    } else if (ii->isStore() && (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
        // LM or Flat as LM, Atomic or MemFence
    } else if ((ii->isAtomic() || ii->isMemSync()) &&
               (ii->isLocalMem() || flat_as_lm)) {
        computeUnit->vrfToLocalMemPipeBus.set(computeUnit->
            cyclesToTicks(Cycles(2 * computeUnit->vrf_lm_bus_latency)));
        computeUnit->vectorSharedMemUnit.
            set(computeUnit->cyclesToTicks(computeUnit->issuePeriod));
        computeUnit->instCyclesLdsPerSimd[simdId] +=
            (2 * computeUnit->vrf_lm_bus_latency);
    } else {
        panic("Bad instruction type!\n");
    }
}

GPUDynInstPtr
Wavefront::nextInstr()
{
    // Read next instruction from instruction buffer
    GPUDynInstPtr ii = instructionBuffer.front();
    // if the WF has been dispatched in the schedule stage then
    // check the next oldest instruction for readiness
    if (computeUnit->pipeMap.find(ii->seqNum()) !=
        computeUnit->pipeMap.end()) {
        if (instructionBuffer.size() > 1) {
            auto it = instructionBuffer.begin() + 1;
            return *it;
        } else { // No new instructions to check
            return nullptr;
        }
    }
    return ii;
}

void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch |= pendingFetch;
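    // if a fetch request is already in flight, flag it so its returned
    // instructions are dropped rather than buffered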

    /**
     * clear the fetch buffer for this wave in order to
     * remove any stale inst data
     */
    computeUnit->fetchStage.fetchUnit(simdId).flushBuf(wfSlotId);
}

bool
Wavefront::waitCntsSatisfied()
{
    // All counts uninitialized (-1) means the waitcnt instruction has
    // been dispatched but not executed yet: the next instruction should
    // be blocked until the waitcnt executes and sets the counts.
    if (vmWaitCnt == -1 && expWaitCnt == -1 && lgkmWaitCnt == -1) {
        return false;
    }

    /**
     * If we reach here, that means an s_waitcnt instruction was executed
     * and the waitcnts are set by the execute method. Check if waitcnts
     * are satisfied.
     */
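    // e.g., s_waitcnt vmcnt(0) sets vmWaitCnt to 0, so the wave stays
    // blocked until every issued vector memory instruction has completed
    // and vmemInstsIssued has drained back to 0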
    if (vmWaitCnt != -1) {
        if (vmemInstsIssued > vmWaitCnt) {
            // vmWaitCnt not satisfied
            return false;
        }
    }

    if (expWaitCnt != -1) {
        if (expInstsIssued > expWaitCnt) {
            // expWaitCnt not satisfied
            return false;
        }
    }

    if (lgkmWaitCnt != -1) {
        if (lgkmInstsIssued > lgkmWaitCnt) {
            // lgkmWaitCnt not satisfied
            return false;
        }
    }

    // if we get here all outstanding waitcnts must
    // be satisfied, so we resume normal operation
    clearWaitCnts();

    return true;
}

void
Wavefront::setWaitCnts(int vm_wait_cnt, int exp_wait_cnt, int lgkm_wait_cnt)
{
    // the scoreboard should have set the status
    // to S_WAITCNT once a waitcnt instruction
    // was marked as ready
    assert(status == S_WAITCNT);

    // waitcnt instruction shouldn't be sending
    // negative counts
    assert(vm_wait_cnt >= 0);
    assert(exp_wait_cnt >= 0);
    assert(lgkm_wait_cnt >= 0);
    // the encoded count fields are narrow: vm_cnt has 4 bits, exp_cnt
    // 3 bits, and lgkm_cnt 5 bits, hence the upper bounds below
    assert(vm_wait_cnt <= 0xf);
    assert(exp_wait_cnt <= 0x7);
    assert(lgkm_wait_cnt <= 0x1f);

    /**
     * prior waitcnts should be satisfied,
     * at which time the WF resets them
     * back to -1, indicating they are no
     * longer active
     */
    assert(vmWaitCnt == -1);
    assert(expWaitCnt == -1);
    assert(lgkmWaitCnt == -1);

    /**
     * a count equal to its field's maximum encoding (0xf, 0x7, or
     * 0x1f, respectively) means that counter is not used by this
     * waitcnt and is left inactive
     */
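    // e.g., an s_waitcnt vmcnt(0) lgkmcnt(0) that leaves the exp count
    // at its maximum encoding would (assuming the decoder passes the raw
    // fields through) arrive here as setWaitCnts(0, 0x7, 0): expWaitCnt
    // stays inactive while vmWaitCnt and lgkmWaitCnt are armed with 0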
    if (vm_wait_cnt != 0xf)
        vmWaitCnt = vm_wait_cnt;

    if (exp_wait_cnt != 0x7)
        expWaitCnt = exp_wait_cnt;

    if (lgkm_wait_cnt != 0x1f)
        lgkmWaitCnt = lgkm_wait_cnt;
}

void
Wavefront::clearWaitCnts()
{
    // reset the waitcnts back to
    // -1, indicating they are no
    // longer valid
    vmWaitCnt = -1;
    expWaitCnt = -1;
    lgkmWaitCnt = -1;

    // resume running normally
    status = S_RUNNING;
}

void
Wavefront::incVMemInstsIssued()
{
    ++vmemInstsIssued;
}

void
Wavefront::incExpInstsIssued()
{
    ++expInstsIssued;
}

void
Wavefront::incLGKMInstsIssued()
{
    ++lgkmInstsIssued;
}

void
Wavefront::decVMemInstsIssued()
{
    --vmemInstsIssued;
}

void
Wavefront::decExpInstsIssued()
{
    --expInstsIssued;
}

void
Wavefront::decLGKMInstsIssued()
{
    --lgkmInstsIssued;
}

Addr
Wavefront::pc() const
{
    return _pc;
}

void
Wavefront::pc(Addr new_pc)
{
    _pc = new_pc;
}

VectorMask&
Wavefront::execMask()
{
    return _execMask;
}

bool
Wavefront::execMask(int lane) const
{
    return _execMask[lane];
}

void
Wavefront::freeRegisterFile()
{
    /* clear busy registers */
    for (int i = 0; i < maxVgprs; i++) {
        int vgprIdx = computeUnit->registerManager->mapVgpr(this, i);
        computeUnit->vrf[simdId]->markReg(vgprIdx, false);
    }

    /* Free registers used by this wavefront */
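    // the VRF pool is allocated circularly, so the reserved region may
    // wrap around the end of the register file, hence the modulo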
    uint32_t endIndex = (startVgprIndex + reservedVectorRegs - 1) %
        computeUnit->vrf[simdId]->numRegs();
    computeUnit->registerManager->vrfPoolMgrs[simdId]->
        freeRegion(startVgprIndex, endIndex);
}

void
Wavefront::computeActualWgSz(HSAQueueEntry *task)
{
    actualWgSzTotal = 1;
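    // workgroups on the grid boundary may be partial in any dimension,
    // so clamp each dimension to whatever remains of the grid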
    for (int d = 0; d < HSAQueueEntry::MAX_DIM; ++d) {
        actualWgSz[d] = std::min(workGroupSz[d], gridSz[d]
                                 - task->wgId(d) * workGroupSz[d]);
        actualWgSzTotal *= actualWgSz[d];
    }
}

void
Wavefront::barrierId(int bar_id)
{
    assert(bar_id >= WFBarrier::InvalidID);
    assert(bar_id < computeUnit->numBarrierSlots());
    barId = bar_id;
}

int
Wavefront::barrierId() const
{
    return barId;
}

bool
Wavefront::hasBarrier() const
{
    return barId > WFBarrier::InvalidID;
}

void
Wavefront::releaseBarrier()
{
    barId = WFBarrier::InvalidID;
}