// src/gpu-compute/gpu_dyn_inst.cc
/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_dyn_inst.hh"

#include "debug/GPUMem.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/scalar_register_file.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"

GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
                       GPUStaticInst *static_inst, InstSeqNum instSeqNum)
    : GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
      (Addr)0), numScalarReqs(0), isSaveRestore(false),
      _staticInst(static_inst), _seqNum(instSeqNum)
{
    statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
    tlbHitLevel.assign(computeUnit()->wfSize(), -1);
    // vector instructions can have up to 4 source/destination operands
    d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
    a_data = new uint8_t[computeUnit()->wfSize() * 8];
    x_data = new uint8_t[computeUnit()->wfSize() * 8];
    // scalar loads can read up to 16 Dwords of data (see publicly
    // available GCN3 ISA manual)
    scalar_data = new uint8_t[16 * sizeof(uint32_t)];
    for (int i = 0; i < (16 * sizeof(uint32_t)); ++i) {
        scalar_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
        a_data[i] = 0;
        x_data[i] = 0;
    }
    for (int i = 0; i < (computeUnit()->wfSize() * 4 * sizeof(double)); ++i) {
        d_data[i] = 0;
    }
    time = 0;

    cu_id = _cu->cu_id;
    if (_wf) {
        simdId = _wf->simdId;
        wfDynId = _wf->wfDynId;
        kern_id = _wf->kernId;
        wg_id = _wf->wgId;
        wfSlotId = _wf->wfSlotId;
    } else {
        simdId = -1;
        wfDynId = -1;
        kern_id = -1;
        wg_id = -1;
        wfSlotId = -1;
    }
}

GPUDynInst::~GPUDynInst()
{
    delete[] d_data;
    delete[] a_data;
    delete[] x_data;
    delete[] scalar_data;
    delete _staticInst;
}

void
GPUDynInst::execute(GPUDynInstPtr gpuDynInst)
{
    _staticInst->execute(gpuDynInst);
}

int
GPUDynInst::numSrcRegOperands()
{
    return _staticInst->numSrcRegOperands();
}

int
GPUDynInst::numDstRegOperands()
{
    return _staticInst->numDstRegOperands();
}

int
GPUDynInst::numSrcVecOperands()
{
    return _staticInst->numSrcVecOperands();
}

int
GPUDynInst::numDstVecOperands()
{
    return _staticInst->numDstVecOperands();
}

int
GPUDynInst::numSrcVecDWORDs()
{
    return _staticInst->numSrcVecDWORDs();
}

int
GPUDynInst::numDstVecDWORDs()
{
    return _staticInst->numDstVecDWORDs();
}

int
GPUDynInst::numOpdDWORDs(int operandIdx)
{
    return _staticInst->numOpdDWORDs(operandIdx);
}

int
GPUDynInst::getNumOperands()
{
    return _staticInst->getNumOperands();
}

bool
GPUDynInst::isVectorRegister(int operandIdx)
{
    return _staticInst->isVectorRegister(operandIdx);
}

bool
GPUDynInst::isScalarRegister(int operandIdx)
{
    return _staticInst->isScalarRegister(operandIdx);
}

int
GPUDynInst::getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst)
{
    return _staticInst->getRegisterIndex(operandIdx, gpuDynInst);
}

int
GPUDynInst::getOperandSize(int operandIdx)
{
    return _staticInst->getOperandSize(operandIdx);
}

bool
GPUDynInst::isDstOperand(int operandIdx)
{
    return _staticInst->isDstOperand(operandIdx);
}

bool
GPUDynInst::isSrcOperand(int operandIdx)
{
    return _staticInst->isSrcOperand(operandIdx);
}

bool
GPUDynInst::hasSourceSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasSourceVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isSrcOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::hasDestinationSgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::srcIsVgpr(int index) const
{
    assert(index >= 0 && index < _staticInst->getNumOperands());
    if (_staticInst->isVectorRegister(index) &&
        _staticInst->isSrcOperand(index)) {
        return true;
    }
    return false;
}

bool
GPUDynInst::hasDestinationVgpr() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isVectorRegister(i) && _staticInst->isDstOperand(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr,
                     const std::string& extStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos &&
           _staticInst->opcode().find(extStr) != std::string::npos;
}

bool
GPUDynInst::isOpcode(const std::string& opcodeStr) const
{
    return _staticInst->opcode().find(opcodeStr) != std::string::npos;
}

const std::string&
GPUDynInst::disassemble() const
{
    return _staticInst->disassemble();
}

InstSeqNum
GPUDynInst::seqNum() const
{
    return _seqNum;
}

Enums::StorageClassType
GPUDynInst::executedAs()
{
    return _staticInst->executed_as;
}

bool
GPUDynInst::hasVgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isVectorRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isVectorRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

bool
GPUDynInst::hasSgprRawDependence(GPUDynInstPtr s)
{
    assert(s);
    for (int i = 0; i < getNumOperands(); ++i) {
        if (isScalarRegister(i) && isSrcOperand(i)) {
            for (int j = 0; j < s->getNumOperands(); ++j) {
                if (s->isScalarRegister(j) && s->isDstOperand(j)) {
                    if (i == j)
                        return true;
                }
            }
        }
    }
    return false;
}

// Process a memory instruction and (if necessary) submit timing request
void
GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->initiateAcc(gpuDynInst);
}

void
GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
{
    DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x"
            " complete\n",
            cu->cu_id, simdId, wfSlotId, exec_mask);

    _staticInst->completeAcc(gpuDynInst);
}

/**
 * accessor methods for the attributes of
 * the underlying GPU static instruction
 */
bool
GPUDynInst::isALU() const
{
    return _staticInst->isALU();
}

bool
GPUDynInst::isBranch() const
{
    return _staticInst->isBranch();
}

bool
GPUDynInst::isCondBranch() const
{
    return _staticInst->isCondBranch();
}

bool
GPUDynInst::isNop() const
{
    return _staticInst->isNop();
}

bool
GPUDynInst::isEndOfKernel() const
{
    return _staticInst->isEndOfKernel();
}

bool
GPUDynInst::isKernelLaunch() const
{
    return _staticInst->isKernelLaunch();
}

bool
GPUDynInst::isSDWAInst() const
{
    return _staticInst->isSDWAInst();
}

bool
GPUDynInst::isDPPInst() const
{
    return _staticInst->isDPPInst();
}

bool
GPUDynInst::isReturn() const
{
    return _staticInst->isReturn();
}

bool
GPUDynInst::isUnconditionalJump() const
{
    return _staticInst->isUnconditionalJump();
}

bool
GPUDynInst::isSpecialOp() const
{
    return _staticInst->isSpecialOp();
}

bool
GPUDynInst::isWaitcnt() const
{
    return _staticInst->isWaitcnt();
}

bool
GPUDynInst::isBarrier() const
{
    return _staticInst->isBarrier();
}

bool
GPUDynInst::isMemSync() const
{
    return _staticInst->isMemSync();
}

bool
GPUDynInst::isMemRef() const
{
    return _staticInst->isMemRef();
}

bool
GPUDynInst::isFlat() const
{
    return _staticInst->isFlat();
}

bool
GPUDynInst::isLoad() const
{
    return _staticInst->isLoad();
}

bool
GPUDynInst::isStore() const
{
    return _staticInst->isStore();
}

bool
GPUDynInst::isAtomic() const
{
    return _staticInst->isAtomic();
}

bool
GPUDynInst::isAtomicNoRet() const
{
    return _staticInst->isAtomicNoRet();
}

bool
GPUDynInst::isAtomicRet() const
{
    return _staticInst->isAtomicRet();
}

bool
GPUDynInst::isVector() const
{
    return !_staticInst->isScalar();
}

bool
GPUDynInst::isScalar() const
{
    return _staticInst->isScalar();
}

bool
GPUDynInst::readsSCC() const
{
    return _staticInst->readsSCC();
}

bool
GPUDynInst::writesSCC() const
{
    return _staticInst->writesSCC();
}

bool
GPUDynInst::readsVCC() const
{
    return _staticInst->readsVCC();
}

bool
GPUDynInst::writesVCC() const
{
    return _staticInst->writesVCC();
}

bool
GPUDynInst::readsMode() const
{
    return _staticInst->readsMode();
}

bool
GPUDynInst::writesMode() const
{
    return _staticInst->writesMode();
}

bool
GPUDynInst::readsEXEC() const
{
    return _staticInst->readsEXEC();
}

bool
GPUDynInst::writesEXEC() const
{
    return _staticInst->writesEXEC();
}

bool
GPUDynInst::ignoreExec() const
{
    return _staticInst->ignoreExec();
}

bool
GPUDynInst::writesExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isDstOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::readsExecMask() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isSrcOperand(i) &&
            _staticInst->isExecMaskRegister(i)) {
            return true;
        }
    }
    return false;
}

bool
GPUDynInst::writesFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isDstOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::readsFlatScratch() const
{
    for (int i = 0; i < _staticInst->getNumOperands(); ++i) {
        if (_staticInst->isScalarRegister(i) && _staticInst->isSrcOperand(i)) {
            return _staticInst->isFlatScratchRegister(i);
        }
    }
    return false;
}

bool
GPUDynInst::isAtomicAnd() const
{
    return _staticInst->isAtomicAnd();
}

bool
GPUDynInst::isAtomicOr() const
{
    return _staticInst->isAtomicOr();
}

bool
GPUDynInst::isAtomicXor() const
{
    return _staticInst->isAtomicXor();
}

bool
GPUDynInst::isAtomicCAS() const
{
    return _staticInst->isAtomicCAS();
}

bool
GPUDynInst::isAtomicExch() const
{
    return _staticInst->isAtomicExch();
}

bool
GPUDynInst::isAtomicAdd() const
{
    return _staticInst->isAtomicAdd();
}

bool
GPUDynInst::isAtomicSub() const
{
    return _staticInst->isAtomicSub();
}

bool
GPUDynInst::isAtomicInc() const
{
    return _staticInst->isAtomicInc();
}

bool
GPUDynInst::isAtomicDec() const
{
    return _staticInst->isAtomicDec();
}

bool
GPUDynInst::isAtomicMax() const
{
    return _staticInst->isAtomicMax();
}

bool
GPUDynInst::isAtomicMin() const
{
    return _staticInst->isAtomicMin();
}

bool
GPUDynInst::isArgLoad() const
{
    return _staticInst->isArgLoad();
}

bool
GPUDynInst::isGlobalMem() const
{
    return _staticInst->isGlobalMem();
}

bool
GPUDynInst::isLocalMem() const
{
    return _staticInst->isLocalMem();
}

bool
GPUDynInst::isArgSeg() const
{
    return _staticInst->isArgSeg();
}

bool
GPUDynInst::isGlobalSeg() const
{
    return _staticInst->isGlobalSeg();
}

bool
GPUDynInst::isGroupSeg() const
{
    return _staticInst->isGroupSeg();
}

bool
GPUDynInst::isKernArgSeg() const
{
    return _staticInst->isKernArgSeg();
}

bool
GPUDynInst::isPrivateSeg() const
{
    return _staticInst->isPrivateSeg();
}

bool
GPUDynInst::isReadOnlySeg() const
{
    return _staticInst->isReadOnlySeg();
}

bool
GPUDynInst::isSpillSeg() const
{
    return _staticInst->isSpillSeg();
}

bool
GPUDynInst::isGloballyCoherent() const
{
    return _staticInst->isGloballyCoherent();
}

bool
GPUDynInst::isSystemCoherent() const
{
    return _staticInst->isSystemCoherent();
}

bool
GPUDynInst::isF16() const
{
    return _staticInst->isF16();
}

bool
GPUDynInst::isF32() const
{
    return _staticInst->isF32();
}

bool
GPUDynInst::isF64() const
{
    return _staticInst->isF64();
}

bool
GPUDynInst::isFMA() const
{
    return _staticInst->isFMA();
}

bool
GPUDynInst::isMAC() const
{
    return _staticInst->isMAC();
}

bool
GPUDynInst::isMAD() const
{
    return _staticInst->isMAD();
}

void
GPUDynInst::doApertureCheck(const VectorMask &mask)
{
    assert(mask.any());
    // find the segment of the first active address; after that,
    // we check that all other active addresses also
    // fall within the same APE
    for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
        if (mask[lane]) {
            if (computeUnit()->shader->isLdsApe(addr[lane])) {
                // group segment
                staticInstruction()->executed_as = Enums::SC_GROUP;
                break;
            } else if (computeUnit()->shader->isScratchApe(addr[lane])) {
                // private segment
                staticInstruction()->executed_as = Enums::SC_PRIVATE;
                break;
            } else if (computeUnit()->shader->isGpuVmApe(addr[lane])) {
                // we won't support GPUVM
                fatal("flat access is in GPUVM APE\n");
            } else if (bits(addr[lane], 63, 47) != 0x1FFFF &&
                       bits(addr[lane], 63, 47)) {
                // bits 63:47 are neither all 0s nor all 1s, so the address
                // is non-canonical, i.e., it falls in the "hole"; this is
                // a memory violation
                fatal("flat access at addr %#x has a memory violation\n",
                      addr[lane]);
            } else {
                // global memory segment
                staticInstruction()->executed_as = Enums::SC_GLOBAL;
                break;
            }
        }
    }

    // we should have found the segment
    assert(executedAs() != Enums::SC_NONE);

    // flat accesses should not straddle multiple APEs so we
    // must check that all addresses fall within the same APE
    if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was LDS,
                // all the rest should be
                assert(computeUnit()->shader->isLdsApe(addr[lane]));
            }
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was private,
                // all the rest should be
                assert(computeUnit()->shader->isScratchApe(addr[lane]));
            }
        }
    } else {
        for (int lane = 0; lane < computeUnit()->wfSize(); ++lane) {
            if (mask[lane]) {
                // if the first valid addr we found above was global,
                // all the rest should be. Because we don't have an
                // explicit range for the global segment, we just make
                // sure that the address falls in no other APE and that
                // it is not a memory violation
                assert(!computeUnit()->shader->isLdsApe(addr[lane]));
                assert(!computeUnit()->shader->isScratchApe(addr[lane]));
                assert(!computeUnit()->shader->isGpuVmApe(addr[lane]));
                assert(!(bits(addr[lane], 63, 47) != 0x1FFFF
                        && bits(addr[lane], 63, 47)));
            }
        }
    }
}

void
GPUDynInst::resolveFlatSegment(const VectorMask &mask)
{
    doApertureCheck(mask);


    // Now that we know the aperture, do the following:
    // 1. Transform the flat address to its segmented equivalent.
    // 2. Set the execUnitId based on the aperture check.
    // 3. Decrement any extra resources that were reserved. Other
    //    resources are released as normal, below.
    if (executedAs() == Enums::SC_GLOBAL) {
        // no transformation for the global segment
        wavefront()->execUnitId = wavefront()->flatGmUnitId;
        if (isLoad()) {
            wavefront()->rdLmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrLmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->wrLmReqsInPipe--;
            wavefront()->rdLmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_GROUP) {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                // flat address calculation goes here.
                // addr[lane] = segmented address
                panic("Flat group memory operation is unimplemented!\n");
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        wavefront()->decVMemInstsIssued();
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else if (executedAs() == Enums::SC_PRIVATE) {
        /**
         * Flat instructions may resolve to the private segment (scratch),
         * which is backed by main memory and provides per-lane scratch
         * memory. Flat addressing uses apertures - registers that specify
         * the address range in the VA space where LDS/private memory is
         * mapped; their values are set by the kernel-mode driver. These
         * apertures use addresses that are not used by x86 CPUs. When the
         * address of a Flat operation falls into one of the apertures, the
         * Flat operation is redirected to either LDS or to the private
         * memory segment.
         *
         * For private memory, the SW runtime will allocate some space in
         * the VA space for each AQL queue; the base address is stored in
         * scalar registers per the AMD GPU ABI. The amd_queue_t field
         * scratch_backing_memory_location provides the base address in
         * memory for the queue's private segment. Various other fields
         * loaded into register state during kernel launch specify per-WF
         * and per-work-item offsets so that individual lanes may access
         * their private segment allocation.
         *
         * For more details about flat addressing see:
         * http://rocm-documentation.readthedocs.io/en/latest/
         * ROCm_Compiler_SDK/ROCm-Native-ISA.html#flat-scratch
         *
         * https://github.com/ROCm-Developer-Tools/
         * ROCm-ComputeABI-Doc/blob/master/AMDGPU-ABI.md
         * #flat-addressing
         */
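        //
        // A minimal sketch of the address swizzle applied below, assuming
        // (per the SGPR reads that follow) that s[maxSgprs - 3] holds the
        // wave's scratch offset and s[maxSgprs - 4] holds the per-lane
        // scratch size:
        //
        //   addr[lane] = flatAddr + lane * size + offset
        //                + hiddenPrivateBase - scratchBase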

        uint32_t numSgprs = wavefront()->maxSgprs;
        uint32_t physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 3);
        uint32_t offset =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        physSgprIdx =
            wavefront()->computeUnit->registerManager->mapSgpr(wavefront(),
                numSgprs - 4);
        uint32_t size =
            wavefront()->computeUnit->srf[simdId]->read(physSgprIdx);
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                addr[lane] = addr[lane] + lane * size + offset +
                    wavefront()->computeUnit->shader->getHiddenPrivateBase() -
                    wavefront()->computeUnit->shader->getScratchBase();
            }
        }
        wavefront()->execUnitId = wavefront()->flatLmUnitId;
        wavefront()->decLGKMInstsIssued();
        if (isLoad()) {
            wavefront()->rdGmReqsInPipe--;
        } else if (isStore()) {
            wavefront()->wrGmReqsInPipe--;
        } else if (isAtomic() || isMemSync()) {
            wavefront()->rdGmReqsInPipe--;
            wavefront()->wrGmReqsInPipe--;
        } else {
            panic("Invalid memory operation!\n");
        }
    } else {
        for (int lane = 0; lane < wavefront()->computeUnit->wfSize(); ++lane) {
            if (mask[lane]) {
                panic("flat addr %#llx maps to bad segment %d\n",
                      addr[lane], executedAs());
            }
        }
    }
}

TheGpuISA::ScalarRegU32
GPUDynInst::srcLiteral() const
{
    return _staticInst->srcLiteral();
}

void
GPUDynInst::updateStats()
{
    if (_staticInst->isLocalMem()) {
        // access to LDS (shared) memory
        cu->stats.dynamicLMemInstrCnt++;
    } else if (_staticInst->isFlat()) {
        cu->stats.dynamicFlatMemInstrCnt++;
    } else {
        // access to global memory

        // update PageDivergence histogram
        int number_pages_touched = cu->pagesTouched.size();
        assert(number_pages_touched);
        cu->stats.pageDivergenceDist.sample(number_pages_touched);

        std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;

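        // pagesTouched is assumed to map a page address to the number of
        // lane accesses this instruction made to that page; fold those
        // counts into the CU-wide pageAccesses table below.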
        for (auto it : cu->pagesTouched) {
            // See if this page has been touched before; if not, the
            // insert also adds the page to the table.
            ret = cu->pageAccesses
                .insert(ComputeUnit::pageDataStruct::value_type(it.first,
                        std::make_pair(1, it.second)));

            // if it has, update the stats
            if (!ret.second) {
                ret.first->second.first++;
                ret.first->second.second += it.second;
            }
        }

        cu->pagesTouched.clear();

        // total number of dynamic memory instructions; atomics are
        // counted as a single memory instruction. This is counted per
        // wavefront, not per work-item.
        cu->stats.dynamicGMemInstrCnt++;
    }
}

void
GPUDynInst::profileRoundTripTime(Tick currentTime, int hopId)
{
    // Only take the first measurement in the case of coalescing
    if (roundTripTime.size() > hopId)
        return;

    roundTripTime.push_back(currentTime);
}

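// Record a timestamp per cache-line address for each hop of a memory
// request. As with profileRoundTripTime() above, only the first measurement
// for a given hop is kept when requests coalesce, and hop 0 creates the
// entry for a new line address.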
void
GPUDynInst::profileLineAddressTime(Addr addr, Tick currentTime, int hopId)
{
    if (lineAddressTime.count(addr)) {
        if (lineAddressTime[addr].size() > hopId) {
            return;
        }

        lineAddressTime[addr].push_back(currentTime);
    } else if (hopId == 0) {
        auto addressTimeVec = std::vector<Tick> { currentTime };
        lineAddressTime.insert(std::make_pair(addr, addressTimeVec));
    }
}