1 /*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36 #include "gpu-compute/gpu_tlb.hh"
37
38 #include <cmath>
39 #include <cstring>
40
41 #include "arch/x86/faults.hh"
42 #include "arch/x86/insts/microldstop.hh"
43 #include "arch/x86/pagetable.hh"
44 #include "arch/x86/pagetable_walker.hh"
45 #include "arch/x86/regs/misc.hh"
46 #include "arch/x86/x86_traits.hh"
47 #include "base/bitfield.hh"
48 #include "base/logging.hh"
49 #include "base/output.hh"
50 #include "base/trace.hh"
51 #include "cpu/base.hh"
52 #include "cpu/thread_context.hh"
53 #include "debug/GPUPrefetch.hh"
54 #include "debug/GPUTLB.hh"
55 #include "mem/packet_access.hh"
56 #include "mem/page_table.hh"
57 #include "mem/request.hh"
58 #include "sim/process.hh"
59
60 namespace X86ISA
61 {
62
63 GpuTLB::GpuTLB(const Params *p)
64 : ClockedObject(p), configAddress(0), size(p->size),
65 cleanupEvent([this]{ cleanup(); }, name(), false,
66 Event::Maximum_Pri),
67 exitEvent([this]{ exitCallback(); }, name())
68 {
69 assoc = p->assoc;
70 assert(assoc <= size);
71 numSets = size/assoc;
72 allocationPolicy = p->allocationPolicy;
73 hasMemSidePort = false;
74 accessDistance = p->accessDistance;
75 clock = p->clk_domain->clockPeriod();
76
77 tlb.assign(size, TlbEntry());
78
79 freeList.resize(numSets);
80 entryList.resize(numSets);
81
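    // freeList[set] holds the currently unused entries of each set, while
    // entryList[set] is kept in LRU order by lookupIt()/insert(): the most
    // recently used entry sits at the front and victims are taken from the
    // back.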
82 for (int set = 0; set < numSets; ++set) {
83 for (int way = 0; way < assoc; ++way) {
84 int x = set * assoc + way;
85 freeList[set].push_back(&tlb.at(x));
86 }
87 }
88
89 FA = (size == assoc);
90
91 /**
92 * @warning: the set-associative version assumes you have a
93 * fixed page size of 4KB.
 94          * If the page size is greater than 4KB (as defined by
 95          * TheISA::PageBytes), then there are various issues with the current
 96          * implementation (e.g., the same 8KB page would be replicated in
 97          * different sets).
98 */
99 setMask = numSets - 1;
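    // The set index is computed as (vaddr >> TheISA::PageShift) & setMask
    // (see lookupIt()/insert()). For example, with size = 64 and assoc = 8,
    // numSets = 8 and setMask = 0x7, so bits [14:12] of the virtual address
    // select the set for 4KB pages.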
100
101 maxCoalescedReqs = p->maxOutstandingReqs;
102
103 // Do not allow maxCoalescedReqs to be more than the TLB associativity
104 if (maxCoalescedReqs > assoc) {
105 maxCoalescedReqs = assoc;
106 cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
107 }
108
109 outstandingReqs = 0;
110 hitLatency = p->hitLatency;
111 missLatency1 = p->missLatency1;
112 missLatency2 = p->missLatency2;
113
114 // create the slave ports based on the number of connected ports
115 for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
116 cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
117 name(), i), this, i));
118 }
119
120 // create the master ports based on the number of connected ports
121 for (size_t i = 0; i < p->port_master_connection_count; ++i) {
122 memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
123 name(), i), this, i));
124 }
125 }
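// A minimal Python-side configuration sketch (assuming the parameter names
// exposed by the X86GPUTLB SimObject match the accessors used above, e.g.
// size, assoc, hitLatency, missLatency1, missLatency2, maxOutstandingReqs):
//
//     tlb = X86GPUTLB(size=64, assoc=8, hitLatency=2,
//                     missLatency1=5, missLatency2=100)
//
// Setting size == assoc makes the TLB fully associative (see FA above).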
126
127 // fixme: this is never called?
128 GpuTLB::~GpuTLB()
129 {
130 // make sure all the hash-maps are empty
131 assert(translationReturnEvent.empty());
132 }
133
134 Port &
135 GpuTLB::getPort(const std::string &if_name, PortID idx)
136 {
137 if (if_name == "slave") {
138 if (idx >= static_cast<PortID>(cpuSidePort.size())) {
139 panic("TLBCoalescer::getPort: unknown index %d\n", idx);
140 }
141
142 return *cpuSidePort[idx];
143 } else if (if_name == "master") {
144 if (idx >= static_cast<PortID>(memSidePort.size())) {
145 panic("TLBCoalescer::getPort: unknown index %d\n", idx);
146 }
147
148 hasMemSidePort = true;
149
150 return *memSidePort[idx];
151 } else {
152 panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
153 }
154 }
155
156 TlbEntry*
157 GpuTLB::insert(Addr vpn, TlbEntry &entry)
158 {
159 TlbEntry *newEntry = nullptr;
160
161 /**
162 * vpn holds the virtual page address
163 * The least significant bits are simply masked
164 */
165 int set = (vpn >> TheISA::PageShift) & setMask;
166
167 if (!freeList[set].empty()) {
168 newEntry = freeList[set].front();
169 freeList[set].pop_front();
170 } else {
171 newEntry = entryList[set].back();
172 entryList[set].pop_back();
173 }
174
175 *newEntry = entry;
176 newEntry->vaddr = vpn;
177 entryList[set].push_front(newEntry);
178
179 return newEntry;
180 }
181
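/**
 * lookupIt returns an iterator to the entry matching va in the selected
 * set (or end() if there is no match). When update_lru is set, the
 * matching entry is moved to the front of the set's entryList so that
 * insert() evicts from the back, i.e., the least recently used entry.
 */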
182 GpuTLB::EntryList::iterator
183 GpuTLB::lookupIt(Addr va, bool update_lru)
184 {
185 int set = (va >> TheISA::PageShift) & setMask;
186
187 if (FA) {
188 assert(!set);
189 }
190
191 auto entry = entryList[set].begin();
192 for (; entry != entryList[set].end(); ++entry) {
193 int page_size = (*entry)->size();
194
195 if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
196 DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
197 "with size %#x.\n", va, (*entry)->vaddr, page_size);
198
199 if (update_lru) {
200 entryList[set].push_front(*entry);
201 entryList[set].erase(entry);
202 entry = entryList[set].begin();
203 }
204
205 break;
206 }
207 }
208
209 return entry;
210 }
211
212 TlbEntry*
213 GpuTLB::lookup(Addr va, bool update_lru)
214 {
215 int set = (va >> TheISA::PageShift) & setMask;
216
217 auto entry = lookupIt(va, update_lru);
218
219 if (entry == entryList[set].end())
220 return nullptr;
221 else
222 return *entry;
223 }
224
225 void
226 GpuTLB::invalidateAll()
227 {
228 DPRINTF(GPUTLB, "Invalidating all entries.\n");
229
230 for (int i = 0; i < numSets; ++i) {
231 while (!entryList[i].empty()) {
232 TlbEntry *entry = entryList[i].front();
233 entryList[i].pop_front();
234 freeList[i].push_back(entry);
235 }
236 }
237 }
238
239 void
240 GpuTLB::setConfigAddress(uint32_t addr)
241 {
242 configAddress = addr;
243 }
244
245 void
246 GpuTLB::invalidateNonGlobal()
247 {
248 DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
249
250 for (int i = 0; i < numSets; ++i) {
251 for (auto entryIt = entryList[i].begin();
252 entryIt != entryList[i].end();) {
253 if (!(*entryIt)->global) {
254 freeList[i].push_back(*entryIt);
255 entryList[i].erase(entryIt++);
256 } else {
257 ++entryIt;
258 }
259 }
260 }
261 }
262
263 void
264 GpuTLB::demapPage(Addr va, uint64_t asn)
265 {
266
267 int set = (va >> TheISA::PageShift) & setMask;
268 auto entry = lookupIt(va, false);
269
270 if (entry != entryList[set].end()) {
271 freeList[set].push_back(*entry);
272 entryList[set].erase(entry);
273 }
274 }
275
276 Fault
277 GpuTLB::translateInt(const RequestPtr &req, ThreadContext *tc)
278 {
279         DPRINTF(GPUTLB, "Address references internal memory.\n");
280 Addr vaddr = req->getVaddr();
281 Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
282
283 if (prefix == IntAddrPrefixCPUID) {
284 panic("CPUID memory space not yet implemented!\n");
285 } else if (prefix == IntAddrPrefixMSR) {
286 vaddr = vaddr >> 3;
287 req->setFlags(Request::MMAPPED_IPR);
288 Addr regNum = 0;
289
290 switch (vaddr & ~IntAddrPrefixMask) {
291 case 0x10:
292 regNum = MISCREG_TSC;
293 break;
294 case 0x1B:
295 regNum = MISCREG_APIC_BASE;
296 break;
297 case 0xFE:
298 regNum = MISCREG_MTRRCAP;
299 break;
300 case 0x174:
301 regNum = MISCREG_SYSENTER_CS;
302 break;
303 case 0x175:
304 regNum = MISCREG_SYSENTER_ESP;
305 break;
306 case 0x176:
307 regNum = MISCREG_SYSENTER_EIP;
308 break;
309 case 0x179:
310 regNum = MISCREG_MCG_CAP;
311 break;
312 case 0x17A:
313 regNum = MISCREG_MCG_STATUS;
314 break;
315 case 0x17B:
316 regNum = MISCREG_MCG_CTL;
317 break;
318 case 0x1D9:
319 regNum = MISCREG_DEBUG_CTL_MSR;
320 break;
321 case 0x1DB:
322 regNum = MISCREG_LAST_BRANCH_FROM_IP;
323 break;
324 case 0x1DC:
325 regNum = MISCREG_LAST_BRANCH_TO_IP;
326 break;
327 case 0x1DD:
328 regNum = MISCREG_LAST_EXCEPTION_FROM_IP;
329 break;
330 case 0x1DE:
331 regNum = MISCREG_LAST_EXCEPTION_TO_IP;
332 break;
333 case 0x200:
334 regNum = MISCREG_MTRR_PHYS_BASE_0;
335 break;
336 case 0x201:
337 regNum = MISCREG_MTRR_PHYS_MASK_0;
338 break;
339 case 0x202:
340 regNum = MISCREG_MTRR_PHYS_BASE_1;
341 break;
342 case 0x203:
343 regNum = MISCREG_MTRR_PHYS_MASK_1;
344 break;
345 case 0x204:
346 regNum = MISCREG_MTRR_PHYS_BASE_2;
347 break;
348 case 0x205:
349 regNum = MISCREG_MTRR_PHYS_MASK_2;
350 break;
351 case 0x206:
352 regNum = MISCREG_MTRR_PHYS_BASE_3;
353 break;
354 case 0x207:
355 regNum = MISCREG_MTRR_PHYS_MASK_3;
356 break;
357 case 0x208:
358 regNum = MISCREG_MTRR_PHYS_BASE_4;
359 break;
360 case 0x209:
361 regNum = MISCREG_MTRR_PHYS_MASK_4;
362 break;
363 case 0x20A:
364 regNum = MISCREG_MTRR_PHYS_BASE_5;
365 break;
366 case 0x20B:
367 regNum = MISCREG_MTRR_PHYS_MASK_5;
368 break;
369 case 0x20C:
370 regNum = MISCREG_MTRR_PHYS_BASE_6;
371 break;
372 case 0x20D:
373 regNum = MISCREG_MTRR_PHYS_MASK_6;
374 break;
375 case 0x20E:
376 regNum = MISCREG_MTRR_PHYS_BASE_7;
377 break;
378 case 0x20F:
379 regNum = MISCREG_MTRR_PHYS_MASK_7;
380 break;
381 case 0x250:
382 regNum = MISCREG_MTRR_FIX_64K_00000;
383 break;
384 case 0x258:
385 regNum = MISCREG_MTRR_FIX_16K_80000;
386 break;
387 case 0x259:
388 regNum = MISCREG_MTRR_FIX_16K_A0000;
389 break;
390 case 0x268:
391 regNum = MISCREG_MTRR_FIX_4K_C0000;
392 break;
393 case 0x269:
394 regNum = MISCREG_MTRR_FIX_4K_C8000;
395 break;
396 case 0x26A:
397 regNum = MISCREG_MTRR_FIX_4K_D0000;
398 break;
399 case 0x26B:
400 regNum = MISCREG_MTRR_FIX_4K_D8000;
401 break;
402 case 0x26C:
403 regNum = MISCREG_MTRR_FIX_4K_E0000;
404 break;
405 case 0x26D:
406 regNum = MISCREG_MTRR_FIX_4K_E8000;
407 break;
408 case 0x26E:
409 regNum = MISCREG_MTRR_FIX_4K_F0000;
410 break;
411 case 0x26F:
412 regNum = MISCREG_MTRR_FIX_4K_F8000;
413 break;
414 case 0x277:
415 regNum = MISCREG_PAT;
416 break;
417 case 0x2FF:
418 regNum = MISCREG_DEF_TYPE;
419 break;
420 case 0x400:
421 regNum = MISCREG_MC0_CTL;
422 break;
423 case 0x404:
424 regNum = MISCREG_MC1_CTL;
425 break;
426 case 0x408:
427 regNum = MISCREG_MC2_CTL;
428 break;
429 case 0x40C:
430 regNum = MISCREG_MC3_CTL;
431 break;
432 case 0x410:
433 regNum = MISCREG_MC4_CTL;
434 break;
435 case 0x414:
436 regNum = MISCREG_MC5_CTL;
437 break;
438 case 0x418:
439 regNum = MISCREG_MC6_CTL;
440 break;
441 case 0x41C:
442 regNum = MISCREG_MC7_CTL;
443 break;
444 case 0x401:
445 regNum = MISCREG_MC0_STATUS;
446 break;
447 case 0x405:
448 regNum = MISCREG_MC1_STATUS;
449 break;
450 case 0x409:
451 regNum = MISCREG_MC2_STATUS;
452 break;
453 case 0x40D:
454 regNum = MISCREG_MC3_STATUS;
455 break;
456 case 0x411:
457 regNum = MISCREG_MC4_STATUS;
458 break;
459 case 0x415:
460 regNum = MISCREG_MC5_STATUS;
461 break;
462 case 0x419:
463 regNum = MISCREG_MC6_STATUS;
464 break;
465 case 0x41D:
466 regNum = MISCREG_MC7_STATUS;
467 break;
468 case 0x402:
469 regNum = MISCREG_MC0_ADDR;
470 break;
471 case 0x406:
472 regNum = MISCREG_MC1_ADDR;
473 break;
474 case 0x40A:
475 regNum = MISCREG_MC2_ADDR;
476 break;
477 case 0x40E:
478 regNum = MISCREG_MC3_ADDR;
479 break;
480 case 0x412:
481 regNum = MISCREG_MC4_ADDR;
482 break;
483 case 0x416:
484 regNum = MISCREG_MC5_ADDR;
485 break;
486 case 0x41A:
487 regNum = MISCREG_MC6_ADDR;
488 break;
489 case 0x41E:
490 regNum = MISCREG_MC7_ADDR;
491 break;
492 case 0x403:
493 regNum = MISCREG_MC0_MISC;
494 break;
495 case 0x407:
496 regNum = MISCREG_MC1_MISC;
497 break;
498 case 0x40B:
499 regNum = MISCREG_MC2_MISC;
500 break;
501 case 0x40F:
502 regNum = MISCREG_MC3_MISC;
503 break;
504 case 0x413:
505 regNum = MISCREG_MC4_MISC;
506 break;
507 case 0x417:
508 regNum = MISCREG_MC5_MISC;
509 break;
510 case 0x41B:
511 regNum = MISCREG_MC6_MISC;
512 break;
513 case 0x41F:
514 regNum = MISCREG_MC7_MISC;
515 break;
516 case 0xC0000080:
517 regNum = MISCREG_EFER;
518 break;
519 case 0xC0000081:
520 regNum = MISCREG_STAR;
521 break;
522 case 0xC0000082:
523 regNum = MISCREG_LSTAR;
524 break;
525 case 0xC0000083:
526 regNum = MISCREG_CSTAR;
527 break;
528 case 0xC0000084:
529 regNum = MISCREG_SF_MASK;
530 break;
531 case 0xC0000100:
532 regNum = MISCREG_FS_BASE;
533 break;
534 case 0xC0000101:
535 regNum = MISCREG_GS_BASE;
536 break;
537 case 0xC0000102:
538 regNum = MISCREG_KERNEL_GS_BASE;
539 break;
540 case 0xC0000103:
541 regNum = MISCREG_TSC_AUX;
542 break;
543 case 0xC0010000:
544 regNum = MISCREG_PERF_EVT_SEL0;
545 break;
546 case 0xC0010001:
547 regNum = MISCREG_PERF_EVT_SEL1;
548 break;
549 case 0xC0010002:
550 regNum = MISCREG_PERF_EVT_SEL2;
551 break;
552 case 0xC0010003:
553 regNum = MISCREG_PERF_EVT_SEL3;
554 break;
555 case 0xC0010004:
556 regNum = MISCREG_PERF_EVT_CTR0;
557 break;
558 case 0xC0010005:
559 regNum = MISCREG_PERF_EVT_CTR1;
560 break;
561 case 0xC0010006:
562 regNum = MISCREG_PERF_EVT_CTR2;
563 break;
564 case 0xC0010007:
565 regNum = MISCREG_PERF_EVT_CTR3;
566 break;
567 case 0xC0010010:
568 regNum = MISCREG_SYSCFG;
569 break;
570 case 0xC0010016:
571 regNum = MISCREG_IORR_BASE0;
572 break;
573 case 0xC0010017:
574 regNum = MISCREG_IORR_BASE1;
575 break;
576 case 0xC0010018:
577 regNum = MISCREG_IORR_MASK0;
578 break;
579 case 0xC0010019:
580 regNum = MISCREG_IORR_MASK1;
581 break;
582 case 0xC001001A:
583 regNum = MISCREG_TOP_MEM;
584 break;
585 case 0xC001001D:
586 regNum = MISCREG_TOP_MEM2;
587 break;
588 case 0xC0010114:
589 regNum = MISCREG_VM_CR;
590 break;
591 case 0xC0010115:
592 regNum = MISCREG_IGNNE;
593 break;
594 case 0xC0010116:
595 regNum = MISCREG_SMM_CTL;
596 break;
597 case 0xC0010117:
598 regNum = MISCREG_VM_HSAVE_PA;
599 break;
600 default:
601 return std::make_shared<GeneralProtection>(0);
602 }
603 //The index is multiplied by the size of a MiscReg so that
604 //any memory dependence calculations will not see these as
605 //overlapping.
606 req->setPaddr(regNum * sizeof(RegVal));
607 return NoFault;
608 } else if (prefix == IntAddrPrefixIO) {
609 // TODO If CPL > IOPL or in virtual mode, check the I/O permission
610 // bitmap in the TSS.
611
612 Addr IOPort = vaddr & ~IntAddrPrefixMask;
613 // Make sure the address fits in the expected 16 bit IO address
614 // space.
615 assert(!(IOPort & ~0xFFFF));
616
617 if (IOPort == 0xCF8 && req->getSize() == 4) {
618 req->setFlags(Request::MMAPPED_IPR);
619 req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(RegVal));
620 } else if ((IOPort & ~mask(2)) == 0xCFC) {
621 req->setFlags(Request::UNCACHEABLE);
622
623 Addr configAddress =
624 tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
625
626 if (bits(configAddress, 31, 31)) {
627 req->setPaddr(PhysAddrPrefixPciConfig |
628 mbits(configAddress, 30, 2) |
629 (IOPort & mask(2)));
630 } else {
631 req->setPaddr(PhysAddrPrefixIO | IOPort);
632 }
633 } else {
634 req->setFlags(Request::UNCACHEABLE);
635 req->setPaddr(PhysAddrPrefixIO | IOPort);
636 }
637 return NoFault;
638 } else {
639 panic("Access to unrecognized internal address space %#x.\n",
640 prefix);
641 }
642 }
643
644 /**
645      * tlbLookup will only perform a TLB lookup, returning true on a TLB hit
646 * and false on a TLB miss.
647 * Many of the checks about different modes have been converted to
648 * assertions, since these parts of the code are not really used.
649 * On a hit it will update the LRU stack.
650 */
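// tlbLookup is used both by the timing path (issueTLBLookup) and by the
// functional path (CpuSidePort::recvFunctional); when update_stats is false
// (e.g., functional accesses used for memory seeding) only the statistics
// updates are skipped.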
651 bool
652 GpuTLB::tlbLookup(const RequestPtr &req,
653 ThreadContext *tc, bool update_stats)
654 {
655 bool tlb_hit = false;
656 #ifndef NDEBUG
657 uint32_t flags = req->getFlags();
658 int seg = flags & SegmentFlagMask;
659 #endif
660
661 assert(seg != SEGMENT_REG_MS);
662 Addr vaddr = req->getVaddr();
663 DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
664 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
665
666 if (m5Reg.prot) {
667 DPRINTF(GPUTLB, "In protected mode.\n");
668 // make sure we are in 64-bit mode
669 assert(m5Reg.mode == LongMode);
670
671 // If paging is enabled, do the translation.
672 if (m5Reg.paging) {
673 DPRINTF(GPUTLB, "Paging enabled.\n");
674 //update LRU stack on a hit
675 TlbEntry *entry = lookup(vaddr, true);
676
677 if (entry)
678 tlb_hit = true;
679
680 if (!update_stats) {
681 // functional tlb access for memory initialization
682 // i.e., memory seeding or instr. seeding -> don't update
683 // TLB and stats
684 return tlb_hit;
685 }
686
687 localNumTLBAccesses++;
688
689 if (!entry) {
690 localNumTLBMisses++;
691 } else {
692 localNumTLBHits++;
693 }
694 }
695 }
696
697 return tlb_hit;
698 }
699
700 Fault
701 GpuTLB::translate(const RequestPtr &req, ThreadContext *tc,
702 Translation *translation, Mode mode,
703 bool &delayedResponse, bool timing, int &latency)
704 {
705 uint32_t flags = req->getFlags();
706 int seg = flags & SegmentFlagMask;
707 bool storeCheck = flags & (StoreCheck << FlagShift);
708
709 // If this is true, we're dealing with a request
710 // to a non-memory address space.
711 if (seg == SEGMENT_REG_MS) {
712 return translateInt(req, tc);
713 }
714
715 delayedResponse = false;
716 Addr vaddr = req->getVaddr();
717 DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
718
719 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
720
721 // If protected mode has been enabled...
722 if (m5Reg.prot) {
723 DPRINTF(GPUTLB, "In protected mode.\n");
724 // If we're not in 64-bit mode, do protection/limit checks
725 if (m5Reg.mode != LongMode) {
726 DPRINTF(GPUTLB, "Not in long mode. Checking segment "
727 "protection.\n");
728
729 // Check for a null segment selector.
730 if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
731 seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
732 && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
733 return std::make_shared<GeneralProtection>(0);
734 }
735
736 bool expandDown = false;
737 SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
738
739 if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
740 if (!attr.writable && (mode == BaseTLB::Write ||
741 storeCheck))
742 return std::make_shared<GeneralProtection>(0);
743
744 if (!attr.readable && mode == BaseTLB::Read)
745 return std::make_shared<GeneralProtection>(0);
746
747 expandDown = attr.expandDown;
748
749 }
750
751 Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
752 Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
753                 // This assumes we're not in 64-bit mode. If we were, the
754                 // default address size would be 64 bits, overridable to 32.
755 int size = 32;
756 bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
757 SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
758
759 if ((csAttr.defaultSize && sizeOverride) ||
760 (!csAttr.defaultSize && !sizeOverride)) {
761 size = 16;
762 }
763
764 Addr offset = bits(vaddr - base, size - 1, 0);
765 Addr endOffset = offset + req->getSize() - 1;
766
767 if (expandDown) {
768 DPRINTF(GPUTLB, "Checking an expand down segment.\n");
769 warn_once("Expand down segments are untested.\n");
770
771 if (offset <= limit || endOffset <= limit)
772 return std::make_shared<GeneralProtection>(0);
773 } else {
774 if (offset > limit || endOffset > limit)
775 return std::make_shared<GeneralProtection>(0);
776 }
777 }
778
779 // If paging is enabled, do the translation.
780 if (m5Reg.paging) {
781 DPRINTF(GPUTLB, "Paging enabled.\n");
782 // The vaddr already has the segment base applied.
783 TlbEntry *entry = lookup(vaddr);
784 localNumTLBAccesses++;
785
786 if (!entry) {
787 localNumTLBMisses++;
788 if (timing) {
789 latency = missLatency1;
790 }
791
792 if (FullSystem) {
793 fatal("GpuTLB doesn't support full-system mode\n");
794 } else {
795 DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
796 "at pc %#x.\n", vaddr, tc->instAddr());
797
798 Process *p = tc->getProcessPtr();
799 const EmulationPageTable::Entry *pte =
800 p->pTable->lookup(vaddr);
801
802 if (!pte && mode != BaseTLB::Execute) {
803 // penalize a "page fault" more
804 if (timing)
805 latency += missLatency2;
806
807 if (p->fixupStackFault(vaddr))
808 pte = p->pTable->lookup(vaddr);
809 }
810
811 if (!pte) {
812 return std::make_shared<PageFault>(vaddr, true,
813 mode, true,
814 false);
815 } else {
816 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
817
818 DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
819 alignedVaddr, pte->paddr);
820
821 TlbEntry gpuEntry(p->pid(), alignedVaddr,
822 pte->paddr, false, false);
823 entry = insert(alignedVaddr, gpuEntry);
824 }
825
826 DPRINTF(GPUTLB, "Miss was serviced.\n");
827 }
828 } else {
829 localNumTLBHits++;
830
831 if (timing) {
832 latency = hitLatency;
833 }
834 }
835
836 // Do paging protection checks.
837 bool inUser = (m5Reg.cpl == 3 &&
838 !(flags & (CPL0FlagBit << FlagShift)));
839
840 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
841 bool badWrite = (!entry->writable && (inUser || cr0.wp));
842
843 if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
844 badWrite)) {
845 // The page must have been present to get into the TLB in
846 // the first place. We'll assume the reserved bits are
847 // fine even though we're not checking them.
848 return std::make_shared<PageFault>(vaddr, true, mode,
849 inUser, false);
850 }
851
852 if (storeCheck && badWrite) {
853 // This would fault if this were a write, so return a page
854 // fault that reflects that happening.
855 return std::make_shared<PageFault>(vaddr, true,
856 BaseTLB::Write,
857 inUser, false);
858 }
859
860
861 DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
862 "checks.\n", entry->paddr);
863
864 int page_size = entry->size();
865 Addr paddr = entry->paddr | (vaddr & (page_size - 1));
866 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
867 req->setPaddr(paddr);
868
869 if (entry->uncacheable)
870 req->setFlags(Request::UNCACHEABLE);
871 } else {
872 //Use the address which already has segmentation applied.
873 DPRINTF(GPUTLB, "Paging disabled.\n");
874 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
875 req->setPaddr(vaddr);
876 }
877 } else {
878 // Real mode
879 DPRINTF(GPUTLB, "In real mode.\n");
880 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
881 req->setPaddr(vaddr);
882 }
883
884 // Check for an access to the local APIC
885 if (FullSystem) {
886 LocalApicBase localApicBase =
887 tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
888
889 Addr baseAddr = localApicBase.base * PageBytes;
890 Addr paddr = req->getPaddr();
891
892 if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
893 // Force the access to be uncacheable.
894 req->setFlags(Request::UNCACHEABLE);
895 req->setPaddr(x86LocalAPICAddress(tc->contextId(),
896 paddr - baseAddr));
897 }
898 }
899
900 return NoFault;
901     }
902
903 Fault
904 GpuTLB::translateAtomic(const RequestPtr &req, ThreadContext *tc,
905 Mode mode, int &latency)
906 {
907 bool delayedResponse;
908
909 return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
910 latency);
911 }
912
913 void
914 GpuTLB::translateTiming(const RequestPtr &req, ThreadContext *tc,
915 Translation *translation, Mode mode, int &latency)
916 {
917 bool delayedResponse;
918 assert(translation);
919
920 Fault fault = GpuTLB::translate(req, tc, translation, mode,
921 delayedResponse, true, latency);
922
923 if (!delayedResponse)
924 translation->finish(fault, req, tc, mode);
925 }
926
927 Walker*
928 GpuTLB::getWalker()
929 {
930 return walker;
931 }
932
933
934 void
935 GpuTLB::serialize(CheckpointOut &cp) const
936 {
937 }
938
939 void
940 GpuTLB::unserialize(CheckpointIn &cp)
941 {
942 }
943
944 void
945 GpuTLB::regStats()
946 {
947 ClockedObject::regStats();
948
949 localNumTLBAccesses
950 .name(name() + ".local_TLB_accesses")
951 .desc("Number of TLB accesses")
952 ;
953
954 localNumTLBHits
955 .name(name() + ".local_TLB_hits")
956 .desc("Number of TLB hits")
957 ;
958
959 localNumTLBMisses
960 .name(name() + ".local_TLB_misses")
961 .desc("Number of TLB misses")
962 ;
963
964 localTLBMissRate
965 .name(name() + ".local_TLB_miss_rate")
966 .desc("TLB miss rate")
967 ;
968
969 accessCycles
970 .name(name() + ".access_cycles")
971 .desc("Cycles spent accessing this TLB level")
972 ;
973
974 pageTableCycles
975 .name(name() + ".page_table_cycles")
976 .desc("Cycles spent accessing the page table")
977 ;
978
979 localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
980
981 numUniquePages
982 .name(name() + ".unique_pages")
983 .desc("Number of unique pages touched")
984 ;
985
986 localCycles
987 .name(name() + ".local_cycles")
988 .desc("Number of cycles spent in queue for all incoming reqs")
989 ;
990
991 localLatency
992 .name(name() + ".local_latency")
993 .desc("Avg. latency over incoming coalesced reqs")
994 ;
995
996 localLatency = localCycles / localNumTLBAccesses;
997
998 globalNumTLBAccesses
999 .name(name() + ".global_TLB_accesses")
1000 .desc("Number of TLB accesses")
1001 ;
1002
1003 globalNumTLBHits
1004 .name(name() + ".global_TLB_hits")
1005 .desc("Number of TLB hits")
1006 ;
1007
1008 globalNumTLBMisses
1009 .name(name() + ".global_TLB_misses")
1010 .desc("Number of TLB misses")
1011 ;
1012
1013 globalTLBMissRate
1014 .name(name() + ".global_TLB_miss_rate")
1015 .desc("TLB miss rate")
1016 ;
1017
1018 globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
1019
1020 avgReuseDistance
1021 .name(name() + ".avg_reuse_distance")
1022 .desc("avg. reuse distance over all pages (in ticks)")
1023 ;
1024
1025 }
1026
1027 /**
1028 * Do the TLB lookup for this coalesced request and schedule
1029 * another event <TLB access latency> cycles later.
1030 */
1031
1032 void
1033 GpuTLB::issueTLBLookup(PacketPtr pkt)
1034 {
1035 assert(pkt);
1036 assert(pkt->senderState);
1037
1038 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1039 TheISA::PageBytes);
1040
1041 TranslationState *sender_state =
1042 safe_cast<TranslationState*>(pkt->senderState);
1043
1044 bool update_stats = !sender_state->prefetch;
1045 ThreadContext * tmp_tc = sender_state->tc;
1046
1047 DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
1048 virt_page_addr);
1049
1050 int req_cnt = sender_state->reqCnt.back();
1051
1052 if (update_stats) {
1053 accessCycles -= (curTick() * req_cnt);
1054 localCycles -= curTick();
1055 updatePageFootprint(virt_page_addr);
1056 globalNumTLBAccesses += req_cnt;
1057 }
1058
1059 tlbOutcome lookup_outcome = TLB_MISS;
1060 const RequestPtr &tmp_req = pkt->req;
1061
1062 // Access the TLB and figure out if it's a hit or a miss.
1063 bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
1064
1065 if (success) {
1066 lookup_outcome = TLB_HIT;
1067 // Put the entry in SenderState
1068 TlbEntry *entry = lookup(tmp_req->getVaddr(), false);
1069 assert(entry);
1070
1071 auto p = sender_state->tc->getProcessPtr();
1072 sender_state->tlbEntry =
1073 new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
1074 false, false);
1075
1076 if (update_stats) {
1077 // the reqCnt has an entry per level, so its size tells us
1078 // which level we are in
1079 sender_state->hitLevel = sender_state->reqCnt.size();
1080 globalNumTLBHits += req_cnt;
1081 }
1082 } else {
1083 if (update_stats)
1084 globalNumTLBMisses += req_cnt;
1085 }
1086
1087 /*
1088 * We now know the TLB lookup outcome (if it's a hit or a miss), as well
1089 * as the TLB access latency.
1090 *
1091 * We create and schedule a new TLBEvent which will help us take the
1092 * appropriate actions (e.g., update TLB on a hit, send request to lower
1093 * level TLB on a miss, or start a page walk if this was the last-level
1094 * TLB)
1095 */
1096 TLBEvent *tlb_event =
1097 new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
1098
1099 if (translationReturnEvent.count(virt_page_addr)) {
1100 panic("Virtual Page Address %#x already has a return event\n",
1101 virt_page_addr);
1102 }
1103
1104 translationReturnEvent[virt_page_addr] = tlb_event;
1105 assert(tlb_event);
1106
1107 DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
1108 curTick() + this->ticks(hitLatency));
1109
1110 schedule(tlb_event, curTick() + this->ticks(hitLatency));
1111 }
1112
1113 GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
1114 PacketPtr _pkt)
1115 : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
1116 outcome(tlb_outcome), pkt(_pkt)
1117 {
1118 }
1119
1120 /**
1121  * Do the paging protection checks. If we encounter a page fault here,
1122  * we panic, since page faults are not expected at this point.
1123 */
1124 void
1125 GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
1126 TlbEntry * tlb_entry, Mode mode)
1127 {
1128 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
1129 uint32_t flags = pkt->req->getFlags();
1130 bool storeCheck = flags & (StoreCheck << FlagShift);
1131
1132 // Do paging protection checks.
1133 bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
1134 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
1135
1136 bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
1137
1138 if ((inUser && !tlb_entry->user) ||
1139 (mode == BaseTLB::Write && badWrite)) {
1140 // The page must have been present to get into the TLB in
1141 // the first place. We'll assume the reserved bits are
1142 // fine even though we're not checking them.
1143 panic("Page fault detected");
1144 }
1145
1146 if (storeCheck && badWrite) {
1147 // This would fault if this were a write, so return a page
1148 // fault that reflects that happening.
1149 panic("Page fault detected");
1150 }
1151 }
1152
1153 /**
1154 * handleTranslationReturn is called on a TLB hit,
1155  * when a TLB miss returns, or when a page walk returns.
1156  * In the latter two cases it is called with TLB_MISS as the tlbOutcome.
1157 */
1158 void
1159 GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
1160 PacketPtr pkt)
1161 {
1162
1163 assert(pkt);
1164 Addr vaddr = pkt->req->getVaddr();
1165
1166 TranslationState *sender_state =
1167 safe_cast<TranslationState*>(pkt->senderState);
1168
1169 ThreadContext *tc = sender_state->tc;
1170 Mode mode = sender_state->tlbMode;
1171
1172 TlbEntry *local_entry, *new_entry;
1173
1174 if (tlb_outcome == TLB_HIT) {
1175 DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
1176 local_entry = sender_state->tlbEntry;
1177 } else {
1178 DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
1179 vaddr);
1180
1181 // We are returning either from a page walk or from a hit at a lower
1182 // TLB level. The senderState should be "carrying" a pointer to the
1183 // correct TLBEntry.
1184 new_entry = sender_state->tlbEntry;
1185 assert(new_entry);
1186 local_entry = new_entry;
1187
1188 if (allocationPolicy) {
1189 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
1190 virt_page_addr);
1191
1192 local_entry = insert(virt_page_addr, *new_entry);
1193 }
1194
1195 assert(local_entry);
1196 }
1197
1198 /**
1199 * At this point the packet carries an up-to-date tlbEntry pointer
1200 * in its senderState.
1201 * Next step is to do the paging protection checks.
1202 */
1203 DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
1204 "while paddr was %#x.\n", local_entry->vaddr,
1205 local_entry->paddr);
1206
1207 pagingProtectionChecks(tc, pkt, local_entry, mode);
1208 int page_size = local_entry->size();
1209 Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
1210 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
1211
1212 // Since this packet will be sent through the cpu side slave port,
1213 // it must be converted to a response pkt if it is not one already
1214 if (pkt->isRequest()) {
1215 pkt->makeTimingResponse();
1216 }
1217
1218 pkt->req->setPaddr(paddr);
1219
1220 if (local_entry->uncacheable) {
1221 pkt->req->setFlags(Request::UNCACHEABLE);
1222 }
1223
1224 //send packet back to coalescer
1225 cpuSidePort[0]->sendTimingResp(pkt);
1226 //schedule cleanup event
1227 cleanupQueue.push(virt_page_addr);
1228
1229 // schedule this only once per cycle.
1230 // The check is required because we might have multiple translations
1231 // returning the same cycle
1232 // this is a maximum priority event and must be on the same cycle
1233 // as the cleanup event in TLBCoalescer to avoid a race with
1234 // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
1235 if (!cleanupEvent.scheduled())
1236 schedule(cleanupEvent, curTick());
1237 }
1238
1239 /**
1240 * Here we take the appropriate actions based on the result of the
1241 * TLB lookup.
1242 */
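// Outcomes handled below: TLB_HIT responds to the coalescer right away;
// TLB_MISS either forwards the packet to a lower-level TLB or schedules a
// PAGE_WALK; PAGE_WALK consults the process page table and then responds as
// a serviced miss; MISS_RETURN handles a reply coming back from a lower
// level and likewise responds as a serviced miss.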
1243 void
1244 GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
1245 PacketPtr pkt)
1246 {
1247 DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
1248
1249 assert(translationReturnEvent[virtPageAddr]);
1250 assert(pkt);
1251
1252 TranslationState *tmp_sender_state =
1253 safe_cast<TranslationState*>(pkt->senderState);
1254
1255 int req_cnt = tmp_sender_state->reqCnt.back();
1256 bool update_stats = !tmp_sender_state->prefetch;
1257
1258
1259 if (outcome == TLB_HIT) {
1260 handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
1261
1262 if (update_stats) {
1263 accessCycles += (req_cnt * curTick());
1264 localCycles += curTick();
1265 }
1266
1267 } else if (outcome == TLB_MISS) {
1268
1269 DPRINTF(GPUTLB, "This is a TLB miss\n");
1270 if (update_stats) {
1271 accessCycles += (req_cnt*curTick());
1272 localCycles += curTick();
1273 }
1274
1275 if (hasMemSidePort) {
1276             // the one cycle added here represents the delay from when we get
1277             // the reply back until we propagate it to the coalescer
1278 // above.
1279 if (update_stats) {
1280 accessCycles += (req_cnt * 1);
1281 localCycles += 1;
1282 }
1283
1284 /**
1285 * There is a TLB below. Send the coalesced request.
1286 * We actually send the very first packet of all the
1287 * pending packets for this virtual page address.
1288 */
1289 if (!memSidePort[0]->sendTimingReq(pkt)) {
1290 DPRINTF(GPUTLB, "Failed sending translation request to "
1291 "lower level TLB for addr %#x\n", virtPageAddr);
1292
1293 memSidePort[0]->retries.push_back(pkt);
1294 } else {
1295 DPRINTF(GPUTLB, "Sent translation request to lower level "
1296 "TLB for addr %#x\n", virtPageAddr);
1297 }
1298 } else {
1299 //this is the last level TLB. Start a page walk
1300 DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
1301 "addr %#x\n", virtPageAddr);
1302
1303 if (update_stats)
1304 pageTableCycles -= (req_cnt*curTick());
1305
1306 TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
1307 assert(tlb_event);
1308 tlb_event->updateOutcome(PAGE_WALK);
1309 schedule(tlb_event, curTick() + ticks(missLatency2));
1310 }
1311 } else if (outcome == PAGE_WALK) {
1312 if (update_stats)
1313 pageTableCycles += (req_cnt*curTick());
1314
1315 // Need to access the page table and update the TLB
1316 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1317 virtPageAddr);
1318
1319 TranslationState *sender_state =
1320 safe_cast<TranslationState*>(pkt->senderState);
1321
1322 Process *p = sender_state->tc->getProcessPtr();
1323 Addr vaddr = pkt->req->getVaddr();
1324 #ifndef NDEBUG
1325 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1326 assert(alignedVaddr == virtPageAddr);
1327 #endif
1328 const EmulationPageTable::Entry *pte = p->pTable->lookup(vaddr);
1329 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1330 p->fixupStackFault(vaddr)) {
1331 pte = p->pTable->lookup(vaddr);
1332 }
1333
1334 if (pte) {
1335 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1336 pte->paddr);
1337
1338 sender_state->tlbEntry =
1339 new TlbEntry(p->pid(), virtPageAddr, pte->paddr, false,
1340 false);
1341 } else {
1342 sender_state->tlbEntry = nullptr;
1343 }
1344
1345 handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1346 } else if (outcome == MISS_RETURN) {
1347 /** we add an extra cycle in the return path of the translation
1348 * requests in between the various TLB levels.
1349 */
1350 handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1351 } else {
1352 panic("Unexpected TLB outcome %d", outcome);
1353 }
1354 }
1355
1356 void
1357 GpuTLB::TLBEvent::process()
1358 {
1359 tlb->translationReturn(virtPageAddr, outcome, pkt);
1360 }
1361
1362 const char*
1363 GpuTLB::TLBEvent::description() const
1364 {
1365 return "trigger translationDoneEvent";
1366 }
1367
1368 void
1369 GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
1370 {
1371 outcome = _outcome;
1372 }
1373
1374 Addr
1375 GpuTLB::TLBEvent::getTLBEventVaddr()
1376 {
1377 return virtPageAddr;
1378 }
1379
1380 /*
1381  * recvTimingReq receives a coalesced timing request from a TLBCoalescer
1382  * and calls issueTLBLookup().
1383  * It only rejects the packet if we have exceeded the maximum
1384  * number of outstanding requests for this TLB.
1385 */
1386 bool
1387 GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
1388 {
1389 if (tlb->outstandingReqs < tlb->maxCoalescedReqs) {
1390 tlb->issueTLBLookup(pkt);
1391 // update number of outstanding translation requests
1392 tlb->outstandingReqs++;
1393 return true;
1394 } else {
1395 DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
1396 tlb->outstandingReqs);
1397 return false;
1398 }
1399 }
1400
1401 /**
1402 * handleFuncTranslationReturn is called on a TLB hit,
1403  * when a TLB miss returns, or when a page walk returns.
1404 * It updates LRU, inserts the TLB entry on a miss
1405 * depending on the allocation policy and does the required
1406 * protection checks. It does NOT create a new packet to
1407 * update the packet's addr; this is done in hsail-gpu code.
1408 */
1409 void
1410 GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
1411 {
1412 TranslationState *sender_state =
1413 safe_cast<TranslationState*>(pkt->senderState);
1414
1415 ThreadContext *tc = sender_state->tc;
1416 Mode mode = sender_state->tlbMode;
1417 Addr vaddr = pkt->req->getVaddr();
1418
1419 TlbEntry *local_entry, *new_entry;
1420
1421 if (tlb_outcome == TLB_HIT) {
1422 DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
1423 "%#x\n", vaddr);
1424
1425 local_entry = sender_state->tlbEntry;
1426 } else {
1427 DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
1428 "%#x\n", vaddr);
1429
1430 // We are returning either from a page walk or from a hit at a lower
1431 // TLB level. The senderState should be "carrying" a pointer to the
1432 // correct TLBEntry.
1433 new_entry = sender_state->tlbEntry;
1434 assert(new_entry);
1435 local_entry = new_entry;
1436
1437 if (allocationPolicy) {
1438 Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
1439
1440 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
1441 virt_page_addr);
1442
1443 local_entry = insert(virt_page_addr, *new_entry);
1444 }
1445
1446 assert(local_entry);
1447 }
1448
1449 DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
1450 "while paddr was %#x.\n", local_entry->vaddr,
1451 local_entry->paddr);
1452
1453 /**
1454 * Do paging checks if it's a normal functional access. If it's for a
1455 * prefetch, then sometimes you can try to prefetch something that
1456      * won't pass protection. We don't actually want to fault because there
1457 * is no demand access to deem this a violation. Just put it in the
1458 * TLB and it will fault if indeed a future demand access touches it in
1459 * violation.
1460 *
1461 * This feature could be used to explore security issues around
1462 * speculative memory accesses.
1463 */
1464 if (!sender_state->prefetch && sender_state->tlbEntry)
1465 pagingProtectionChecks(tc, pkt, local_entry, mode);
1466
1467 int page_size = local_entry->size();
1468 Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
1469 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
1470
1471 pkt->req->setPaddr(paddr);
1472
1473 if (local_entry->uncacheable)
1474 pkt->req->setFlags(Request::UNCACHEABLE);
1475 }
1476
1477 // This is used for atomic translations. Need to
1478 // make it all happen during the same cycle.
1479 void
1480 GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
1481 {
1482 TranslationState *sender_state =
1483 safe_cast<TranslationState*>(pkt->senderState);
1484
1485 ThreadContext *tc = sender_state->tc;
1486 bool update_stats = !sender_state->prefetch;
1487
1488 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1489 TheISA::PageBytes);
1490
1491 if (update_stats)
1492 tlb->updatePageFootprint(virt_page_addr);
1493
1494         // do the TLB lookup; stats are only updated for non-prefetch accesses
1495 bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
1496 tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
1497
1498 // functional mode means no coalescing
1499 // global metrics are the same as the local metrics
1500 if (update_stats) {
1501 tlb->globalNumTLBAccesses++;
1502
1503 if (success) {
1504 sender_state->hitLevel = sender_state->reqCnt.size();
1505 tlb->globalNumTLBHits++;
1506 }
1507 }
1508
1509 if (!success) {
1510 if (update_stats)
1511 tlb->globalNumTLBMisses++;
1512 if (tlb->hasMemSidePort) {
1513 // there is a TLB below -> propagate down the TLB hierarchy
1514 tlb->memSidePort[0]->sendFunctional(pkt);
1515 // If no valid translation from a prefetch, then just return
1516 if (sender_state->prefetch && !pkt->req->hasPaddr())
1517 return;
1518 } else {
1519 // Need to access the page table and update the TLB
1520 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1521 virt_page_addr);
1522
1523 Process *p = tc->getProcessPtr();
1524
1525 Addr vaddr = pkt->req->getVaddr();
1526 #ifndef NDEBUG
1527 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1528 assert(alignedVaddr == virt_page_addr);
1529 #endif
1530
1531 const EmulationPageTable::Entry *pte =
1532 p->pTable->lookup(vaddr);
1533 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1534 p->fixupStackFault(vaddr)) {
1535 pte = p->pTable->lookup(vaddr);
1536 }
1537
1538 if (!sender_state->prefetch) {
1539 // no PageFaults are permitted after
1540 // the second page table lookup
1541 assert(pte);
1542
1543 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1544 pte->paddr);
1545
1546 sender_state->tlbEntry =
1547 new TlbEntry(p->pid(), virt_page_addr,
1548 pte->paddr, false, false);
1549 } else {
1550 // If this was a prefetch, then do the normal thing if it
1551 // was a successful translation. Otherwise, send an empty
1552 // TLB entry back so that it can be figured out as empty and
1553 // handled accordingly.
1554 if (pte) {
1555 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1556 pte->paddr);
1557
1558 sender_state->tlbEntry =
1559 new TlbEntry(p->pid(), virt_page_addr,
1560 pte->paddr, false, false);
1561 } else {
1562 DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
1563 alignedVaddr);
1564
1565 sender_state->tlbEntry = nullptr;
1566
1567 return;
1568 }
1569 }
1570 }
1571 } else {
1572 DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
1573 tlb->lookup(pkt->req->getVaddr()));
1574
1575 TlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
1576 update_stats);
1577
1578 assert(entry);
1579
1580 auto p = sender_state->tc->getProcessPtr();
1581 sender_state->tlbEntry =
1582 new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
1583 false, false);
1584 }
1585 // This is the function that would populate pkt->req with the paddr of
1586         // the translation. But if no translation happens (i.e., the prefetch
1587         // fails), then the early returns in the above code will keep this
1588         // function from executing.
1589 tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
1590 }
1591
1592 void
1593 GpuTLB::CpuSidePort::recvReqRetry()
1594 {
1595 // The CPUSidePort never sends anything but replies. No retries
1596 // expected.
1597 panic("recvReqRetry called");
1598 }
1599
1600 AddrRangeList
1601 GpuTLB::CpuSidePort::getAddrRanges() const
1602 {
1603 // currently not checked by the master
1604 AddrRangeList ranges;
1605
1606 return ranges;
1607 }
1608
1609 /**
1610 * MemSidePort receives the packet back.
1611 * We need to call the handleTranslationReturn
1612 * and propagate up the hierarchy.
1613 */
1614 bool
1615 GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
1616 {
1617 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1618 TheISA::PageBytes);
1619
1620 DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
1621 virt_page_addr);
1622
1623 TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
1624 assert(tlb_event);
1625 assert(virt_page_addr == tlb_event->getTLBEventVaddr());
1626
1627 tlb_event->updateOutcome(MISS_RETURN);
1628 tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
1629
1630 return true;
1631 }
1632
1633 void
1634 GpuTLB::MemSidePort::recvReqRetry()
1635 {
1636 // No retries should reach the TLB. The retries
1637 // should only reach the TLBCoalescer.
1638 panic("recvReqRetry called");
1639 }
1640
1641 void
1642 GpuTLB::cleanup()
1643 {
1644 while (!cleanupQueue.empty()) {
1645 Addr cleanup_addr = cleanupQueue.front();
1646 cleanupQueue.pop();
1647
1648 // delete TLBEvent
1649 TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
1650 delete old_tlb_event;
1651 translationReturnEvent.erase(cleanup_addr);
1652
1653 // update number of outstanding requests
1654 outstandingReqs--;
1655 }
1656
1657         /** The higher-level coalescer should retry if it has
1658 * any pending requests.
1659 */
1660 for (int i = 0; i < cpuSidePort.size(); ++i) {
1661 cpuSidePort[i]->sendRetryReq();
1662 }
1663 }
1664
1665 void
1666 GpuTLB::updatePageFootprint(Addr virt_page_addr)
1667 {
1668
1669 std::pair<AccessPatternTable::iterator, bool> ret;
1670
1671 AccessInfo tmp_access_info;
1672 tmp_access_info.lastTimeAccessed = 0;
1673 tmp_access_info.accessesPerPage = 0;
1674 tmp_access_info.totalReuseDistance = 0;
1675 tmp_access_info.sumDistance = 0;
1676 tmp_access_info.meanDistance = 0;
1677
1678 ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
1679 tmp_access_info));
1680
1681 bool first_page_access = ret.second;
1682
1683 if (first_page_access) {
1684 numUniquePages++;
1685 } else {
1686 int accessed_before;
1687 accessed_before = curTick() - ret.first->second.lastTimeAccessed;
1688 ret.first->second.totalReuseDistance += accessed_before;
1689 }
1690
1691 ret.first->second.accessesPerPage++;
1692 ret.first->second.lastTimeAccessed = curTick();
1693
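    // When access-distance profiling is enabled, record the running count of
    // local TLB accesses at the time this page is touched; exitCallback()
    // later turns these samples into per-page access distances and dumps
    // them to a per-TLB .csv file.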
1694 if (accessDistance) {
1695 ret.first->second.localTLBAccesses
1696 .push_back(localNumTLBAccesses.value());
1697 }
1698 }
1699
1700 void
1701 GpuTLB::exitCallback()
1702 {
1703 std::ostream *page_stat_file = nullptr;
1704
1705 if (accessDistance) {
1706
1707             // print per-page statistics to a separate file (.csv format)
1708             // simout is the gem5 output directory (default is m5out, or the one
1709             // specified with -d)
1710 page_stat_file = simout.create(name().c_str())->stream();
1711
1712 // print header
1713             *page_stat_file << "page,max_access_distance,mean_access_distance,"
1714 << "stddev_distance" << std::endl;
1715 }
1716
1717 // update avg. reuse distance footprint
1718 AccessPatternTable::iterator iter, iter_begin, iter_end;
1719 unsigned int sum_avg_reuse_distance_per_page = 0;
1720
1721 // iterate through all pages seen by this TLB
1722 for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
1723 sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
1724 iter->second.accessesPerPage;
1725
1726 if (accessDistance) {
1727 unsigned int tmp = iter->second.localTLBAccesses[0];
1728 unsigned int prev = tmp;
1729
1730 for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
1731 if (i) {
1732 tmp = prev + 1;
1733 }
1734
1735 prev = iter->second.localTLBAccesses[i];
1736 // update the localTLBAccesses value
1737                     // with the actual difference
1738 iter->second.localTLBAccesses[i] -= tmp;
1739 // compute the sum of AccessDistance per page
1740 // used later for mean
1741 iter->second.sumDistance +=
1742 iter->second.localTLBAccesses[i];
1743 }
1744
1745 iter->second.meanDistance =
1746 iter->second.sumDistance / iter->second.accessesPerPage;
1747
1748 // compute std_dev and max (we need a second round because we
1749                 // need to know the mean value)
1750 unsigned int max_distance = 0;
1751 unsigned int stddev_distance = 0;
1752
1753 for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
1754 unsigned int tmp_access_distance =
1755 iter->second.localTLBAccesses[i];
1756
1757 if (tmp_access_distance > max_distance) {
1758 max_distance = tmp_access_distance;
1759 }
1760
1761 unsigned int diff =
1762 tmp_access_distance - iter->second.meanDistance;
1763 stddev_distance += pow(diff, 2);
1764
1765 }
1766
1767 stddev_distance =
1768 sqrt(stddev_distance/iter->second.accessesPerPage);
1769
1770 if (page_stat_file) {
1771 *page_stat_file << std::hex << iter->first << ",";
1772 *page_stat_file << std::dec << max_distance << ",";
1773 *page_stat_file << std::dec << iter->second.meanDistance
1774 << ",";
1775 *page_stat_file << std::dec << stddev_distance;
1776 *page_stat_file << std::endl;
1777 }
1778
1779 // erase the localTLBAccesses array
1780 iter->second.localTLBAccesses.clear();
1781 }
1782 }
1783
1784 if (!TLBFootprint.empty()) {
1785 avgReuseDistance =
1786 sum_avg_reuse_distance_per_page / TLBFootprint.size();
1787 }
1788
1789 //clear the TLBFootprint map
1790 TLBFootprint.clear();
1791 }
1792 } // namespace X86ISA
1793
1794 X86ISA::GpuTLB*
1795 X86GPUTLBParams::create()
1796 {
1797 return new X86ISA::GpuTLB(this);
1798 }
1799