gpu-compute: Dropping fetches when no entry is reserved in the buffer
[gem5.git] / src / gpu-compute / gpu_tlb.cc
1 /*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36 #include "gpu-compute/gpu_tlb.hh"
37
38 #include <cmath>
39 #include <cstring>
40
41 #include "arch/x86/faults.hh"
42 #include "arch/x86/insts/microldstop.hh"
43 #include "arch/x86/pagetable.hh"
44 #include "arch/x86/pagetable_walker.hh"
45 #include "arch/x86/regs/misc.hh"
46 #include "arch/x86/regs/msr.hh"
47 #include "arch/x86/x86_traits.hh"
48 #include "base/bitfield.hh"
49 #include "base/logging.hh"
50 #include "base/output.hh"
51 #include "base/trace.hh"
52 #include "cpu/base.hh"
53 #include "cpu/thread_context.hh"
54 #include "debug/GPUPrefetch.hh"
55 #include "debug/GPUTLB.hh"
56 #include "mem/packet_access.hh"
57 #include "mem/page_table.hh"
58 #include "mem/request.hh"
59 #include "sim/process.hh"
60 #include "sim/pseudo_inst.hh"
61
62 namespace X86ISA
63 {
64
65 GpuTLB::GpuTLB(const Params *p)
66 : ClockedObject(p), configAddress(0), size(p->size),
67 cleanupEvent([this]{ cleanup(); }, name(), false,
68 Event::Maximum_Pri),
69 exitEvent([this]{ exitCallback(); }, name())
70 {
71 assoc = p->assoc;
72 assert(assoc <= size);
73 numSets = size/assoc;
74 allocationPolicy = p->allocationPolicy;
75 hasMemSidePort = false;
76 accessDistance = p->accessDistance;
77
78 tlb.assign(size, TlbEntry());
79
80 freeList.resize(numSets);
81 entryList.resize(numSets);
82
83 for (int set = 0; set < numSets; ++set) {
84 for (int way = 0; way < assoc; ++way) {
85 int x = set * assoc + way;
86 freeList[set].push_back(&tlb.at(x));
87 }
88 }
89
90 FA = (size == assoc);
91
92     /**
93      * @warning: the set-associative version assumes a fixed page size
94      * of 4KB.
95      * If the page size is greater than 4KB (as defined by
96      * TheISA::PageBytes), the current implementation has various
97      * issues (e.g., the same large page would be replicated across
98      * different sets).
99      */
100 setMask = numSets - 1;
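    // Example (assuming 4KB pages): with size = 512 and assoc = 8,
    // numSets = 64 and setMask = 0x3f, so a virtual page maps to set
    // (vpn >> TheISA::PageShift) & setMask, as done in insert() and
    // lookupIt() below.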
101
102 maxCoalescedReqs = p->maxOutstandingReqs;
103
104 // Do not allow maxCoalescedReqs to be more than the TLB associativity
105 if (maxCoalescedReqs > assoc) {
106 maxCoalescedReqs = assoc;
107 cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
108 }
109
110 outstandingReqs = 0;
111 hitLatency = p->hitLatency;
112 missLatency1 = p->missLatency1;
113 missLatency2 = p->missLatency2;
114
115 // create the slave ports based on the number of connected ports
116 for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
117 cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
118 name(), i), this, i));
119 }
120
121 // create the master ports based on the number of connected ports
122 for (size_t i = 0; i < p->port_master_connection_count; ++i) {
123 memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
124 name(), i), this, i));
125 }
126 }
127
128 // fixme: this is never called?
129 GpuTLB::~GpuTLB()
130 {
131 // make sure all the hash-maps are empty
132 assert(translationReturnEvent.empty());
133 }
134
135 Port &
136 GpuTLB::getPort(const std::string &if_name, PortID idx)
137 {
138 if (if_name == "slave") {
139 if (idx >= static_cast<PortID>(cpuSidePort.size())) {
140 panic("TLBCoalescer::getPort: unknown index %d\n", idx);
141 }
142
143 return *cpuSidePort[idx];
144 } else if (if_name == "master") {
145 if (idx >= static_cast<PortID>(memSidePort.size())) {
146 panic("TLBCoalescer::getPort: unknown index %d\n", idx);
147 }
148
149 hasMemSidePort = true;
150
151 return *memSidePort[idx];
152 } else {
153 panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
154 }
155 }
156
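/**
 * Allocate (or recycle) a TLB entry for the page containing vpn: take a
 * free way in the page's set if one is available, otherwise evict the
 * LRU entry (the back of the set's entry list), and place the new entry
 * at the MRU position (front of the list).
 */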
157 TlbEntry*
158 GpuTLB::insert(Addr vpn, TlbEntry &entry)
159 {
160 TlbEntry *newEntry = nullptr;
161
162 /**
163 * vpn holds the virtual page address
164 * The least significant bits are simply masked
165 */
166 int set = (vpn >> TheISA::PageShift) & setMask;
167
168 if (!freeList[set].empty()) {
169 newEntry = freeList[set].front();
170 freeList[set].pop_front();
171 } else {
172 newEntry = entryList[set].back();
173 entryList[set].pop_back();
174 }
175
176 *newEntry = entry;
177 newEntry->vaddr = vpn;
178 entryList[set].push_front(newEntry);
179
180 return newEntry;
181 }
182
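/**
 * Find the entry (if any) whose page contains va and return an iterator
 * to it; entryList[set].end() indicates a miss. If update_lru is set, a
 * hit moves the entry to the front of its set's list, so list order
 * doubles as LRU order (insert() evicts from the back).
 */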
183 GpuTLB::EntryList::iterator
184 GpuTLB::lookupIt(Addr va, bool update_lru)
185 {
186 int set = (va >> TheISA::PageShift) & setMask;
187
188 if (FA) {
189 assert(!set);
190 }
191
192 auto entry = entryList[set].begin();
193 for (; entry != entryList[set].end(); ++entry) {
194 int page_size = (*entry)->size();
195
196 if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
197 DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
198 "with size %#x.\n", va, (*entry)->vaddr, page_size);
199
200 if (update_lru) {
201 entryList[set].push_front(*entry);
202 entryList[set].erase(entry);
203 entry = entryList[set].begin();
204 }
205
206 break;
207 }
208 }
209
210 return entry;
211 }
212
213 TlbEntry*
214 GpuTLB::lookup(Addr va, bool update_lru)
215 {
216 int set = (va >> TheISA::PageShift) & setMask;
217
218 auto entry = lookupIt(va, update_lru);
219
220 if (entry == entryList[set].end())
221 return nullptr;
222 else
223 return *entry;
224 }
225
226 void
227 GpuTLB::invalidateAll()
228 {
229 DPRINTF(GPUTLB, "Invalidating all entries.\n");
230
231 for (int i = 0; i < numSets; ++i) {
232 while (!entryList[i].empty()) {
233 TlbEntry *entry = entryList[i].front();
234 entryList[i].pop_front();
235 freeList[i].push_back(entry);
236 }
237 }
238 }
239
240 void
241 GpuTLB::setConfigAddress(uint32_t addr)
242 {
243 configAddress = addr;
244 }
245
246 void
247 GpuTLB::invalidateNonGlobal()
248 {
249 DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
250
251 for (int i = 0; i < numSets; ++i) {
252 for (auto entryIt = entryList[i].begin();
253 entryIt != entryList[i].end();) {
254 if (!(*entryIt)->global) {
255 freeList[i].push_back(*entryIt);
256 entryList[i].erase(entryIt++);
257 } else {
258 ++entryIt;
259 }
260 }
261 }
262 }
263
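/**
 * Invalidate the entry (if any) that maps va by moving it from the entry
 * list back to the free list. The asn argument is ignored; this TLB does
 * not tag entries with an address-space number.
 */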
264 void
265 GpuTLB::demapPage(Addr va, uint64_t asn)
266 {
267
268 int set = (va >> TheISA::PageShift) & setMask;
269 auto entry = lookupIt(va, false);
270
271 if (entry != entryList[set].end()) {
272 freeList[set].push_back(*entry);
273 entryList[set].erase(entry);
274 }
275 }
276
277
278
279 namespace
280 {
281
282 Cycles
283 localMiscRegAccess(bool read, MiscRegIndex regNum,
284 ThreadContext *tc, PacketPtr pkt)
285 {
286 if (read) {
287 RegVal data = htole(tc->readMiscReg(regNum));
288 // Make sure we don't trot off the end of data.
289 pkt->setData((uint8_t *)&data);
290 } else {
291         RegVal data = htole(tc->readMiscRegNoEffect(regNum));
            // Fetch the value to be written from the packet, then update
            // the misc reg with it.
            pkt->writeData((uint8_t *)&data);
292         tc->setMiscReg(regNum, letoh(data));
293 }
294 return Cycles(1);
295 }
296
297 } // anonymous namespace
298
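/**
 * Handle a request to the x86 internal (non-memory) address space: MSRs,
 * I/O ports and PCI config space. The request either gets a local
 * accessor or has its physical address set directly; no TLB entry is
 * involved.
 */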
299 Fault
300 GpuTLB::translateInt(bool read, const RequestPtr &req, ThreadContext *tc)
301 {
302     DPRINTF(GPUTLB, "Address references internal memory.\n");
303 Addr vaddr = req->getVaddr();
304 Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
305
306 if (prefix == IntAddrPrefixCPUID) {
307 panic("CPUID memory space not yet implemented!\n");
308 } else if (prefix == IntAddrPrefixMSR) {
309 vaddr = (vaddr >> 3) & ~IntAddrPrefixMask;
310
311 MiscRegIndex regNum;
312 if (!msrAddrToIndex(regNum, vaddr))
313 return std::make_shared<GeneralProtection>(0);
314
315 req->setLocalAccessor(
316 [read,regNum](ThreadContext *tc, PacketPtr pkt)
317 {
318 return localMiscRegAccess(read, regNum, tc, pkt);
319 }
320 );
321
322 return NoFault;
323 } else if (prefix == IntAddrPrefixIO) {
324 // TODO If CPL > IOPL or in virtual mode, check the I/O permission
325 // bitmap in the TSS.
326
327 Addr IOPort = vaddr & ~IntAddrPrefixMask;
328 // Make sure the address fits in the expected 16 bit IO address
329 // space.
330 assert(!(IOPort & ~0xFFFF));
331 if (IOPort == 0xCF8 && req->getSize() == 4) {
332 req->setLocalAccessor(
333 [read](ThreadContext *tc, PacketPtr pkt)
334 {
335 return localMiscRegAccess(
336 read, MISCREG_PCI_CONFIG_ADDRESS, tc, pkt);
337 }
338 );
339 } else if ((IOPort & ~mask(2)) == 0xCFC) {
340 req->setFlags(Request::UNCACHEABLE | Request::STRICT_ORDER);
341 Addr configAddress =
342 tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
343 if (bits(configAddress, 31, 31)) {
344 req->setPaddr(PhysAddrPrefixPciConfig |
345 mbits(configAddress, 30, 2) |
346 (IOPort & mask(2)));
347 } else {
348 req->setPaddr(PhysAddrPrefixIO | IOPort);
349 }
350 } else {
351 req->setFlags(Request::UNCACHEABLE | Request::STRICT_ORDER);
352 req->setPaddr(PhysAddrPrefixIO | IOPort);
353 }
354 return NoFault;
355 } else {
356 panic("Access to unrecognized internal address space %#x.\n",
357 prefix);
358 }
359 }
360
361 /**
362  * tlbLookup only performs a TLB lookup, returning true on a TLB hit
363  * and false on a TLB miss.
364 * Many of the checks about different modes have been converted to
365 * assertions, since these parts of the code are not really used.
366 * On a hit it will update the LRU stack.
367 */
368 bool
369 GpuTLB::tlbLookup(const RequestPtr &req,
370 ThreadContext *tc, bool update_stats)
371 {
372 bool tlb_hit = false;
373 #ifndef NDEBUG
374 uint32_t flags = req->getFlags();
375 int seg = flags & SegmentFlagMask;
376 #endif
377
378 assert(seg != SEGMENT_REG_MS);
379 Addr vaddr = req->getVaddr();
380 DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
381 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
382
383 if (m5Reg.prot) {
384 DPRINTF(GPUTLB, "In protected mode.\n");
385 // make sure we are in 64-bit mode
386 assert(m5Reg.mode == LongMode);
387
388 // If paging is enabled, do the translation.
389 if (m5Reg.paging) {
390 DPRINTF(GPUTLB, "Paging enabled.\n");
391 //update LRU stack on a hit
392 TlbEntry *entry = lookup(vaddr, true);
393
394 if (entry)
395 tlb_hit = true;
396
397 if (!update_stats) {
398 // functional tlb access for memory initialization
399 // i.e., memory seeding or instr. seeding -> don't update
400 // TLB and stats
401 return tlb_hit;
402 }
403
404 localNumTLBAccesses++;
405
406 if (!entry) {
407 localNumTLBMisses++;
408 } else {
409 localNumTLBHits++;
410 }
411 }
412 }
413
414 return tlb_hit;
415 }
416
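/**
 * Main translation routine, shared by the atomic and timing paths. It
 * performs segmentation checks when not in long mode, consults the TLB
 * (falling back to the process page table in SE mode on a miss), applies
 * the paging protection checks, and finally sets the physical address on
 * the request. latency is only filled in for timing translations.
 */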
417 Fault
418 GpuTLB::translate(const RequestPtr &req, ThreadContext *tc,
419 Translation *translation, Mode mode,
420 bool &delayedResponse, bool timing, int &latency)
421 {
422 uint32_t flags = req->getFlags();
423 int seg = flags & SegmentFlagMask;
424 bool storeCheck = flags & (StoreCheck << FlagShift);
425
426 // If this is true, we're dealing with a request
427 // to a non-memory address space.
428 if (seg == SEGMENT_REG_MS) {
429 return translateInt(mode == Mode::Read, req, tc);
430 }
431
432 delayedResponse = false;
433 Addr vaddr = req->getVaddr();
434 DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
435
436 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
437
438 // If protected mode has been enabled...
439 if (m5Reg.prot) {
440 DPRINTF(GPUTLB, "In protected mode.\n");
441 // If we're not in 64-bit mode, do protection/limit checks
442 if (m5Reg.mode != LongMode) {
443 DPRINTF(GPUTLB, "Not in long mode. Checking segment "
444 "protection.\n");
445
446 // Check for a null segment selector.
447 if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
448 seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
449 && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
450 return std::make_shared<GeneralProtection>(0);
451 }
452
453 bool expandDown = false;
454 SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
455
456 if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
457 if (!attr.writable && (mode == BaseTLB::Write ||
458 storeCheck))
459 return std::make_shared<GeneralProtection>(0);
460
461 if (!attr.readable && mode == BaseTLB::Read)
462 return std::make_shared<GeneralProtection>(0);
463
464 expandDown = attr.expandDown;
465
466 }
467
468 Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
469 Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
470 // This assumes we're not in 64 bit mode. If we were, the
471 // default address size is 64 bits, overridable to 32.
472 int size = 32;
473 bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
474 SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
475
476 if ((csAttr.defaultSize && sizeOverride) ||
477 (!csAttr.defaultSize && !sizeOverride)) {
478 size = 16;
479 }
480
481 Addr offset = bits(vaddr - base, size - 1, 0);
482 Addr endOffset = offset + req->getSize() - 1;
483
484 if (expandDown) {
485 DPRINTF(GPUTLB, "Checking an expand down segment.\n");
486 warn_once("Expand down segments are untested.\n");
487
488 if (offset <= limit || endOffset <= limit)
489 return std::make_shared<GeneralProtection>(0);
490 } else {
491 if (offset > limit || endOffset > limit)
492 return std::make_shared<GeneralProtection>(0);
493 }
494 }
495
496 // If paging is enabled, do the translation.
497 if (m5Reg.paging) {
498 DPRINTF(GPUTLB, "Paging enabled.\n");
499 // The vaddr already has the segment base applied.
500 TlbEntry *entry = lookup(vaddr);
501 localNumTLBAccesses++;
502
503 if (!entry) {
504 localNumTLBMisses++;
505 if (timing) {
506 latency = missLatency1;
507 }
508
509 if (FullSystem) {
510 fatal("GpuTLB doesn't support full-system mode\n");
511 } else {
512 DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
513 "at pc %#x.\n", vaddr, tc->instAddr());
514
515 Process *p = tc->getProcessPtr();
516 const EmulationPageTable::Entry *pte =
517 p->pTable->lookup(vaddr);
518
519 if (!pte && mode != BaseTLB::Execute) {
520 // penalize a "page fault" more
521 if (timing)
522 latency += missLatency2;
523
524 if (p->fixupFault(vaddr))
525 pte = p->pTable->lookup(vaddr);
526 }
527
528 if (!pte) {
529 return std::make_shared<PageFault>(vaddr, true,
530 mode, true,
531 false);
532 } else {
533 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
534
535 DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
536 alignedVaddr, pte->paddr);
537
538 TlbEntry gpuEntry(p->pid(), alignedVaddr,
539 pte->paddr, false, false);
540 entry = insert(alignedVaddr, gpuEntry);
541 }
542
543 DPRINTF(GPUTLB, "Miss was serviced.\n");
544 }
545 } else {
546 localNumTLBHits++;
547
548 if (timing) {
549 latency = hitLatency;
550 }
551 }
552
553 // Do paging protection checks.
554 bool inUser = (m5Reg.cpl == 3 &&
555 !(flags & (CPL0FlagBit << FlagShift)));
556
557 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
558 bool badWrite = (!entry->writable && (inUser || cr0.wp));
559
560 if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
561 badWrite)) {
562 // The page must have been present to get into the TLB in
563 // the first place. We'll assume the reserved bits are
564 // fine even though we're not checking them.
565 return std::make_shared<PageFault>(vaddr, true, mode,
566 inUser, false);
567 }
568
569 if (storeCheck && badWrite) {
570 // This would fault if this were a write, so return a page
571 // fault that reflects that happening.
572 return std::make_shared<PageFault>(vaddr, true,
573 BaseTLB::Write,
574 inUser, false);
575 }
576
577
578 DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
579 "checks.\n", entry->paddr);
580
581 int page_size = entry->size();
582 Addr paddr = entry->paddr | (vaddr & (page_size - 1));
583 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
584 req->setPaddr(paddr);
585
586 if (entry->uncacheable)
587 req->setFlags(Request::UNCACHEABLE);
588 } else {
589 //Use the address which already has segmentation applied.
590 DPRINTF(GPUTLB, "Paging disabled.\n");
591 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
592 req->setPaddr(vaddr);
593 }
594 } else {
595 // Real mode
596 DPRINTF(GPUTLB, "In real mode.\n");
597 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
598 req->setPaddr(vaddr);
599 }
600
601 // Check for an access to the local APIC
602 if (FullSystem) {
603 LocalApicBase localApicBase =
604 tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
605
606 Addr baseAddr = localApicBase.base * PageBytes;
607 Addr paddr = req->getPaddr();
608
609 if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
610 // Force the access to be uncacheable.
611 req->setFlags(Request::UNCACHEABLE);
612 req->setPaddr(x86LocalAPICAddress(tc->contextId(),
613 paddr - baseAddr));
614 }
615 }
616
617 return NoFault;
618 };
619
620 Fault
621 GpuTLB::translateAtomic(const RequestPtr &req, ThreadContext *tc,
622 Mode mode, int &latency)
623 {
624 bool delayedResponse;
625
626 return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse,
627 false, latency);
628 }
629
630 void
631 GpuTLB::translateTiming(const RequestPtr &req, ThreadContext *tc,
632 Translation *translation, Mode mode, int &latency)
633 {
634 bool delayedResponse;
635 assert(translation);
636
637 Fault fault = GpuTLB::translate(req, tc, translation, mode,
638 delayedResponse, true, latency);
639
640 if (!delayedResponse)
641 translation->finish(fault, req, tc, mode);
642 }
643
644 Walker*
645 GpuTLB::getWalker()
646 {
647 return walker;
648 }
649
650
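// Checkpointing of TLB state is not supported, so serialize() and
// unserialize() are empty.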
651 void
652 GpuTLB::serialize(CheckpointOut &cp) const
653 {
654 }
655
656 void
657 GpuTLB::unserialize(CheckpointIn &cp)
658 {
659 }
660
661 void
662 GpuTLB::regStats()
663 {
664 ClockedObject::regStats();
665
666 localNumTLBAccesses
667 .name(name() + ".local_TLB_accesses")
668 .desc("Number of TLB accesses")
669 ;
670
671 localNumTLBHits
672 .name(name() + ".local_TLB_hits")
673 .desc("Number of TLB hits")
674 ;
675
676 localNumTLBMisses
677 .name(name() + ".local_TLB_misses")
678 .desc("Number of TLB misses")
679 ;
680
681 localTLBMissRate
682 .name(name() + ".local_TLB_miss_rate")
683 .desc("TLB miss rate")
684 ;
685
686 accessCycles
687 .name(name() + ".access_cycles")
688 .desc("Cycles spent accessing this TLB level")
689 ;
690
691 pageTableCycles
692 .name(name() + ".page_table_cycles")
693 .desc("Cycles spent accessing the page table")
694 ;
695
696 localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
697
698 numUniquePages
699 .name(name() + ".unique_pages")
700 .desc("Number of unique pages touched")
701 ;
702
703 localCycles
704 .name(name() + ".local_cycles")
705 .desc("Number of cycles spent in queue for all incoming reqs")
706 ;
707
708 localLatency
709 .name(name() + ".local_latency")
710 .desc("Avg. latency over incoming coalesced reqs")
711 ;
712
713 localLatency = localCycles / localNumTLBAccesses;
714
715 globalNumTLBAccesses
716 .name(name() + ".global_TLB_accesses")
717 .desc("Number of TLB accesses")
718 ;
719
720 globalNumTLBHits
721 .name(name() + ".global_TLB_hits")
722 .desc("Number of TLB hits")
723 ;
724
725 globalNumTLBMisses
726 .name(name() + ".global_TLB_misses")
727 .desc("Number of TLB misses")
728 ;
729
730 globalTLBMissRate
731 .name(name() + ".global_TLB_miss_rate")
732 .desc("TLB miss rate")
733 ;
734
735 globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
736
737 avgReuseDistance
738 .name(name() + ".avg_reuse_distance")
739 .desc("avg. reuse distance over all pages (in ticks)")
740 ;
741
742 }
743
744 /**
745 * Do the TLB lookup for this coalesced request and schedule
746 * another event <TLB access latency> cycles later.
747 */
748
749 void
750 GpuTLB::issueTLBLookup(PacketPtr pkt)
751 {
752 assert(pkt);
753 assert(pkt->senderState);
754
755 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
756 TheISA::PageBytes);
757
758 TranslationState *sender_state =
759 safe_cast<TranslationState*>(pkt->senderState);
760
761 bool update_stats = !sender_state->prefetch;
762 ThreadContext * tmp_tc = sender_state->tc;
763
764 DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
765 virt_page_addr);
766
767 int req_cnt = sender_state->reqCnt.back();
768
769 if (update_stats) {
770 accessCycles -= (curTick() * req_cnt);
771 localCycles -= curTick();
772 updatePageFootprint(virt_page_addr);
773 globalNumTLBAccesses += req_cnt;
774 }
775
776 tlbOutcome lookup_outcome = TLB_MISS;
777 const RequestPtr &tmp_req = pkt->req;
778
779 // Access the TLB and figure out if it's a hit or a miss.
780 bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
781
782 if (success) {
783 lookup_outcome = TLB_HIT;
784 // Put the entry in SenderState
785 TlbEntry *entry = lookup(tmp_req->getVaddr(), false);
786 assert(entry);
787
788 auto p = sender_state->tc->getProcessPtr();
789 sender_state->tlbEntry =
790 new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
791 false, false);
792
793 if (update_stats) {
794 // the reqCnt has an entry per level, so its size tells us
795 // which level we are in
796 sender_state->hitLevel = sender_state->reqCnt.size();
797 globalNumTLBHits += req_cnt;
798 }
799 } else {
800 if (update_stats)
801 globalNumTLBMisses += req_cnt;
802 }
803
804 /*
805 * We now know the TLB lookup outcome (if it's a hit or a miss), as
806 * well as the TLB access latency.
807 *
808 * We create and schedule a new TLBEvent which will help us take the
809 * appropriate actions (e.g., update TLB on a hit, send request to
810 * lower level TLB on a miss, or start a page walk if this was the
811 * last-level TLB)
812 */
813 TLBEvent *tlb_event =
814 new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
815
816 if (translationReturnEvent.count(virt_page_addr)) {
817 panic("Virtual Page Address %#x already has a return event\n",
818 virt_page_addr);
819 }
820
821 translationReturnEvent[virt_page_addr] = tlb_event;
822 assert(tlb_event);
823
824 DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
825 curTick() + cyclesToTicks(Cycles(hitLatency)));
826
827 schedule(tlb_event, curTick() + cyclesToTicks(Cycles(hitLatency)));
828 }
829
830 GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr,
831 tlbOutcome tlb_outcome, PacketPtr _pkt)
832 : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
833 outcome(tlb_outcome), pkt(_pkt)
834 {
835 }
836
837 /**
838  * Do paging protection checks. If we encounter a page fault, we
839  * panic, since faults are not expected at this point.
840 */
841 void
842 GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
843 TlbEntry * tlb_entry, Mode mode)
844 {
845 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
846 uint32_t flags = pkt->req->getFlags();
847 bool storeCheck = flags & (StoreCheck << FlagShift);
848
849 // Do paging protection checks.
850 bool inUser
851 = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
852 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
853
854 bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
855
856 if ((inUser && !tlb_entry->user) ||
857 (mode == BaseTLB::Write && badWrite)) {
858 // The page must have been present to get into the TLB in
859 // the first place. We'll assume the reserved bits are
860 // fine even though we're not checking them.
861 panic("Page fault detected");
862 }
863
864 if (storeCheck && badWrite) {
865 // This would fault if this were a write, so return a page
866 // fault that reflects that happening.
867 panic("Page fault detected");
868 }
869 }
870
871 /**
872  * handleTranslationReturn is called on a TLB hit, when a TLB miss
873  * returns from a lower level, or when a page walk completes. In the
874  * latter two cases it is invoked with TLB_MISS as the tlbOutcome.
875 */
876 void
877 GpuTLB::handleTranslationReturn(Addr virt_page_addr,
878 tlbOutcome tlb_outcome, PacketPtr pkt)
879 {
880 assert(pkt);
881 Addr vaddr = pkt->req->getVaddr();
882
883 TranslationState *sender_state =
884 safe_cast<TranslationState*>(pkt->senderState);
885
886 ThreadContext *tc = sender_state->tc;
887 Mode mode = sender_state->tlbMode;
888
889 TlbEntry *local_entry, *new_entry;
890
891 if (tlb_outcome == TLB_HIT) {
892 DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n",
893 vaddr);
894 local_entry = sender_state->tlbEntry;
895 } else {
896 DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
897 vaddr);
898
899 /**
900 * We are returning either from a page walk or from a hit at a
901 * lower TLB level. The senderState should be "carrying" a pointer
902 * to the correct TLBEntry.
903 */
904 new_entry = sender_state->tlbEntry;
905 assert(new_entry);
906 local_entry = new_entry;
907
908 if (allocationPolicy) {
909 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
910 virt_page_addr);
911
912 local_entry = insert(virt_page_addr, *new_entry);
913 }
914
915 assert(local_entry);
916 }
917
918 /**
919 * At this point the packet carries an up-to-date tlbEntry pointer
920 * in its senderState.
921 * Next step is to do the paging protection checks.
922 */
923 DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
924 "while paddr was %#x.\n", local_entry->vaddr,
925 local_entry->paddr);
926
927 pagingProtectionChecks(tc, pkt, local_entry, mode);
928 int page_size = local_entry->size();
929 Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
930 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
931
932 // Since this packet will be sent through the cpu side slave port,
933 // it must be converted to a response pkt if it is not one already
934 if (pkt->isRequest()) {
935 pkt->makeTimingResponse();
936 }
937
938 pkt->req->setPaddr(paddr);
939
940 if (local_entry->uncacheable) {
941 pkt->req->setFlags(Request::UNCACHEABLE);
942 }
943
944 //send packet back to coalescer
945 cpuSidePort[0]->sendTimingResp(pkt);
946 //schedule cleanup event
947 cleanupQueue.push(virt_page_addr);
948
949 // schedule this only once per cycle.
950 // The check is required because we might have multiple translations
951 // returning the same cycle
952 // this is a maximum priority event and must be on the same cycle
953 // as the cleanup event in TLBCoalescer to avoid a race with
954 // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
955 if (!cleanupEvent.scheduled())
956 schedule(cleanupEvent, curTick());
957 }
958
959 /**
960 * Here we take the appropriate actions based on the result of the
961 * TLB lookup.
962 */
963 void
964 GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
965 PacketPtr pkt)
966 {
967 DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
968
969 assert(translationReturnEvent[virtPageAddr]);
970 assert(pkt);
971
972 TranslationState *tmp_sender_state =
973 safe_cast<TranslationState*>(pkt->senderState);
974
975 int req_cnt = tmp_sender_state->reqCnt.back();
976 bool update_stats = !tmp_sender_state->prefetch;
977
978
979 if (outcome == TLB_HIT) {
980 handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
981
982 if (update_stats) {
983 accessCycles += (req_cnt * curTick());
984 localCycles += curTick();
985 }
986
987 } else if (outcome == TLB_MISS) {
988
989 DPRINTF(GPUTLB, "This is a TLB miss\n");
990 if (update_stats) {
991 accessCycles += (req_cnt*curTick());
992 localCycles += curTick();
993 }
994
995 if (hasMemSidePort) {
996                 // the one cycle added here represents the delay from when
997                 // we get the reply back until we propagate it to the
998                 // coalescer above.
999 if (update_stats) {
1000 accessCycles += (req_cnt * 1);
1001 localCycles += 1;
1002 }
1003
1004 /**
1005 * There is a TLB below. Send the coalesced request.
1006 * We actually send the very first packet of all the
1007 * pending packets for this virtual page address.
1008 */
1009 if (!memSidePort[0]->sendTimingReq(pkt)) {
1010 DPRINTF(GPUTLB, "Failed sending translation request to "
1011 "lower level TLB for addr %#x\n", virtPageAddr);
1012
1013 memSidePort[0]->retries.push_back(pkt);
1014 } else {
1015 DPRINTF(GPUTLB, "Sent translation request to lower level "
1016 "TLB for addr %#x\n", virtPageAddr);
1017 }
1018 } else {
1019 //this is the last level TLB. Start a page walk
1020 DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
1021 "addr %#x\n", virtPageAddr);
1022
1023 if (update_stats)
1024 pageTableCycles -= (req_cnt*curTick());
1025
1026 TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
1027 assert(tlb_event);
1028 tlb_event->updateOutcome(PAGE_WALK);
1029 schedule(tlb_event,
1030 curTick() + cyclesToTicks(Cycles(missLatency2)));
1031 }
1032 } else if (outcome == PAGE_WALK) {
1033 if (update_stats)
1034 pageTableCycles += (req_cnt*curTick());
1035
1036 // Need to access the page table and update the TLB
1037 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1038 virtPageAddr);
1039
1040 TranslationState *sender_state =
1041 safe_cast<TranslationState*>(pkt->senderState);
1042
1043 Process *p = sender_state->tc->getProcessPtr();
1044 Addr vaddr = pkt->req->getVaddr();
1045 #ifndef NDEBUG
1046 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1047 assert(alignedVaddr == virtPageAddr);
1048 #endif
1049 const EmulationPageTable::Entry *pte = p->pTable->lookup(vaddr);
1050 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1051 p->fixupFault(vaddr)) {
1052 pte = p->pTable->lookup(vaddr);
1053 }
1054
1055 if (pte) {
1056 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1057 pte->paddr);
1058
1059 sender_state->tlbEntry =
1060 new TlbEntry(p->pid(), virtPageAddr, pte->paddr, false,
1061 false);
1062 } else {
1063 sender_state->tlbEntry = nullptr;
1064 }
1065
1066 handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1067 } else if (outcome == MISS_RETURN) {
1068 /** we add an extra cycle in the return path of the translation
1069 * requests in between the various TLB levels.
1070 */
1071 handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1072 } else {
1073 panic("Unexpected TLB outcome %d", outcome);
1074 }
1075 }
1076
1077 void
1078 GpuTLB::TLBEvent::process()
1079 {
1080 tlb->translationReturn(virtPageAddr, outcome, pkt);
1081 }
1082
1083 const char*
1084 GpuTLB::TLBEvent::description() const
1085 {
1086 return "trigger translationDoneEvent";
1087 }
1088
1089 void
1090 GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
1091 {
1092 outcome = _outcome;
1093 }
1094
1095 Addr
1096 GpuTLB::TLBEvent::getTLBEventVaddr()
1097 {
1098 return virtPageAddr;
1099 }
1100
1101 /**
1102  * recvTiming receives a coalesced timing request from a TLBCoalescer
1103  * and calls issueTLBLookup().
1104  * It only rejects the packet if we have exceeded the maximum number
1105  * of outstanding requests for the TLB.
1106 */
1107 bool
1108 GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
1109 {
1110 if (tlb->outstandingReqs < tlb->maxCoalescedReqs) {
1111 tlb->issueTLBLookup(pkt);
1112 // update number of outstanding translation requests
1113 tlb->outstandingReqs++;
1114 return true;
1115 } else {
1116 DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
1117 tlb->outstandingReqs);
1118 return false;
1119 }
1120 }
1121
1122 /**
1123 * handleFuncTranslationReturn is called on a TLB hit,
1124 * when a TLB miss returns or when a page fault returns.
1125 * It updates LRU, inserts the TLB entry on a miss
1126 * depending on the allocation policy and does the required
1127 * protection checks. It does NOT create a new packet to
1128 * update the packet's addr; this is done in hsail-gpu code.
1129 */
1130 void
1131 GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
1132 {
1133 TranslationState *sender_state =
1134 safe_cast<TranslationState*>(pkt->senderState);
1135
1136 ThreadContext *tc = sender_state->tc;
1137 Mode mode = sender_state->tlbMode;
1138 Addr vaddr = pkt->req->getVaddr();
1139
1140 TlbEntry *local_entry, *new_entry;
1141
1142 if (tlb_outcome == TLB_HIT) {
1143 DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
1144 "%#x\n", vaddr);
1145
1146 local_entry = sender_state->tlbEntry;
1147 } else {
1148 DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
1149 "%#x\n", vaddr);
1150
1151 /**
1152 * We are returning either from a page walk or from a hit at a
1153 * lower TLB level. The senderState should be "carrying" a pointer
1154 * to the correct TLBEntry.
1155 */
1156 new_entry = sender_state->tlbEntry;
1157 assert(new_entry);
1158 local_entry = new_entry;
1159
1160 if (allocationPolicy) {
1161 Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
1162
1163 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
1164 virt_page_addr);
1165
1166 local_entry = insert(virt_page_addr, *new_entry);
1167 }
1168
1169 assert(local_entry);
1170 }
1171
1172 DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
1173 "while paddr was %#x.\n", local_entry->vaddr,
1174 local_entry->paddr);
1175
1176 /**
1177 * Do paging checks if it's a normal functional access. If it's for a
1178 * prefetch, then sometimes you can try to prefetch something that
1179      * won't pass protection. We don't actually want to fault because there
1180 * is no demand access to deem this a violation. Just put it in the
1181 * TLB and it will fault if indeed a future demand access touches it in
1182 * violation.
1183 *
1184 * This feature could be used to explore security issues around
1185 * speculative memory accesses.
1186 */
1187 if (!sender_state->prefetch && sender_state->tlbEntry)
1188 pagingProtectionChecks(tc, pkt, local_entry, mode);
1189
1190 int page_size = local_entry->size();
1191 Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
1192 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
1193
1194 pkt->req->setPaddr(paddr);
1195
1196 if (local_entry->uncacheable)
1197 pkt->req->setFlags(Request::UNCACHEABLE);
1198 }
1199
1200 // This is used for atomic translations. Need to
1201 // make it all happen during the same cycle.
1202 void
1203 GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
1204 {
1205 TranslationState *sender_state =
1206 safe_cast<TranslationState*>(pkt->senderState);
1207
1208 ThreadContext *tc = sender_state->tc;
1209 bool update_stats = !sender_state->prefetch;
1210
1211 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1212 TheISA::PageBytes);
1213
1214 if (update_stats)
1215 tlb->updatePageFootprint(virt_page_addr);
1216
1217         // do the TLB lookup (local stats are only updated for non-prefetch accesses)
1218 bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
1219 tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
1220
1221 // functional mode means no coalescing
1222 // global metrics are the same as the local metrics
1223 if (update_stats) {
1224 tlb->globalNumTLBAccesses++;
1225
1226 if (success) {
1227 sender_state->hitLevel = sender_state->reqCnt.size();
1228 tlb->globalNumTLBHits++;
1229 }
1230 }
1231
1232 if (!success) {
1233 if (update_stats)
1234 tlb->globalNumTLBMisses++;
1235 if (tlb->hasMemSidePort) {
1236 // there is a TLB below -> propagate down the TLB hierarchy
1237 tlb->memSidePort[0]->sendFunctional(pkt);
1238 // If no valid translation from a prefetch, then just return
1239 if (sender_state->prefetch && !pkt->req->hasPaddr())
1240 return;
1241 } else {
1242 // Need to access the page table and update the TLB
1243 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1244 virt_page_addr);
1245
1246 Process *p = tc->getProcessPtr();
1247
1248 Addr vaddr = pkt->req->getVaddr();
1249 #ifndef NDEBUG
1250 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1251 assert(alignedVaddr == virt_page_addr);
1252 #endif
1253
1254 const EmulationPageTable::Entry *pte =
1255 p->pTable->lookup(vaddr);
1256 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1257 p->fixupFault(vaddr)) {
1258 pte = p->pTable->lookup(vaddr);
1259 }
1260
1261 if (!sender_state->prefetch) {
1262 // no PageFaults are permitted after
1263 // the second page table lookup
1264 assert(pte);
1265
1266 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1267 pte->paddr);
1268
1269 sender_state->tlbEntry =
1270 new TlbEntry(p->pid(), virt_page_addr,
1271 pte->paddr, false, false);
1272 } else {
1273 // If this was a prefetch, then do the normal thing if it
1274 // was a successful translation. Otherwise, send an empty
1275 // TLB entry back so that it can be figured out as empty
1276 // and handled accordingly.
1277 if (pte) {
1278 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1279 pte->paddr);
1280
1281 sender_state->tlbEntry =
1282 new TlbEntry(p->pid(), virt_page_addr,
1283 pte->paddr, false, false);
1284 } else {
1285 DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
1286 alignedVaddr);
1287
1288 sender_state->tlbEntry = nullptr;
1289
1290 return;
1291 }
1292 }
1293 }
1294 } else {
1295 DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
1296 tlb->lookup(pkt->req->getVaddr()));
1297
1298 TlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
1299 update_stats);
1300
1301 assert(entry);
1302
1303 auto p = sender_state->tc->getProcessPtr();
1304 sender_state->tlbEntry =
1305 new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
1306 false, false);
1307 }
1308 // This is the function that would populate pkt->req with the paddr of
1309     // the translation. But if no translation happens (i.e., the prefetch
1310     // fails), then the early returns in the above code will keep this function
1311 // from executing.
1312 tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
1313 }
1314
1315 void
1316 GpuTLB::CpuSidePort::recvReqRetry()
1317 {
1318 // The CPUSidePort never sends anything but replies. No retries
1319 // expected.
1320 panic("recvReqRetry called");
1321 }
1322
1323 AddrRangeList
1324 GpuTLB::CpuSidePort::getAddrRanges() const
1325 {
1326 // currently not checked by the master
1327 AddrRangeList ranges;
1328
1329 return ranges;
1330 }
1331
1332 /**
1333 * MemSidePort receives the packet back.
1334 * We need to call the handleTranslationReturn
1335 * and propagate up the hierarchy.
1336 */
1337 bool
1338 GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
1339 {
1340 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1341 TheISA::PageBytes);
1342
1343 DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
1344 virt_page_addr);
1345
1346 TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
1347 assert(tlb_event);
1348 assert(virt_page_addr == tlb_event->getTLBEventVaddr());
1349
1350 tlb_event->updateOutcome(MISS_RETURN);
1351 tlb->schedule(tlb_event, curTick()+tlb->clockPeriod());
1352
1353 return true;
1354 }
1355
1356 void
1357 GpuTLB::MemSidePort::recvReqRetry()
1358 {
1359 // No retries should reach the TLB. The retries
1360 // should only reach the TLBCoalescer.
1361 panic("recvReqRetry called");
1362 }
1363
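/**
 * cleanup runs as a maximum-priority event at the end of the cycle: it
 * deletes the TLBEvents for translations that completed this cycle,
 * frees their outstanding-request slots, and asks the coalescer(s) above
 * to retry any previously rejected requests.
 */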
1364 void
1365 GpuTLB::cleanup()
1366 {
1367 while (!cleanupQueue.empty()) {
1368 Addr cleanup_addr = cleanupQueue.front();
1369 cleanupQueue.pop();
1370
1371 // delete TLBEvent
1372 TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
1373 delete old_tlb_event;
1374 translationReturnEvent.erase(cleanup_addr);
1375
1376 // update number of outstanding requests
1377 outstandingReqs--;
1378 }
1379
1380 /** the higher level coalescer should retry if it has
1381 * any pending requests.
1382 */
1383 for (int i = 0; i < cpuSidePort.size(); ++i) {
1384 cpuSidePort[i]->sendRetryReq();
1385 }
1386 }
1387
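/**
 * Per-page footprint bookkeeping: insert an AccessInfo record on the
 * first touch of a page (counting it as a unique page), otherwise
 * accumulate its reuse distance, and, when accessDistance profiling is
 * enabled, log the running TLB access count for this page.
 */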
1388 void
1389 GpuTLB::updatePageFootprint(Addr virt_page_addr)
1390 {
1391
1392 std::pair<AccessPatternTable::iterator, bool> ret;
1393
1394 AccessInfo tmp_access_info;
1395 tmp_access_info.lastTimeAccessed = 0;
1396 tmp_access_info.accessesPerPage = 0;
1397 tmp_access_info.totalReuseDistance = 0;
1398 tmp_access_info.sumDistance = 0;
1399 tmp_access_info.meanDistance = 0;
1400
1401 ret = TLBFootprint.insert(
1402 AccessPatternTable::value_type(virt_page_addr, tmp_access_info));
1403
1404 bool first_page_access = ret.second;
1405
1406 if (first_page_access) {
1407 numUniquePages++;
1408 } else {
1409 int accessed_before;
1410 accessed_before = curTick() - ret.first->second.lastTimeAccessed;
1411 ret.first->second.totalReuseDistance += accessed_before;
1412 }
1413
1414 ret.first->second.accessesPerPage++;
1415 ret.first->second.lastTimeAccessed = curTick();
1416
1417 if (accessDistance) {
1418 ret.first->second.localTLBAccesses
1419 .push_back(localNumTLBAccesses.value());
1420 }
1421 }
1422
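/**
 * exitCallback runs when the simulation exits: it computes the average
 * reuse distance over all pages this TLB has touched and, if
 * accessDistance is enabled, writes per-page max/mean/stddev access
 * distances to a CSV file in the simulation output directory.
 */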
1423 void
1424 GpuTLB::exitCallback()
1425 {
1426 std::ostream *page_stat_file = nullptr;
1427
1428 if (accessDistance) {
1429
1430 // print per page statistics to a separate file (.csv format)
1431 // simout is the gem5 output directory (default is m5out or the one
1432         // specified with -d)
1433 page_stat_file = simout.create(name().c_str())->stream();
1434
1435 // print header
1436 *page_stat_file
1437                 << "page,max_access_distance,mean_access_distance,"
1438 << "stddev_distance" << std::endl;
1439 }
1440
1441 // update avg. reuse distance footprint
1442 unsigned int sum_avg_reuse_distance_per_page = 0;
1443
1444 // iterate through all pages seen by this TLB
1445 for (auto &iter : TLBFootprint) {
1446 sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance /
1447 iter.second.accessesPerPage;
1448
1449 if (accessDistance) {
1450 unsigned int tmp = iter.second.localTLBAccesses[0];
1451 unsigned int prev = tmp;
1452
1453 for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
1454 if (i) {
1455 tmp = prev + 1;
1456 }
1457
1458 prev = iter.second.localTLBAccesses[i];
1459 // update the localTLBAccesses value
1460                 // with the actual difference
1461 iter.second.localTLBAccesses[i] -= tmp;
1462 // compute the sum of AccessDistance per page
1463 // used later for mean
1464 iter.second.sumDistance +=
1465 iter.second.localTLBAccesses[i];
1466 }
1467
1468 iter.second.meanDistance =
1469 iter.second.sumDistance / iter.second.accessesPerPage;
1470
1471 // compute std_dev and max (we need a second round because we
1472             // need to know the mean value)
1473 unsigned int max_distance = 0;
1474 unsigned int stddev_distance = 0;
1475
1476 for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {
1477 unsigned int tmp_access_distance =
1478 iter.second.localTLBAccesses[i];
1479
1480 if (tmp_access_distance > max_distance) {
1481 max_distance = tmp_access_distance;
1482 }
1483
1484 unsigned int diff =
1485 tmp_access_distance - iter.second.meanDistance;
1486 stddev_distance += pow(diff, 2);
1487
1488 }
1489
1490 stddev_distance =
1491 sqrt(stddev_distance/iter.second.accessesPerPage);
1492
1493 if (page_stat_file) {
1494 *page_stat_file << std::hex << iter.first << ",";
1495 *page_stat_file << std::dec << max_distance << ",";
1496 *page_stat_file << std::dec << iter.second.meanDistance
1497 << ",";
1498 *page_stat_file << std::dec << stddev_distance;
1499 *page_stat_file << std::endl;
1500 }
1501
1502 // erase the localTLBAccesses array
1503 iter.second.localTLBAccesses.clear();
1504 }
1505 }
1506
1507 if (!TLBFootprint.empty()) {
1508 avgReuseDistance =
1509 sum_avg_reuse_distance_per_page / TLBFootprint.size();
1510 }
1511
1512 //clear the TLBFootprint map
1513 TLBFootprint.clear();
1514 }
1515 } // namespace X86ISA
1516
1517 X86ISA::GpuTLB*
1518 X86GPUTLBParams::create()
1519 {
1520 return new X86ISA::GpuTLB(this);
1521 }
1522