1 /*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 *
33 * Author: Lisa Hsu
34 */
35
36 #include "gpu-compute/gpu_tlb.hh"
37
38 #include <cmath>
39 #include <cstring>
40
41 #include "arch/x86/faults.hh"
42 #include "arch/x86/insts/microldstop.hh"
43 #include "arch/x86/pagetable.hh"
44 #include "arch/x86/pagetable_walker.hh"
45 #include "arch/x86/regs/misc.hh"
46 #include "arch/x86/regs/msr.hh"
47 #include "arch/x86/x86_traits.hh"
48 #include "base/bitfield.hh"
49 #include "base/logging.hh"
50 #include "base/output.hh"
51 #include "base/trace.hh"
52 #include "cpu/base.hh"
53 #include "cpu/thread_context.hh"
54 #include "debug/GPUPrefetch.hh"
55 #include "debug/GPUTLB.hh"
56 #include "mem/packet_access.hh"
57 #include "mem/page_table.hh"
58 #include "mem/request.hh"
59 #include "sim/process.hh"
60 #include "sim/pseudo_inst.hh"
61
62 namespace X86ISA
63 {
64
65 GpuTLB::GpuTLB(const Params *p)
66 : ClockedObject(p), configAddress(0), size(p->size),
67 cleanupEvent([this]{ cleanup(); }, name(), false,
68 Event::Maximum_Pri),
69 exitEvent([this]{ exitCallback(); }, name())
70 {
71 assoc = p->assoc;
72 assert(assoc <= size);
73 numSets = size/assoc;
74 allocationPolicy = p->allocationPolicy;
75 hasMemSidePort = false;
76 accessDistance = p->accessDistance;
77 clock = p->clk_domain->clockPeriod();
78
79 tlb.assign(size, TlbEntry());
80
81 freeList.resize(numSets);
82 entryList.resize(numSets);
83
84 for (int set = 0; set < numSets; ++set) {
85 for (int way = 0; way < assoc; ++way) {
86 int x = set * assoc + way;
87 freeList[set].push_back(&tlb.at(x));
88 }
89 }
90
91 FA = (size == assoc);
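// A single set (size == assoc) makes the TLB fully associative; in
// that case numSets is 1 and the set-index mask computed below is 0.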
92
93 /**
94 * @warning: the set-associative version assumes you have a
95 * fixed page size of 4KB.
96      * If the page size is greater than 4KB (as defined in the
97 * TheISA::PageBytes), then there are various issues w/ the current
98 * implementation (you'd have the same 8KB page being replicated in
99 * different sets etc)
100 */
101 setMask = numSets - 1;
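// Illustrative example (parameter values assumed, not taken from any
// particular config): with size = 64 entries and assoc = 8 there are
// 8 sets, setMask is 0x7, and a 4KB-page virtual address selects set
// (vaddr >> PageShift) & 0x7, i.e. bits 14:12 of the address.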
102
103 maxCoalescedReqs = p->maxOutstandingReqs;
104
105 // Do not allow maxCoalescedReqs to be more than the TLB associativity
106 if (maxCoalescedReqs > assoc) {
107 maxCoalescedReqs = assoc;
108 cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
109 }
110
111 outstandingReqs = 0;
112 hitLatency = p->hitLatency;
113 missLatency1 = p->missLatency1;
114 missLatency2 = p->missLatency2;
115
116 // create the slave ports based on the number of connected ports
117 for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
118 cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
119 name(), i), this, i));
120 }
121
122 // create the master ports based on the number of connected ports
123 for (size_t i = 0; i < p->port_master_connection_count; ++i) {
124 memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
125 name(), i), this, i));
126 }
127 }
128
129 // fixme: this is never called?
130 GpuTLB::~GpuTLB()
131 {
132 // make sure all the hash-maps are empty
133 assert(translationReturnEvent.empty());
134 }
135
136 Port &
137 GpuTLB::getPort(const std::string &if_name, PortID idx)
138 {
139 if (if_name == "slave") {
140 if (idx >= static_cast<PortID>(cpuSidePort.size())) {
141 panic("TLBCoalescer::getPort: unknown index %d\n", idx);
142 }
143
144 return *cpuSidePort[idx];
145 } else if (if_name == "master") {
146 if (idx >= static_cast<PortID>(memSidePort.size())) {
147 panic("TLBCoalescer::getPort: unknown index %d\n", idx);
148 }
149
150 hasMemSidePort = true;
151
152 return *memSidePort[idx];
153 } else {
154 panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
155 }
156 }
157
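/**
 * Insert a translation for vpn into the TLB. A sketch of the policy
 * implemented below: take a free way from the set's freeList if one
 * exists, otherwise evict the least-recently-used entry (the back of
 * entryList); the new entry is placed at the MRU position (front).
 */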
158 TlbEntry*
159 GpuTLB::insert(Addr vpn, TlbEntry &entry)
160 {
161 TlbEntry *newEntry = nullptr;
162
163 /**
164 * vpn holds the virtual page address
165 * The least significant bits are simply masked
166 */
167 int set = (vpn >> TheISA::PageShift) & setMask;
168
169 if (!freeList[set].empty()) {
170 newEntry = freeList[set].front();
171 freeList[set].pop_front();
172 } else {
173 newEntry = entryList[set].back();
174 entryList[set].pop_back();
175 }
176
177 *newEntry = entry;
178 newEntry->vaddr = vpn;
179 entryList[set].push_front(newEntry);
180
181 return newEntry;
182 }
183
184 GpuTLB::EntryList::iterator
185 GpuTLB::lookupIt(Addr va, bool update_lru)
186 {
187 int set = (va >> TheISA::PageShift) & setMask;
188
189 if (FA) {
190 assert(!set);
191 }
192
193 auto entry = entryList[set].begin();
194 for (; entry != entryList[set].end(); ++entry) {
195 int page_size = (*entry)->size();
196
197 if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
198 DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
199 "with size %#x.\n", va, (*entry)->vaddr, page_size);
200
201 if (update_lru) {
202 entryList[set].push_front(*entry);
203 entryList[set].erase(entry);
204 entry = entryList[set].begin();
205 }
206
207 break;
208 }
209 }
210
211 return entry;
212 }
213
214 TlbEntry*
215 GpuTLB::lookup(Addr va, bool update_lru)
216 {
217 int set = (va >> TheISA::PageShift) & setMask;
218
219 auto entry = lookupIt(va, update_lru);
220
221 if (entry == entryList[set].end())
222 return nullptr;
223 else
224 return *entry;
225 }
226
227 void
228 GpuTLB::invalidateAll()
229 {
230 DPRINTF(GPUTLB, "Invalidating all entries.\n");
231
232 for (int i = 0; i < numSets; ++i) {
233 while (!entryList[i].empty()) {
234 TlbEntry *entry = entryList[i].front();
235 entryList[i].pop_front();
236 freeList[i].push_back(entry);
237 }
238 }
239 }
240
241 void
242 GpuTLB::setConfigAddress(uint32_t addr)
243 {
244 configAddress = addr;
245 }
246
247 void
248 GpuTLB::invalidateNonGlobal()
249 {
250 DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
251
252 for (int i = 0; i < numSets; ++i) {
253 for (auto entryIt = entryList[i].begin();
254 entryIt != entryList[i].end();) {
255 if (!(*entryIt)->global) {
256 freeList[i].push_back(*entryIt);
257 entryList[i].erase(entryIt++);
258 } else {
259 ++entryIt;
260 }
261 }
262 }
263 }
264
265 void
266 GpuTLB::demapPage(Addr va, uint64_t asn)
267 {
268
269 int set = (va >> TheISA::PageShift) & setMask;
270 auto entry = lookupIt(va, false);
271
272 if (entry != entryList[set].end()) {
273 freeList[set].push_back(*entry);
274 entryList[set].erase(entry);
275 }
276 }
277
278
279
280 namespace
281 {
282
283 Cycles
284 localMiscRegAccess(bool read, MiscRegIndex regNum,
285 ThreadContext *tc, PacketPtr pkt)
286 {
287 if (read) {
288         RegVal data = htole(tc->readMiscReg(regNum));
289         // Make sure we don't trot off the end of data.
            assert(pkt->getSize() <= sizeof(RegVal));
290         pkt->setData((uint8_t *)&data);
291     } else {
292         RegVal data = htole(tc->readMiscRegNoEffect(regNum));
            // Overlay the bytes carried by the packet onto the current
            // register value, then write the merged value back.
            assert(pkt->getSize() <= sizeof(RegVal));
            pkt->writeData((uint8_t *)&data);
293         tc->setMiscReg(regNum, letoh(data));
294 }
295 return Cycles(1);
296 }
297
298 } // anonymous namespace
299
300 Fault
301 GpuTLB::translateInt(bool read, const RequestPtr &req, ThreadContext *tc)
302 {
303 DPRINTF(GPUTLB, "Addresses references internal memory.\n");
304 Addr vaddr = req->getVaddr();
305 Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
306
307 if (prefix == IntAddrPrefixCPUID) {
308 panic("CPUID memory space not yet implemented!\n");
309 } else if (prefix == IntAddrPrefixMSR) {
310 vaddr = (vaddr >> 3) & ~IntAddrPrefixMask;
311
312 MiscRegIndex regNum;
313 if (!msrAddrToIndex(regNum, vaddr))
314 return std::make_shared<GeneralProtection>(0);
315
316 req->setLocalAccessor(
317 [read,regNum,vaddr](ThreadContext *tc, PacketPtr pkt)
318 {
319 return localMiscRegAccess(read, regNum, tc, pkt);
320 }
321 );
322
323 return NoFault;
324 } else if (prefix == IntAddrPrefixIO) {
325 // TODO If CPL > IOPL or in virtual mode, check the I/O permission
326 // bitmap in the TSS.
327
328 Addr IOPort = vaddr & ~IntAddrPrefixMask;
329 // Make sure the address fits in the expected 16 bit IO address
330 // space.
331 assert(!(IOPort & ~0xFFFF));
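// Port 0xCF8 is the PCI configuration-address register and ports
// 0xCFC-0xCFF form the configuration-data window, so dword accesses
// to 0xCF8 and any access in the 0xCFC window are routed to the PCI
// config mechanism below; all other ports map to regular I/O space.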
332 if (IOPort == 0xCF8 && req->getSize() == 4) {
333 req->setLocalAccessor(
334 [read](ThreadContext *tc, PacketPtr pkt)
335 {
336 return localMiscRegAccess(
337 read, MISCREG_PCI_CONFIG_ADDRESS, tc, pkt);
338 }
339 );
340 } else if ((IOPort & ~mask(2)) == 0xCFC) {
341 req->setFlags(Request::UNCACHEABLE | Request::STRICT_ORDER);
342 Addr configAddress =
343 tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
344 if (bits(configAddress, 31, 31)) {
345 req->setPaddr(PhysAddrPrefixPciConfig |
346 mbits(configAddress, 30, 2) |
347 (IOPort & mask(2)));
348 } else {
349 req->setPaddr(PhysAddrPrefixIO | IOPort);
350 }
351 } else {
352 req->setFlags(Request::UNCACHEABLE | Request::STRICT_ORDER);
353 req->setPaddr(PhysAddrPrefixIO | IOPort);
354 }
355 return NoFault;
356 } else {
357 panic("Access to unrecognized internal address space %#x.\n",
358 prefix);
359 }
360 }
361
362 /**
363  * tlbLookup only performs a TLB lookup, returning true on a TLB hit
364  * and false on a TLB miss.
365 * Many of the checks about different modes have been converted to
366 * assertions, since these parts of the code are not really used.
367 * On a hit it will update the LRU stack.
368 */
369 bool
370 GpuTLB::tlbLookup(const RequestPtr &req,
371 ThreadContext *tc, bool update_stats)
372 {
373 bool tlb_hit = false;
374 #ifndef NDEBUG
375 uint32_t flags = req->getFlags();
376 int seg = flags & SegmentFlagMask;
377 #endif
378
379 assert(seg != SEGMENT_REG_MS);
380 Addr vaddr = req->getVaddr();
381 DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
382 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
383
384 if (m5Reg.prot) {
385 DPRINTF(GPUTLB, "In protected mode.\n");
386 // make sure we are in 64-bit mode
387 assert(m5Reg.mode == LongMode);
388
389 // If paging is enabled, do the translation.
390 if (m5Reg.paging) {
391 DPRINTF(GPUTLB, "Paging enabled.\n");
392 //update LRU stack on a hit
393 TlbEntry *entry = lookup(vaddr, true);
394
395 if (entry)
396 tlb_hit = true;
397
398 if (!update_stats) {
399 // functional tlb access for memory initialization
400 // i.e., memory seeding or instr. seeding -> don't update
401 // TLB and stats
402 return tlb_hit;
403 }
404
405 localNumTLBAccesses++;
406
407 if (!entry) {
408 localNumTLBMisses++;
409 } else {
410 localNumTLBHits++;
411 }
412 }
413 }
414
415 return tlb_hit;
416 }
417
418 Fault
419 GpuTLB::translate(const RequestPtr &req, ThreadContext *tc,
420 Translation *translation, Mode mode,
421 bool &delayedResponse, bool timing, int &latency)
422 {
423 uint32_t flags = req->getFlags();
424 int seg = flags & SegmentFlagMask;
425 bool storeCheck = flags & (StoreCheck << FlagShift);
426
427 // If this is true, we're dealing with a request
428 // to a non-memory address space.
429 if (seg == SEGMENT_REG_MS) {
430 return translateInt(mode == Mode::Read, req, tc);
431 }
432
433 delayedResponse = false;
434 Addr vaddr = req->getVaddr();
435 DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
436
437 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
438
439 // If protected mode has been enabled...
440 if (m5Reg.prot) {
441 DPRINTF(GPUTLB, "In protected mode.\n");
442 // If we're not in 64-bit mode, do protection/limit checks
443 if (m5Reg.mode != LongMode) {
444 DPRINTF(GPUTLB, "Not in long mode. Checking segment "
445 "protection.\n");
446
447 // Check for a null segment selector.
448 if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
449 seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
450 && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
451 return std::make_shared<GeneralProtection>(0);
452 }
453
454 bool expandDown = false;
455 SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
456
457 if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
458 if (!attr.writable && (mode == BaseTLB::Write ||
459 storeCheck))
460 return std::make_shared<GeneralProtection>(0);
461
462 if (!attr.readable && mode == BaseTLB::Read)
463 return std::make_shared<GeneralProtection>(0);
464
465 expandDown = attr.expandDown;
466
467 }
468
469 Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
470 Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
471 // This assumes we're not in 64 bit mode. If we were, the
472 // default address size is 64 bits, overridable to 32.
473 int size = 32;
474 bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
475 SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
476
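// A sketch of the effective address-size selection outside 64-bit
// mode: the size drops to 16 bits either when a 32-bit default
// (csAttr.defaultSize set) is toggled by the address-size override
// prefix, or when a 16-bit default is left unoverridden.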
477 if ((csAttr.defaultSize && sizeOverride) ||
478 (!csAttr.defaultSize && !sizeOverride)) {
479 size = 16;
480 }
481
482 Addr offset = bits(vaddr - base, size - 1, 0);
483 Addr endOffset = offset + req->getSize() - 1;
484
485 if (expandDown) {
486 DPRINTF(GPUTLB, "Checking an expand down segment.\n");
487 warn_once("Expand down segments are untested.\n");
488
489 if (offset <= limit || endOffset <= limit)
490 return std::make_shared<GeneralProtection>(0);
491 } else {
492 if (offset > limit || endOffset > limit)
493 return std::make_shared<GeneralProtection>(0);
494 }
495 }
496
497 // If paging is enabled, do the translation.
498 if (m5Reg.paging) {
499 DPRINTF(GPUTLB, "Paging enabled.\n");
500 // The vaddr already has the segment base applied.
501 TlbEntry *entry = lookup(vaddr);
502 localNumTLBAccesses++;
503
504 if (!entry) {
505 localNumTLBMisses++;
506 if (timing) {
507 latency = missLatency1;
508 }
509
510 if (FullSystem) {
511 fatal("GpuTLB doesn't support full-system mode\n");
512 } else {
513 DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
514 "at pc %#x.\n", vaddr, tc->instAddr());
515
516 Process *p = tc->getProcessPtr();
517 const EmulationPageTable::Entry *pte =
518 p->pTable->lookup(vaddr);
519
520 if (!pte && mode != BaseTLB::Execute) {
521 // penalize a "page fault" more
522 if (timing)
523 latency += missLatency2;
524
525 if (p->fixupFault(vaddr))
526 pte = p->pTable->lookup(vaddr);
527 }
528
529 if (!pte) {
530 return std::make_shared<PageFault>(vaddr, true,
531 mode, true,
532 false);
533 } else {
534 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
535
536 DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
537 alignedVaddr, pte->paddr);
538
539 TlbEntry gpuEntry(p->pid(), alignedVaddr,
540 pte->paddr, false, false);
541 entry = insert(alignedVaddr, gpuEntry);
542 }
543
544 DPRINTF(GPUTLB, "Miss was serviced.\n");
545 }
546 } else {
547 localNumTLBHits++;
548
549 if (timing) {
550 latency = hitLatency;
551 }
552 }
553
554 // Do paging protection checks.
555 bool inUser = (m5Reg.cpl == 3 &&
556 !(flags & (CPL0FlagBit << FlagShift)));
557
558 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
559 bool badWrite = (!entry->writable && (inUser || cr0.wp));
560
561 if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
562 badWrite)) {
563 // The page must have been present to get into the TLB in
564 // the first place. We'll assume the reserved bits are
565 // fine even though we're not checking them.
566 return std::make_shared<PageFault>(vaddr, true, mode,
567 inUser, false);
568 }
569
570 if (storeCheck && badWrite) {
571 // This would fault if this were a write, so return a page
572 // fault that reflects that happening.
573 return std::make_shared<PageFault>(vaddr, true,
574 BaseTLB::Write,
575 inUser, false);
576 }
577
578
579 DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
580 "checks.\n", entry->paddr);
581
582 int page_size = entry->size();
583 Addr paddr = entry->paddr | (vaddr & (page_size - 1));
584 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
585 req->setPaddr(paddr);
586
587 if (entry->uncacheable)
588 req->setFlags(Request::UNCACHEABLE);
589 } else {
590 //Use the address which already has segmentation applied.
591 DPRINTF(GPUTLB, "Paging disabled.\n");
592 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
593 req->setPaddr(vaddr);
594 }
595 } else {
596 // Real mode
597 DPRINTF(GPUTLB, "In real mode.\n");
598 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
599 req->setPaddr(vaddr);
600 }
601
602 // Check for an access to the local APIC
603 if (FullSystem) {
604 LocalApicBase localApicBase =
605 tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
606
607 Addr baseAddr = localApicBase.base * PageBytes;
608 Addr paddr = req->getPaddr();
609
610 if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
611 // Force the access to be uncacheable.
612 req->setFlags(Request::UNCACHEABLE);
613 req->setPaddr(x86LocalAPICAddress(tc->contextId(),
614 paddr - baseAddr));
615 }
616 }
617
618 return NoFault;
619 }
620
621 Fault
622 GpuTLB::translateAtomic(const RequestPtr &req, ThreadContext *tc,
623 Mode mode, int &latency)
624 {
625 bool delayedResponse;
626
627 return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
628 latency);
629 }
630
631 void
632 GpuTLB::translateTiming(const RequestPtr &req, ThreadContext *tc,
633 Translation *translation, Mode mode, int &latency)
634 {
635 bool delayedResponse;
636 assert(translation);
637
638 Fault fault = GpuTLB::translate(req, tc, translation, mode,
639 delayedResponse, true, latency);
640
641 if (!delayedResponse)
642 translation->finish(fault, req, tc, mode);
643 }
644
645 Walker*
646 GpuTLB::getWalker()
647 {
648 return walker;
649 }
650
651
652 void
653 GpuTLB::serialize(CheckpointOut &cp) const
654 {
655 }
656
657 void
658 GpuTLB::unserialize(CheckpointIn &cp)
659 {
660 }
661
662 void
663 GpuTLB::regStats()
664 {
665 ClockedObject::regStats();
666
667 localNumTLBAccesses
668 .name(name() + ".local_TLB_accesses")
669 .desc("Number of TLB accesses")
670 ;
671
672 localNumTLBHits
673 .name(name() + ".local_TLB_hits")
674 .desc("Number of TLB hits")
675 ;
676
677 localNumTLBMisses
678 .name(name() + ".local_TLB_misses")
679 .desc("Number of TLB misses")
680 ;
681
682 localTLBMissRate
683 .name(name() + ".local_TLB_miss_rate")
684 .desc("TLB miss rate")
685 ;
686
687 accessCycles
688 .name(name() + ".access_cycles")
689 .desc("Cycles spent accessing this TLB level")
690 ;
691
692 pageTableCycles
693 .name(name() + ".page_table_cycles")
694 .desc("Cycles spent accessing the page table")
695 ;
696
697 localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
698
699 numUniquePages
700 .name(name() + ".unique_pages")
701 .desc("Number of unique pages touched")
702 ;
703
704 localCycles
705 .name(name() + ".local_cycles")
706 .desc("Number of cycles spent in queue for all incoming reqs")
707 ;
708
709 localLatency
710 .name(name() + ".local_latency")
711 .desc("Avg. latency over incoming coalesced reqs")
712 ;
713
714 localLatency = localCycles / localNumTLBAccesses;
715
716 globalNumTLBAccesses
717 .name(name() + ".global_TLB_accesses")
718 .desc("Number of TLB accesses")
719 ;
720
721 globalNumTLBHits
722 .name(name() + ".global_TLB_hits")
723 .desc("Number of TLB hits")
724 ;
725
726 globalNumTLBMisses
727 .name(name() + ".global_TLB_misses")
728 .desc("Number of TLB misses")
729 ;
730
731 globalTLBMissRate
732 .name(name() + ".global_TLB_miss_rate")
733 .desc("TLB miss rate")
734 ;
735
736 globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
737
738 avgReuseDistance
739 .name(name() + ".avg_reuse_distance")
740 .desc("avg. reuse distance over all pages (in ticks)")
741 ;
742
743 }
744
745 /**
746 * Do the TLB lookup for this coalesced request and schedule
747 * another event <TLB access latency> cycles later.
748 */
749
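/*
 * A sketch of the timing path implemented below (latencies come from
 * the configured hitLatency/missLatency parameters): issueTLBLookup()
 * probes the TLB and schedules a TLBEvent hitLatency cycles later;
 * TLBEvent::process() calls translationReturn(), which on a hit
 * replies to the coalescer, and on a miss either forwards the packet
 * to a lower-level TLB or reschedules itself as a PAGE_WALK after
 * missLatency2 cycles; handleTranslationReturn() finally sends the
 * response back through cpuSidePort[0] and queues the cleanup event.
 */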
750 void
751 GpuTLB::issueTLBLookup(PacketPtr pkt)
752 {
753 assert(pkt);
754 assert(pkt->senderState);
755
756 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
757 TheISA::PageBytes);
758
759 TranslationState *sender_state =
760 safe_cast<TranslationState*>(pkt->senderState);
761
762 bool update_stats = !sender_state->prefetch;
763 ThreadContext * tmp_tc = sender_state->tc;
764
765 DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
766 virt_page_addr);
767
768 int req_cnt = sender_state->reqCnt.back();
769
770 if (update_stats) {
771 accessCycles -= (curTick() * req_cnt);
772 localCycles -= curTick();
773 updatePageFootprint(virt_page_addr);
774 globalNumTLBAccesses += req_cnt;
775 }
776
777 tlbOutcome lookup_outcome = TLB_MISS;
778 const RequestPtr &tmp_req = pkt->req;
779
780 // Access the TLB and figure out if it's a hit or a miss.
781 bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
782
783 if (success) {
784 lookup_outcome = TLB_HIT;
785 // Put the entry in SenderState
786 TlbEntry *entry = lookup(tmp_req->getVaddr(), false);
787 assert(entry);
788
789 auto p = sender_state->tc->getProcessPtr();
790 sender_state->tlbEntry =
791 new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
792 false, false);
793
794 if (update_stats) {
795 // the reqCnt has an entry per level, so its size tells us
796 // which level we are in
797 sender_state->hitLevel = sender_state->reqCnt.size();
798 globalNumTLBHits += req_cnt;
799 }
800 } else {
801 if (update_stats)
802 globalNumTLBMisses += req_cnt;
803 }
804
805 /*
806 * We now know the TLB lookup outcome (if it's a hit or a miss), as well
807 * as the TLB access latency.
808 *
809 * We create and schedule a new TLBEvent which will help us take the
810 * appropriate actions (e.g., update TLB on a hit, send request to lower
811 * level TLB on a miss, or start a page walk if this was the last-level
812 * TLB)
813 */
814 TLBEvent *tlb_event =
815 new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
816
817 if (translationReturnEvent.count(virt_page_addr)) {
818 panic("Virtual Page Address %#x already has a return event\n",
819 virt_page_addr);
820 }
821
822 translationReturnEvent[virt_page_addr] = tlb_event;
823 assert(tlb_event);
824
825 DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
826 curTick() + this->ticks(hitLatency));
827
828 schedule(tlb_event, curTick() + this->ticks(hitLatency));
829 }
830
831 GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
832 PacketPtr _pkt)
833 : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
834 outcome(tlb_outcome), pkt(_pkt)
835 {
836 }
837
838 /**
839  * Do paging protection checks. If we encounter a page fault, then
840  * a panic is triggered.
841 */
842 void
843 GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
844 TlbEntry * tlb_entry, Mode mode)
845 {
846 HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
847 uint32_t flags = pkt->req->getFlags();
848 bool storeCheck = flags & (StoreCheck << FlagShift);
849
850 // Do paging protection checks.
851 bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
852 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
853
854 bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
855
856 if ((inUser && !tlb_entry->user) ||
857 (mode == BaseTLB::Write && badWrite)) {
858 // The page must have been present to get into the TLB in
859 // the first place. We'll assume the reserved bits are
860 // fine even though we're not checking them.
861 panic("Page fault detected");
862 }
863
864 if (storeCheck && badWrite) {
865 // This would fault if this were a write, so return a page
866 // fault that reflects that happening.
867 panic("Page fault detected");
868 }
869 }
870
871 /**
872 * handleTranslationReturn is called on a TLB hit,
873 * when a TLB miss returns or when a page fault returns.
874  * In the latter case it is invoked with TLB_MISS as the tlbOutcome.
875 */
876 void
877 GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
878 PacketPtr pkt)
879 {
880
881 assert(pkt);
882 Addr vaddr = pkt->req->getVaddr();
883
884 TranslationState *sender_state =
885 safe_cast<TranslationState*>(pkt->senderState);
886
887 ThreadContext *tc = sender_state->tc;
888 Mode mode = sender_state->tlbMode;
889
890 TlbEntry *local_entry, *new_entry;
891
892 if (tlb_outcome == TLB_HIT) {
893 DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
894 local_entry = sender_state->tlbEntry;
895 } else {
896 DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
897 vaddr);
898
899 // We are returning either from a page walk or from a hit at a lower
900 // TLB level. The senderState should be "carrying" a pointer to the
901 // correct TLBEntry.
902 new_entry = sender_state->tlbEntry;
903 assert(new_entry);
904 local_entry = new_entry;
905
906 if (allocationPolicy) {
907 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
908 virt_page_addr);
909
910 local_entry = insert(virt_page_addr, *new_entry);
911 }
912
913 assert(local_entry);
914 }
915
916 /**
917 * At this point the packet carries an up-to-date tlbEntry pointer
918 * in its senderState.
919 * Next step is to do the paging protection checks.
920 */
921 DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
922 "while paddr was %#x.\n", local_entry->vaddr,
923 local_entry->paddr);
924
925 pagingProtectionChecks(tc, pkt, local_entry, mode);
926 int page_size = local_entry->size();
927 Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
928 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
929
930 // Since this packet will be sent through the cpu side slave port,
931 // it must be converted to a response pkt if it is not one already
932 if (pkt->isRequest()) {
933 pkt->makeTimingResponse();
934 }
935
936 pkt->req->setPaddr(paddr);
937
938 if (local_entry->uncacheable) {
939 pkt->req->setFlags(Request::UNCACHEABLE);
940 }
941
942 //send packet back to coalescer
943 cpuSidePort[0]->sendTimingResp(pkt);
944 //schedule cleanup event
945 cleanupQueue.push(virt_page_addr);
946
947 // schedule this only once per cycle.
948 // The check is required because we might have multiple translations
949     // returning in the same cycle.
950     // This is a maximum priority event and must be on the same cycle
951 // as the cleanup event in TLBCoalescer to avoid a race with
952 // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
953 if (!cleanupEvent.scheduled())
954 schedule(cleanupEvent, curTick());
955 }
956
957 /**
958 * Here we take the appropriate actions based on the result of the
959 * TLB lookup.
960 */
961 void
962 GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
963 PacketPtr pkt)
964 {
965 DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
966
967 assert(translationReturnEvent[virtPageAddr]);
968 assert(pkt);
969
970 TranslationState *tmp_sender_state =
971 safe_cast<TranslationState*>(pkt->senderState);
972
973 int req_cnt = tmp_sender_state->reqCnt.back();
974 bool update_stats = !tmp_sender_state->prefetch;
975
976
977 if (outcome == TLB_HIT) {
978 handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
979
980 if (update_stats) {
981 accessCycles += (req_cnt * curTick());
982 localCycles += curTick();
983 }
984
985 } else if (outcome == TLB_MISS) {
986
987 DPRINTF(GPUTLB, "This is a TLB miss\n");
988 if (update_stats) {
989 accessCycles += (req_cnt*curTick());
990 localCycles += curTick();
991 }
992
993 if (hasMemSidePort) {
994         // The one cycle added here represents the delay from when we get
995         // the reply back until we propagate it to the coalescer
996 // above.
997 if (update_stats) {
998 accessCycles += (req_cnt * 1);
999 localCycles += 1;
1000 }
1001
1002 /**
1003 * There is a TLB below. Send the coalesced request.
1004 * We actually send the very first packet of all the
1005 * pending packets for this virtual page address.
1006 */
1007 if (!memSidePort[0]->sendTimingReq(pkt)) {
1008 DPRINTF(GPUTLB, "Failed sending translation request to "
1009 "lower level TLB for addr %#x\n", virtPageAddr);
1010
1011 memSidePort[0]->retries.push_back(pkt);
1012 } else {
1013 DPRINTF(GPUTLB, "Sent translation request to lower level "
1014 "TLB for addr %#x\n", virtPageAddr);
1015 }
1016 } else {
1017 //this is the last level TLB. Start a page walk
1018 DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
1019 "addr %#x\n", virtPageAddr);
1020
1021 if (update_stats)
1022 pageTableCycles -= (req_cnt*curTick());
1023
1024 TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
1025 assert(tlb_event);
1026 tlb_event->updateOutcome(PAGE_WALK);
1027 schedule(tlb_event, curTick() + ticks(missLatency2));
1028 }
1029 } else if (outcome == PAGE_WALK) {
1030 if (update_stats)
1031 pageTableCycles += (req_cnt*curTick());
1032
1033 // Need to access the page table and update the TLB
1034 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1035 virtPageAddr);
1036
1037 TranslationState *sender_state =
1038 safe_cast<TranslationState*>(pkt->senderState);
1039
1040 Process *p = sender_state->tc->getProcessPtr();
1041 Addr vaddr = pkt->req->getVaddr();
1042 #ifndef NDEBUG
1043 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1044 assert(alignedVaddr == virtPageAddr);
1045 #endif
1046 const EmulationPageTable::Entry *pte = p->pTable->lookup(vaddr);
1047 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1048 p->fixupFault(vaddr)) {
1049 pte = p->pTable->lookup(vaddr);
1050 }
1051
1052 if (pte) {
1053 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1054 pte->paddr);
1055
1056 sender_state->tlbEntry =
1057 new TlbEntry(p->pid(), virtPageAddr, pte->paddr, false,
1058 false);
1059 } else {
1060 sender_state->tlbEntry = nullptr;
1061 }
1062
1063 handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1064 } else if (outcome == MISS_RETURN) {
1065 /** we add an extra cycle in the return path of the translation
1066 * requests in between the various TLB levels.
1067 */
1068 handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1069 } else {
1070 panic("Unexpected TLB outcome %d", outcome);
1071 }
1072 }
1073
1074 void
1075 GpuTLB::TLBEvent::process()
1076 {
1077 tlb->translationReturn(virtPageAddr, outcome, pkt);
1078 }
1079
1080 const char*
1081 GpuTLB::TLBEvent::description() const
1082 {
1083 return "trigger translationDoneEvent";
1084 }
1085
1086 void
1087 GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
1088 {
1089 outcome = _outcome;
1090 }
1091
1092 Addr
1093 GpuTLB::TLBEvent::getTLBEventVaddr()
1094 {
1095 return virtPageAddr;
1096 }
1097
1098 /*
1099  * recvTimingReq receives a coalesced timing request from a TLBCoalescer
1100  * and calls issueTLBLookup().
1101  * It only rejects the packet if we have exceeded the maximum number
1102  * of outstanding requests for the TLB.
1103 */
1104 bool
1105 GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
1106 {
1107 if (tlb->outstandingReqs < tlb->maxCoalescedReqs) {
1108 tlb->issueTLBLookup(pkt);
1109 // update number of outstanding translation requests
1110 tlb->outstandingReqs++;
1111 return true;
1112 } else {
1113 DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
1114 tlb->outstandingReqs);
1115 return false;
1116 }
1117 }
1118
1119 /**
1120 * handleFuncTranslationReturn is called on a TLB hit,
1121 * when a TLB miss returns or when a page fault returns.
1122 * It updates LRU, inserts the TLB entry on a miss
1123 * depending on the allocation policy and does the required
1124 * protection checks. It does NOT create a new packet to
1125 * update the packet's addr; this is done in hsail-gpu code.
1126 */
1127 void
1128 GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
1129 {
1130 TranslationState *sender_state =
1131 safe_cast<TranslationState*>(pkt->senderState);
1132
1133 ThreadContext *tc = sender_state->tc;
1134 Mode mode = sender_state->tlbMode;
1135 Addr vaddr = pkt->req->getVaddr();
1136
1137 TlbEntry *local_entry, *new_entry;
1138
1139 if (tlb_outcome == TLB_HIT) {
1140 DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
1141 "%#x\n", vaddr);
1142
1143 local_entry = sender_state->tlbEntry;
1144 } else {
1145 DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
1146 "%#x\n", vaddr);
1147
1148 // We are returning either from a page walk or from a hit at a lower
1149 // TLB level. The senderState should be "carrying" a pointer to the
1150 // correct TLBEntry.
1151 new_entry = sender_state->tlbEntry;
1152 assert(new_entry);
1153 local_entry = new_entry;
1154
1155 if (allocationPolicy) {
1156 Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
1157
1158 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
1159 virt_page_addr);
1160
1161 local_entry = insert(virt_page_addr, *new_entry);
1162 }
1163
1164 assert(local_entry);
1165 }
1166
1167 DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
1168 "while paddr was %#x.\n", local_entry->vaddr,
1169 local_entry->paddr);
1170
1171 /**
1172 * Do paging checks if it's a normal functional access. If it's for a
1173 * prefetch, then sometimes you can try to prefetch something that
1174  * won't pass protection. We don't actually want to fault because there
1175 * is no demand access to deem this a violation. Just put it in the
1176 * TLB and it will fault if indeed a future demand access touches it in
1177 * violation.
1178 *
1179 * This feature could be used to explore security issues around
1180 * speculative memory accesses.
1181 */
1182 if (!sender_state->prefetch && sender_state->tlbEntry)
1183 pagingProtectionChecks(tc, pkt, local_entry, mode);
1184
1185 int page_size = local_entry->size();
1186 Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
1187 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
1188
1189 pkt->req->setPaddr(paddr);
1190
1191 if (local_entry->uncacheable)
1192 pkt->req->setFlags(Request::UNCACHEABLE);
1193 }
1194
1195 // This is used for atomic translations. Need to
1196 // make it all happen during the same cycle.
1197 void
1198 GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
1199 {
1200 TranslationState *sender_state =
1201 safe_cast<TranslationState*>(pkt->senderState);
1202
1203 ThreadContext *tc = sender_state->tc;
1204 bool update_stats = !sender_state->prefetch;
1205
1206 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1207 TheISA::PageBytes);
1208
1209 if (update_stats)
1210 tlb->updatePageFootprint(virt_page_addr);
1211
1212     // do the TLB lookup; stats are only updated for non-prefetch accesses
1213 bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
1214 tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
1215
1216 // functional mode means no coalescing
1217 // global metrics are the same as the local metrics
1218 if (update_stats) {
1219 tlb->globalNumTLBAccesses++;
1220
1221 if (success) {
1222 sender_state->hitLevel = sender_state->reqCnt.size();
1223 tlb->globalNumTLBHits++;
1224 }
1225 }
1226
1227 if (!success) {
1228 if (update_stats)
1229 tlb->globalNumTLBMisses++;
1230 if (tlb->hasMemSidePort) {
1231 // there is a TLB below -> propagate down the TLB hierarchy
1232 tlb->memSidePort[0]->sendFunctional(pkt);
1233 // If no valid translation from a prefetch, then just return
1234 if (sender_state->prefetch && !pkt->req->hasPaddr())
1235 return;
1236 } else {
1237 // Need to access the page table and update the TLB
1238 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1239 virt_page_addr);
1240
1241 Process *p = tc->getProcessPtr();
1242
1243 Addr vaddr = pkt->req->getVaddr();
1244 #ifndef NDEBUG
1245 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1246 assert(alignedVaddr == virt_page_addr);
1247 #endif
1248
1249 const EmulationPageTable::Entry *pte =
1250 p->pTable->lookup(vaddr);
1251 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1252 p->fixupFault(vaddr)) {
1253 pte = p->pTable->lookup(vaddr);
1254 }
1255
1256 if (!sender_state->prefetch) {
1257 // no PageFaults are permitted after
1258 // the second page table lookup
1259 assert(pte);
1260
1261 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1262 pte->paddr);
1263
1264 sender_state->tlbEntry =
1265 new TlbEntry(p->pid(), virt_page_addr,
1266 pte->paddr, false, false);
1267 } else {
1268 // If this was a prefetch, then do the normal thing if it
1269 // was a successful translation. Otherwise, send an empty
1270 // TLB entry back so that it can be figured out as empty and
1271 // handled accordingly.
1272 if (pte) {
1273 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1274 pte->paddr);
1275
1276 sender_state->tlbEntry =
1277 new TlbEntry(p->pid(), virt_page_addr,
1278 pte->paddr, false, false);
1279 } else {
1280 DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
1281 alignedVaddr);
1282
1283 sender_state->tlbEntry = nullptr;
1284
1285 return;
1286 }
1287 }
1288 }
1289 } else {
1290 DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
1291 tlb->lookup(pkt->req->getVaddr()));
1292
1293 TlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
1294 update_stats);
1295
1296 assert(entry);
1297
1298 auto p = sender_state->tc->getProcessPtr();
1299 sender_state->tlbEntry =
1300 new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
1301 false, false);
1302 }
1303 // This is the function that would populate pkt->req with the paddr of
1304     // the translation. But if no translation happens (i.e., a prefetch fails),
1305     // then the early returns in the above code will keep this function
1306 // from executing.
1307 tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
1308 }
1309
1310 void
1311 GpuTLB::CpuSidePort::recvReqRetry()
1312 {
1313 // The CPUSidePort never sends anything but replies. No retries
1314 // expected.
1315 panic("recvReqRetry called");
1316 }
1317
1318 AddrRangeList
1319 GpuTLB::CpuSidePort::getAddrRanges() const
1320 {
1321 // currently not checked by the master
1322 AddrRangeList ranges;
1323
1324 return ranges;
1325 }
1326
1327 /**
1328 * MemSidePort receives the packet back.
1329  * We need to call handleTranslationReturn
1330 * and propagate up the hierarchy.
1331 */
1332 bool
1333 GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
1334 {
1335 Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1336 TheISA::PageBytes);
1337
1338 DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
1339 virt_page_addr);
1340
1341 TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
1342 assert(tlb_event);
1343 assert(virt_page_addr == tlb_event->getTLBEventVaddr());
1344
1345 tlb_event->updateOutcome(MISS_RETURN);
1346 tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
1347
1348 return true;
1349 }
1350
1351 void
1352 GpuTLB::MemSidePort::recvReqRetry()
1353 {
1354 // No retries should reach the TLB. The retries
1355 // should only reach the TLBCoalescer.
1356 panic("recvReqRetry called");
1357 }
1358
1359 void
1360 GpuTLB::cleanup()
1361 {
1362 while (!cleanupQueue.empty()) {
1363 Addr cleanup_addr = cleanupQueue.front();
1364 cleanupQueue.pop();
1365
1366 // delete TLBEvent
1367 TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
1368 delete old_tlb_event;
1369 translationReturnEvent.erase(cleanup_addr);
1370
1371 // update number of outstanding requests
1372 outstandingReqs--;
1373 }
1374
1375 /** the higher level coalescer should retry if it has
1376 * any pending requests.
1377 */
1378 for (int i = 0; i < cpuSidePort.size(); ++i) {
1379 cpuSidePort[i]->sendRetryReq();
1380 }
1381 }
1382
1383 void
1384 GpuTLB::updatePageFootprint(Addr virt_page_addr)
1385 {
1386
1387 std::pair<AccessPatternTable::iterator, bool> ret;
1388
1389 AccessInfo tmp_access_info;
1390 tmp_access_info.lastTimeAccessed = 0;
1391 tmp_access_info.accessesPerPage = 0;
1392 tmp_access_info.totalReuseDistance = 0;
1393 tmp_access_info.sumDistance = 0;
1394 tmp_access_info.meanDistance = 0;
1395
1396 ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
1397 tmp_access_info));
1398
1399 bool first_page_access = ret.second;
1400
1401 if (first_page_access) {
1402 numUniquePages++;
1403 } else {
1404 int accessed_before;
1405 accessed_before = curTick() - ret.first->second.lastTimeAccessed;
1406 ret.first->second.totalReuseDistance += accessed_before;
1407 }
1408
1409 ret.first->second.accessesPerPage++;
1410 ret.first->second.lastTimeAccessed = curTick();
1411
1412 if (accessDistance) {
1413 ret.first->second.localTLBAccesses
1414 .push_back(localNumTLBAccesses.value());
1415 }
1416 }
1417
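/*
 * Reuse-distance bookkeeping, summarized from updatePageFootprint()
 * above and exitCallback() below: totalReuseDistance accumulates the
 * ticks elapsed between successive accesses to the same virtual page,
 * and at exit avgReuseDistance is the mean over all touched pages of
 * totalReuseDistance / accessesPerPage. When accessDistance is
 * enabled, localTLBAccesses also records the running TLB access count
 * at each touch so per-page access distances can be derived.
 */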
1418 void
1419 GpuTLB::exitCallback()
1420 {
1421 std::ostream *page_stat_file = nullptr;
1422
1423 if (accessDistance) {
1424
1425 // print per page statistics to a separate file (.csv format)
1426 // simout is the gem5 output directory (default is m5out or the one
1427         // specified with -d)
1428 page_stat_file = simout.create(name().c_str())->stream();
1429
1430 // print header
1431 *page_stat_file << "page,max_access_distance,mean_access_distance, "
1432 << "stddev_distance" << std::endl;
1433 }
1434
1435 // update avg. reuse distance footprint
1436 AccessPatternTable::iterator iter, iter_begin, iter_end;
1437 unsigned int sum_avg_reuse_distance_per_page = 0;
1438
1439 // iterate through all pages seen by this TLB
1440 for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
1441 sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
1442 iter->second.accessesPerPage;
1443
1444 if (accessDistance) {
1445 unsigned int tmp = iter->second.localTLBAccesses[0];
1446 unsigned int prev = tmp;
1447
1448 for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
1449 if (i) {
1450 tmp = prev + 1;
1451 }
1452
1453 prev = iter->second.localTLBAccesses[i];
1454 // update the localTLBAccesses value
1455                 // with the actual difference
1456 iter->second.localTLBAccesses[i] -= tmp;
1457 // compute the sum of AccessDistance per page
1458 // used later for mean
1459 iter->second.sumDistance +=
1460 iter->second.localTLBAccesses[i];
1461 }
1462
1463 iter->second.meanDistance =
1464 iter->second.sumDistance / iter->second.accessesPerPage;
1465
1466 // compute std_dev and max (we need a second round because we
1467             // need to know the mean value)
1468 unsigned int max_distance = 0;
1469 unsigned int stddev_distance = 0;
1470
1471 for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
1472 unsigned int tmp_access_distance =
1473 iter->second.localTLBAccesses[i];
1474
1475 if (tmp_access_distance > max_distance) {
1476 max_distance = tmp_access_distance;
1477 }
1478
1479 unsigned int diff =
1480 tmp_access_distance - iter->second.meanDistance;
1481 stddev_distance += pow(diff, 2);
1482
1483 }
1484
1485 stddev_distance =
1486 sqrt(stddev_distance/iter->second.accessesPerPage);
1487
1488 if (page_stat_file) {
1489 *page_stat_file << std::hex << iter->first << ",";
1490 *page_stat_file << std::dec << max_distance << ",";
1491 *page_stat_file << std::dec << iter->second.meanDistance
1492 << ",";
1493 *page_stat_file << std::dec << stddev_distance;
1494 *page_stat_file << std::endl;
1495 }
1496
1497 // erase the localTLBAccesses array
1498 iter->second.localTLBAccesses.clear();
1499 }
1500 }
1501
1502 if (!TLBFootprint.empty()) {
1503 avgReuseDistance =
1504 sum_avg_reuse_distance_per_page / TLBFootprint.size();
1505 }
1506
1507 //clear the TLBFootprint map
1508 TLBFootprint.clear();
1509 }
1510 } // namespace X86ISA
1511
1512 X86ISA::GpuTLB*
1513 X86GPUTLBParams::create()
1514 {
1515 return new X86ISA::GpuTLB(this);
1516 }
1517