2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
36 #include "gpu-compute/gpu_tlb.hh"
41 #include "arch/x86/faults.hh"
42 #include "arch/x86/insts/microldstop.hh"
43 #include "arch/x86/pagetable.hh"
44 #include "arch/x86/pagetable_walker.hh"
45 #include "arch/x86/regs/misc.hh"
46 #include "arch/x86/regs/msr.hh"
47 #include "arch/x86/x86_traits.hh"
48 #include "base/bitfield.hh"
49 #include "base/logging.hh"
50 #include "base/output.hh"
51 #include "base/trace.hh"
52 #include "cpu/base.hh"
53 #include "cpu/thread_context.hh"
54 #include "debug/GPUPrefetch.hh"
55 #include "debug/GPUTLB.hh"
56 #include "mem/packet_access.hh"
57 #include "mem/page_table.hh"
58 #include "mem/request.hh"
59 #include "sim/process.hh"
60 #include "sim/pseudo_inst.hh"
65 GpuTLB::GpuTLB(const Params
*p
)
66 : ClockedObject(p
), configAddress(0), size(p
->size
),
67 cleanupEvent([this]{ cleanup(); }, name(), false,
69 exitEvent([this]{ exitCallback(); }, name())
72 assert(assoc
<= size
);
74 allocationPolicy
= p
->allocationPolicy
;
75 hasMemSidePort
= false;
76 accessDistance
= p
->accessDistance
;
78 tlb
.assign(size
, TlbEntry());
80 freeList
.resize(numSets
);
81 entryList
.resize(numSets
);
83 for (int set
= 0; set
< numSets
; ++set
) {
84 for (int way
= 0; way
< assoc
; ++way
) {
85 int x
= set
* assoc
+ way
;
86 freeList
[set
].push_back(&tlb
.at(x
));
93 * @warning: the set-associative version assumes you have a
94 * fixed page size of 4KB.
95 * If the page size is greater than 4KB (as defined in the
96 * TheISA::PageBytes), then there are various issues w/ the current
97 * implementation (you'd have the same 8KB page being replicated in
100 setMask
= numSets
- 1;
102 maxCoalescedReqs
= p
->maxOutstandingReqs
;
104 // Do not allow maxCoalescedReqs to be more than the TLB associativity
105 if (maxCoalescedReqs
> assoc
) {
106 maxCoalescedReqs
= assoc
;
107 cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc
);
111 hitLatency
= p
->hitLatency
;
112 missLatency1
= p
->missLatency1
;
113 missLatency2
= p
->missLatency2
;
115 // create the slave ports based on the number of connected ports
116 for (size_t i
= 0; i
< p
->port_slave_connection_count
; ++i
) {
117 cpuSidePort
.push_back(new CpuSidePort(csprintf("%s-port%d",
118 name(), i
), this, i
));
121 // create the master ports based on the number of connected ports
122 for (size_t i
= 0; i
< p
->port_master_connection_count
; ++i
) {
123 memSidePort
.push_back(new MemSidePort(csprintf("%s-port%d",
124 name(), i
), this, i
));
128 // fixme: this is never called?
131 // make sure all the hash-maps are empty
132 assert(translationReturnEvent
.empty());
136 GpuTLB::getPort(const std::string
&if_name
, PortID idx
)
138 if (if_name
== "slave") {
139 if (idx
>= static_cast<PortID
>(cpuSidePort
.size())) {
140 panic("TLBCoalescer::getPort: unknown index %d\n", idx
);
143 return *cpuSidePort
[idx
];
144 } else if (if_name
== "master") {
145 if (idx
>= static_cast<PortID
>(memSidePort
.size())) {
146 panic("TLBCoalescer::getPort: unknown index %d\n", idx
);
149 hasMemSidePort
= true;
151 return *memSidePort
[idx
];
153 panic("TLBCoalescer::getPort: unknown port %s\n", if_name
);
158 GpuTLB::insert(Addr vpn
, TlbEntry
&entry
)
160 TlbEntry
*newEntry
= nullptr;
163 * vpn holds the virtual page address
164 * The least significant bits are simply masked
166 int set
= (vpn
>> TheISA::PageShift
) & setMask
;
168 if (!freeList
[set
].empty()) {
169 newEntry
= freeList
[set
].front();
170 freeList
[set
].pop_front();
172 newEntry
= entryList
[set
].back();
173 entryList
[set
].pop_back();
177 newEntry
->vaddr
= vpn
;
178 entryList
[set
].push_front(newEntry
);
183 GpuTLB::EntryList::iterator
184 GpuTLB::lookupIt(Addr va
, bool update_lru
)
186 int set
= (va
>> TheISA::PageShift
) & setMask
;
192 auto entry
= entryList
[set
].begin();
193 for (; entry
!= entryList
[set
].end(); ++entry
) {
194 int page_size
= (*entry
)->size();
196 if ((*entry
)->vaddr
<= va
&& (*entry
)->vaddr
+ page_size
> va
) {
197 DPRINTF(GPUTLB
, "Matched vaddr %#x to entry starting at %#x "
198 "with size %#x.\n", va
, (*entry
)->vaddr
, page_size
);
201 entryList
[set
].push_front(*entry
);
202 entryList
[set
].erase(entry
);
203 entry
= entryList
[set
].begin();
214 GpuTLB::lookup(Addr va
, bool update_lru
)
216 int set
= (va
>> TheISA::PageShift
) & setMask
;
218 auto entry
= lookupIt(va
, update_lru
);
220 if (entry
== entryList
[set
].end())
227 GpuTLB::invalidateAll()
229 DPRINTF(GPUTLB
, "Invalidating all entries.\n");
231 for (int i
= 0; i
< numSets
; ++i
) {
232 while (!entryList
[i
].empty()) {
233 TlbEntry
*entry
= entryList
[i
].front();
234 entryList
[i
].pop_front();
235 freeList
[i
].push_back(entry
);
241 GpuTLB::setConfigAddress(uint32_t addr
)
243 configAddress
= addr
;
247 GpuTLB::invalidateNonGlobal()
249 DPRINTF(GPUTLB
, "Invalidating all non global entries.\n");
251 for (int i
= 0; i
< numSets
; ++i
) {
252 for (auto entryIt
= entryList
[i
].begin();
253 entryIt
!= entryList
[i
].end();) {
254 if (!(*entryIt
)->global
) {
255 freeList
[i
].push_back(*entryIt
);
256 entryList
[i
].erase(entryIt
++);
265 GpuTLB::demapPage(Addr va
, uint64_t asn
)
268 int set
= (va
>> TheISA::PageShift
) & setMask
;
269 auto entry
= lookupIt(va
, false);
271 if (entry
!= entryList
[set
].end()) {
272 freeList
[set
].push_back(*entry
);
273 entryList
[set
].erase(entry
);
283 localMiscRegAccess(bool read
, MiscRegIndex regNum
,
284 ThreadContext
*tc
, PacketPtr pkt
)
287 RegVal data
= htole(tc
->readMiscReg(regNum
));
288 // Make sure we don't trot off the end of data.
289 pkt
->setData((uint8_t *)&data
);
291 RegVal data
= htole(tc
->readMiscRegNoEffect(regNum
));
292 tc
->setMiscReg(regNum
, letoh(data
));
297 } // anonymous namespace
300 GpuTLB::translateInt(bool read
, const RequestPtr
&req
, ThreadContext
*tc
)
302 DPRINTF(GPUTLB
, "Addresses references internal memory.\n");
303 Addr vaddr
= req
->getVaddr();
304 Addr prefix
= (vaddr
>> 3) & IntAddrPrefixMask
;
306 if (prefix
== IntAddrPrefixCPUID
) {
307 panic("CPUID memory space not yet implemented!\n");
308 } else if (prefix
== IntAddrPrefixMSR
) {
309 vaddr
= (vaddr
>> 3) & ~IntAddrPrefixMask
;
312 if (!msrAddrToIndex(regNum
, vaddr
))
313 return std::make_shared
<GeneralProtection
>(0);
315 req
->setLocalAccessor(
316 [read
,regNum
](ThreadContext
*tc
, PacketPtr pkt
)
318 return localMiscRegAccess(read
, regNum
, tc
, pkt
);
323 } else if (prefix
== IntAddrPrefixIO
) {
324 // TODO If CPL > IOPL or in virtual mode, check the I/O permission
325 // bitmap in the TSS.
327 Addr IOPort
= vaddr
& ~IntAddrPrefixMask
;
328 // Make sure the address fits in the expected 16 bit IO address
330 assert(!(IOPort
& ~0xFFFF));
331 if (IOPort
== 0xCF8 && req
->getSize() == 4) {
332 req
->setLocalAccessor(
333 [read
](ThreadContext
*tc
, PacketPtr pkt
)
335 return localMiscRegAccess(
336 read
, MISCREG_PCI_CONFIG_ADDRESS
, tc
, pkt
);
339 } else if ((IOPort
& ~mask(2)) == 0xCFC) {
340 req
->setFlags(Request::UNCACHEABLE
| Request::STRICT_ORDER
);
342 tc
->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS
);
343 if (bits(configAddress
, 31, 31)) {
344 req
->setPaddr(PhysAddrPrefixPciConfig
|
345 mbits(configAddress
, 30, 2) |
348 req
->setPaddr(PhysAddrPrefixIO
| IOPort
);
351 req
->setFlags(Request::UNCACHEABLE
| Request::STRICT_ORDER
);
352 req
->setPaddr(PhysAddrPrefixIO
| IOPort
);
356 panic("Access to unrecognized internal address space %#x.\n",
362 * tlbLookup will only perform a TLB lookup, returning true on a TLB hit
363 * and false on a TLB miss.
364 * Many of the checks about different modes have been converted to
365 * assertions, since these parts of the code are not really used.
366 * On a hit it will update the LRU stack.
369 GpuTLB::tlbLookup(const RequestPtr
&req
,
370 ThreadContext
*tc
, bool update_stats
)
372 bool tlb_hit
= false;
374 uint32_t flags
= req
->getFlags();
375 int seg
= flags
& SegmentFlagMask
;
378 assert(seg
!= SEGMENT_REG_MS
);
379 Addr vaddr
= req
->getVaddr();
380 DPRINTF(GPUTLB
, "TLB Lookup for vaddr %#x.\n", vaddr
);
381 HandyM5Reg m5Reg
= tc
->readMiscRegNoEffect(MISCREG_M5_REG
);
384 DPRINTF(GPUTLB
, "In protected mode.\n");
385 // make sure we are in 64-bit mode
386 assert(m5Reg
.mode
== LongMode
);
388 // If paging is enabled, do the translation.
390 DPRINTF(GPUTLB
, "Paging enabled.\n");
391 //update LRU stack on a hit
392 TlbEntry
*entry
= lookup(vaddr
, true);
398 // functional tlb access for memory initialization
399 // i.e., memory seeding or instr. seeding -> don't update
404 localNumTLBAccesses
++;
418 GpuTLB::translate(const RequestPtr
&req
, ThreadContext
*tc
,
419 Translation
*translation
, Mode mode
,
420 bool &delayedResponse
, bool timing
, int &latency
)
422 uint32_t flags
= req
->getFlags();
423 int seg
= flags
& SegmentFlagMask
;
424 bool storeCheck
= flags
& (StoreCheck
<< FlagShift
);
426 // If this is true, we're dealing with a request
427 // to a non-memory address space.
428 if (seg
== SEGMENT_REG_MS
) {
429 return translateInt(mode
== Mode::Read
, req
, tc
);
432 delayedResponse
= false;
433 Addr vaddr
= req
->getVaddr();
434 DPRINTF(GPUTLB
, "Translating vaddr %#x.\n", vaddr
);
436 HandyM5Reg m5Reg
= tc
->readMiscRegNoEffect(MISCREG_M5_REG
);
438 // If protected mode has been enabled...
440 DPRINTF(GPUTLB
, "In protected mode.\n");
441 // If we're not in 64-bit mode, do protection/limit checks
442 if (m5Reg
.mode
!= LongMode
) {
443 DPRINTF(GPUTLB
, "Not in long mode. Checking segment "
446 // Check for a null segment selector.
447 if (!(seg
== SEGMENT_REG_TSG
|| seg
== SYS_SEGMENT_REG_IDTR
||
448 seg
== SEGMENT_REG_HS
|| seg
== SEGMENT_REG_LS
)
449 && !tc
->readMiscRegNoEffect(MISCREG_SEG_SEL(seg
))) {
450 return std::make_shared
<GeneralProtection
>(0);
453 bool expandDown
= false;
454 SegAttr attr
= tc
->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg
));
456 if (seg
>= SEGMENT_REG_ES
&& seg
<= SEGMENT_REG_HS
) {
457 if (!attr
.writable
&& (mode
== BaseTLB::Write
||
459 return std::make_shared
<GeneralProtection
>(0);
461 if (!attr
.readable
&& mode
== BaseTLB::Read
)
462 return std::make_shared
<GeneralProtection
>(0);
464 expandDown
= attr
.expandDown
;
468 Addr base
= tc
->readMiscRegNoEffect(MISCREG_SEG_BASE(seg
));
469 Addr limit
= tc
->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg
));
470 // This assumes we're not in 64 bit mode. If we were, the
471 // default address size is 64 bits, overridable to 32.
473 bool sizeOverride
= (flags
& (AddrSizeFlagBit
<< FlagShift
));
474 SegAttr csAttr
= tc
->readMiscRegNoEffect(MISCREG_CS_ATTR
);
476 if ((csAttr
.defaultSize
&& sizeOverride
) ||
477 (!csAttr
.defaultSize
&& !sizeOverride
)) {
481 Addr offset
= bits(vaddr
- base
, size
- 1, 0);
482 Addr endOffset
= offset
+ req
->getSize() - 1;
485 DPRINTF(GPUTLB
, "Checking an expand down segment.\n");
486 warn_once("Expand down segments are untested.\n");
488 if (offset
<= limit
|| endOffset
<= limit
)
489 return std::make_shared
<GeneralProtection
>(0);
491 if (offset
> limit
|| endOffset
> limit
)
492 return std::make_shared
<GeneralProtection
>(0);
496 // If paging is enabled, do the translation.
498 DPRINTF(GPUTLB
, "Paging enabled.\n");
499 // The vaddr already has the segment base applied.
500 TlbEntry
*entry
= lookup(vaddr
);
501 localNumTLBAccesses
++;
506 latency
= missLatency1
;
510 fatal("GpuTLB doesn't support full-system mode\n");
512 DPRINTF(GPUTLB
, "Handling a TLB miss for address %#x "
513 "at pc %#x.\n", vaddr
, tc
->instAddr());
515 Process
*p
= tc
->getProcessPtr();
516 const EmulationPageTable::Entry
*pte
=
517 p
->pTable
->lookup(vaddr
);
519 if (!pte
&& mode
!= BaseTLB::Execute
) {
520 // penalize a "page fault" more
522 latency
+= missLatency2
;
524 if (p
->fixupFault(vaddr
))
525 pte
= p
->pTable
->lookup(vaddr
);
529 return std::make_shared
<PageFault
>(vaddr
, true,
533 Addr alignedVaddr
= p
->pTable
->pageAlign(vaddr
);
535 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n",
536 alignedVaddr
, pte
->paddr
);
538 TlbEntry
gpuEntry(p
->pid(), alignedVaddr
,
539 pte
->paddr
, false, false);
540 entry
= insert(alignedVaddr
, gpuEntry
);
543 DPRINTF(GPUTLB
, "Miss was serviced.\n");
549 latency
= hitLatency
;
553 // Do paging protection checks.
554 bool inUser
= (m5Reg
.cpl
== 3 &&
555 !(flags
& (CPL0FlagBit
<< FlagShift
)));
557 CR0 cr0
= tc
->readMiscRegNoEffect(MISCREG_CR0
);
558 bool badWrite
= (!entry
->writable
&& (inUser
|| cr0
.wp
));
560 if ((inUser
&& !entry
->user
) || (mode
== BaseTLB::Write
&&
562 // The page must have been present to get into the TLB in
563 // the first place. We'll assume the reserved bits are
564 // fine even though we're not checking them.
565 return std::make_shared
<PageFault
>(vaddr
, true, mode
,
569 if (storeCheck
&& badWrite
) {
570 // This would fault if this were a write, so return a page
571 // fault that reflects that happening.
572 return std::make_shared
<PageFault
>(vaddr
, true,
578 DPRINTF(GPUTLB
, "Entry found with paddr %#x, doing protection "
579 "checks.\n", entry
->paddr
);
581 int page_size
= entry
->size();
582 Addr paddr
= entry
->paddr
| (vaddr
& (page_size
- 1));
583 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, paddr
);
584 req
->setPaddr(paddr
);
586 if (entry
->uncacheable
)
587 req
->setFlags(Request::UNCACHEABLE
);
589 //Use the address which already has segmentation applied.
590 DPRINTF(GPUTLB
, "Paging disabled.\n");
591 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, vaddr
);
592 req
->setPaddr(vaddr
);
596 DPRINTF(GPUTLB
, "In real mode.\n");
597 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, vaddr
);
598 req
->setPaddr(vaddr
);
601 // Check for an access to the local APIC
603 LocalApicBase localApicBase
=
604 tc
->readMiscRegNoEffect(MISCREG_APIC_BASE
);
606 Addr baseAddr
= localApicBase
.base
* PageBytes
;
607 Addr paddr
= req
->getPaddr();
609 if (baseAddr
<= paddr
&& baseAddr
+ PageBytes
> paddr
) {
610 // Force the access to be uncacheable.
611 req
->setFlags(Request::UNCACHEABLE
);
612 req
->setPaddr(x86LocalAPICAddress(tc
->contextId(),
621 GpuTLB::translateAtomic(const RequestPtr
&req
, ThreadContext
*tc
,
622 Mode mode
, int &latency
)
624 bool delayedResponse
;
626 return GpuTLB::translate(req
, tc
, nullptr, mode
, delayedResponse
,
631 GpuTLB::translateTiming(const RequestPtr
&req
, ThreadContext
*tc
,
632 Translation
*translation
, Mode mode
, int &latency
)
634 bool delayedResponse
;
637 Fault fault
= GpuTLB::translate(req
, tc
, translation
, mode
,
638 delayedResponse
, true, latency
);
640 if (!delayedResponse
)
641 translation
->finish(fault
, req
, tc
, mode
);
652 GpuTLB::serialize(CheckpointOut
&cp
) const
657 GpuTLB::unserialize(CheckpointIn
&cp
)
664 ClockedObject::regStats();
667 .name(name() + ".local_TLB_accesses")
668 .desc("Number of TLB accesses")
672 .name(name() + ".local_TLB_hits")
673 .desc("Number of TLB hits")
677 .name(name() + ".local_TLB_misses")
678 .desc("Number of TLB misses")
682 .name(name() + ".local_TLB_miss_rate")
683 .desc("TLB miss rate")
687 .name(name() + ".access_cycles")
688 .desc("Cycles spent accessing this TLB level")
692 .name(name() + ".page_table_cycles")
693 .desc("Cycles spent accessing the page table")
696 localTLBMissRate
= 100 * localNumTLBMisses
/ localNumTLBAccesses
;
699 .name(name() + ".unique_pages")
700 .desc("Number of unique pages touched")
704 .name(name() + ".local_cycles")
705 .desc("Number of cycles spent in queue for all incoming reqs")
709 .name(name() + ".local_latency")
710 .desc("Avg. latency over incoming coalesced reqs")
713 localLatency
= localCycles
/ localNumTLBAccesses
;
716 .name(name() + ".global_TLB_accesses")
717 .desc("Number of TLB accesses")
721 .name(name() + ".global_TLB_hits")
722 .desc("Number of TLB hits")
726 .name(name() + ".global_TLB_misses")
727 .desc("Number of TLB misses")
731 .name(name() + ".global_TLB_miss_rate")
732 .desc("TLB miss rate")
735 globalTLBMissRate
= 100 * globalNumTLBMisses
/ globalNumTLBAccesses
;
738 .name(name() + ".avg_reuse_distance")
739 .desc("avg. reuse distance over all pages (in ticks)")
745 * Do the TLB lookup for this coalesced request and schedule
746 * another event <TLB access latency> cycles later.
750 GpuTLB::issueTLBLookup(PacketPtr pkt
)
753 assert(pkt
->senderState
);
755 Addr virt_page_addr
= roundDown(pkt
->req
->getVaddr(),
758 TranslationState
*sender_state
=
759 safe_cast
<TranslationState
*>(pkt
->senderState
);
761 bool update_stats
= !sender_state
->prefetch
;
762 ThreadContext
* tmp_tc
= sender_state
->tc
;
764 DPRINTF(GPUTLB
, "Translation req. for virt. page addr %#x\n",
767 int req_cnt
= sender_state
->reqCnt
.back();
770 accessCycles
-= (curTick() * req_cnt
);
771 localCycles
-= curTick();
772 updatePageFootprint(virt_page_addr
);
773 globalNumTLBAccesses
+= req_cnt
;
776 tlbOutcome lookup_outcome
= TLB_MISS
;
777 const RequestPtr
&tmp_req
= pkt
->req
;
779 // Access the TLB and figure out if it's a hit or a miss.
780 bool success
= tlbLookup(tmp_req
, tmp_tc
, update_stats
);
783 lookup_outcome
= TLB_HIT
;
784 // Put the entry in SenderState
785 TlbEntry
*entry
= lookup(tmp_req
->getVaddr(), false);
788 auto p
= sender_state
->tc
->getProcessPtr();
789 sender_state
->tlbEntry
=
790 new TlbEntry(p
->pid(), entry
->vaddr
, entry
->paddr
,
794 // the reqCnt has an entry per level, so its size tells us
795 // which level we are in
796 sender_state
->hitLevel
= sender_state
->reqCnt
.size();
797 globalNumTLBHits
+= req_cnt
;
801 globalNumTLBMisses
+= req_cnt
;
805 * We now know the TLB lookup outcome (if it's a hit or a miss), as
806 * well as the TLB access latency.
808 * We create and schedule a new TLBEvent which will help us take the
809 * appropriate actions (e.g., update TLB on a hit, send request to
810 * lower level TLB on a miss, or start a page walk if this was the
813 TLBEvent
*tlb_event
=
814 new TLBEvent(this, virt_page_addr
, lookup_outcome
, pkt
);
816 if (translationReturnEvent
.count(virt_page_addr
)) {
817 panic("Virtual Page Address %#x already has a return event\n",
821 translationReturnEvent
[virt_page_addr
] = tlb_event
;
824 DPRINTF(GPUTLB
, "schedule translationReturnEvent @ curTick %d\n",
825 curTick() + cyclesToTicks(Cycles(hitLatency
)));
827 schedule(tlb_event
, curTick() + cyclesToTicks(Cycles(hitLatency
)));
830 GpuTLB::TLBEvent::TLBEvent(GpuTLB
* _tlb
, Addr _addr
,
831 tlbOutcome tlb_outcome
, PacketPtr _pkt
)
832 : Event(CPU_Tick_Pri
), tlb(_tlb
), virtPageAddr(_addr
),
833 outcome(tlb_outcome
), pkt(_pkt
)
838 * Do Paging protection checks. If we encounter a page fault, then
839 * a panic is triggered.
842 GpuTLB::pagingProtectionChecks(ThreadContext
*tc
, PacketPtr pkt
,
843 TlbEntry
* tlb_entry
, Mode mode
)
845 HandyM5Reg m5Reg
= tc
->readMiscRegNoEffect(MISCREG_M5_REG
);
846 uint32_t flags
= pkt
->req
->getFlags();
847 bool storeCheck
= flags
& (StoreCheck
<< FlagShift
);
849 // Do paging protection checks.
851 = (m5Reg
.cpl
== 3 && !(flags
& (CPL0FlagBit
<< FlagShift
)));
852 CR0 cr0
= tc
->readMiscRegNoEffect(MISCREG_CR0
);
854 bool badWrite
= (!tlb_entry
->writable
&& (inUser
|| cr0
.wp
));
856 if ((inUser
&& !tlb_entry
->user
) ||
857 (mode
== BaseTLB::Write
&& badWrite
)) {
858 // The page must have been present to get into the TLB in
859 // the first place. We'll assume the reserved bits are
860 // fine even though we're not checking them.
861 panic("Page fault detected");
864 if (storeCheck
&& badWrite
) {
865 // This would fault if this were a write, so return a page
866 // fault that reflects that happening.
867 panic("Page fault detected");
872 * handleTranslationReturn is called on a TLB hit,
873 * when a TLB miss returns or when a page fault returns.
874 * The latter calls handleTranslationReturn with TLB_MISS as tlbOutcome.
877 GpuTLB::handleTranslationReturn(Addr virt_page_addr
,
878 tlbOutcome tlb_outcome
, PacketPtr pkt
)
881 Addr vaddr
= pkt
->req
->getVaddr();
883 TranslationState
*sender_state
=
884 safe_cast
<TranslationState
*>(pkt
->senderState
);
886 ThreadContext
*tc
= sender_state
->tc
;
887 Mode mode
= sender_state
->tlbMode
;
889 TlbEntry
*local_entry
, *new_entry
;
891 if (tlb_outcome
== TLB_HIT
) {
892 DPRINTF(GPUTLB
, "Translation Done - TLB Hit for addr %#x\n",
894 local_entry
= sender_state
->tlbEntry
;
896 DPRINTF(GPUTLB
, "Translation Done - TLB Miss for addr %#x\n",
900 * We are returning either from a page walk or from a hit at a
901 * lower TLB level. The senderState should be "carrying" a pointer
902 * to the correct TLBEntry.
904 new_entry
= sender_state
->tlbEntry
;
906 local_entry
= new_entry
;
908 if (allocationPolicy
) {
909 DPRINTF(GPUTLB
, "allocating entry w/ addr %#x\n",
912 local_entry
= insert(virt_page_addr
, *new_entry
);
919 * At this point the packet carries an up-to-date tlbEntry pointer
920 * in its senderState.
921 * Next step is to do the paging protection checks.
923 DPRINTF(GPUTLB
, "Entry found with vaddr %#x, doing protection checks "
924 "while paddr was %#x.\n", local_entry
->vaddr
,
927 pagingProtectionChecks(tc
, pkt
, local_entry
, mode
);
928 int page_size
= local_entry
->size();
929 Addr paddr
= local_entry
->paddr
| (vaddr
& (page_size
- 1));
930 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, paddr
);
932 // Since this packet will be sent through the cpu side slave port,
933 // it must be converted to a response pkt if it is not one already
934 if (pkt
->isRequest()) {
935 pkt
->makeTimingResponse();
938 pkt
->req
->setPaddr(paddr
);
940 if (local_entry
->uncacheable
) {
941 pkt
->req
->setFlags(Request::UNCACHEABLE
);
944 //send packet back to coalescer
945 cpuSidePort
[0]->sendTimingResp(pkt
);
946 //schedule cleanup event
947 cleanupQueue
.push(virt_page_addr
);
949 // schedule this only once per cycle.
950 // The check is required because we might have multiple translations
951 // returning the same cycle
952 // this is a maximum priority event and must be on the same cycle
953 // as the cleanup event in TLBCoalescer to avoid a race with
954 // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
955 if (!cleanupEvent
.scheduled())
956 schedule(cleanupEvent
, curTick());
960 * Here we take the appropriate actions based on the result of the
964 GpuTLB::translationReturn(Addr virtPageAddr
, tlbOutcome outcome
,
967 DPRINTF(GPUTLB
, "Triggered TLBEvent for addr %#x\n", virtPageAddr
);
969 assert(translationReturnEvent
[virtPageAddr
]);
972 TranslationState
*tmp_sender_state
=
973 safe_cast
<TranslationState
*>(pkt
->senderState
);
975 int req_cnt
= tmp_sender_state
->reqCnt
.back();
976 bool update_stats
= !tmp_sender_state
->prefetch
;
979 if (outcome
== TLB_HIT
) {
980 handleTranslationReturn(virtPageAddr
, TLB_HIT
, pkt
);
983 accessCycles
+= (req_cnt
* curTick());
984 localCycles
+= curTick();
987 } else if (outcome
== TLB_MISS
) {
989 DPRINTF(GPUTLB
, "This is a TLB miss\n");
991 accessCycles
+= (req_cnt
*curTick());
992 localCycles
+= curTick();
995 if (hasMemSidePort
) {
996 // the one cycle added here represents the delay from when we get
997 // the reply back till when we propagate it to the coalescer
1000 accessCycles
+= (req_cnt
* 1);
1005 * There is a TLB below. Send the coalesced request.
1006 * We actually send the very first packet of all the
1007 * pending packets for this virtual page address.
1009 if (!memSidePort
[0]->sendTimingReq(pkt
)) {
1010 DPRINTF(GPUTLB
, "Failed sending translation request to "
1011 "lower level TLB for addr %#x\n", virtPageAddr
);
1013 memSidePort
[0]->retries
.push_back(pkt
);
1015 DPRINTF(GPUTLB
, "Sent translation request to lower level "
1016 "TLB for addr %#x\n", virtPageAddr
);
1019 //this is the last level TLB. Start a page walk
1020 DPRINTF(GPUTLB
, "Last level TLB - start a page walk for "
1021 "addr %#x\n", virtPageAddr
);
1024 pageTableCycles
-= (req_cnt
*curTick());
1026 TLBEvent
*tlb_event
= translationReturnEvent
[virtPageAddr
];
1028 tlb_event
->updateOutcome(PAGE_WALK
);
1030 curTick() + cyclesToTicks(Cycles(missLatency2
)));
1032 } else if (outcome
== PAGE_WALK
) {
1034 pageTableCycles
+= (req_cnt
*curTick());
1036 // Need to access the page table and update the TLB
1037 DPRINTF(GPUTLB
, "Doing a page walk for address %#x\n",
1040 TranslationState
*sender_state
=
1041 safe_cast
<TranslationState
*>(pkt
->senderState
);
1043 Process
*p
= sender_state
->tc
->getProcessPtr();
1044 Addr vaddr
= pkt
->req
->getVaddr();
1046 Addr alignedVaddr
= p
->pTable
->pageAlign(vaddr
);
1047 assert(alignedVaddr
== virtPageAddr
);
1049 const EmulationPageTable::Entry
*pte
= p
->pTable
->lookup(vaddr
);
1050 if (!pte
&& sender_state
->tlbMode
!= BaseTLB::Execute
&&
1051 p
->fixupFault(vaddr
)) {
1052 pte
= p
->pTable
->lookup(vaddr
);
1056 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n", alignedVaddr
,
1059 sender_state
->tlbEntry
=
1060 new TlbEntry(p
->pid(), virtPageAddr
, pte
->paddr
, false,
1063 sender_state
->tlbEntry
= nullptr;
1066 handleTranslationReturn(virtPageAddr
, TLB_MISS
, pkt
);
1067 } else if (outcome
== MISS_RETURN
) {
1068 /** we add an extra cycle in the return path of the translation
1069 * requests in between the various TLB levels.
1071 handleTranslationReturn(virtPageAddr
, TLB_MISS
, pkt
);
1073 panic("Unexpected TLB outcome %d", outcome
);
1078 GpuTLB::TLBEvent::process()
1080 tlb
->translationReturn(virtPageAddr
, outcome
, pkt
);
1084 GpuTLB::TLBEvent::description() const
1086 return "trigger translationDoneEvent";
1090 GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome
)
1096 GpuTLB::TLBEvent::getTLBEventVaddr()
1098 return virtPageAddr
;
1102 * recvTimingReq receives a coalesced timing request from a TLBCoalescer
1103 * and it calls issueTLBLookup()
1104 * It only rejects the packet if we have exceeded the max
1105 * outstanding number of requests for the TLB
1108 GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt
)
1110 if (tlb
->outstandingReqs
< tlb
->maxCoalescedReqs
) {
1111 tlb
->issueTLBLookup(pkt
);
1112 // update number of outstanding translation requests
1113 tlb
->outstandingReqs
++;
1116 DPRINTF(GPUTLB
, "Reached maxCoalescedReqs number %d\n",
1117 tlb
->outstandingReqs
);
1123 * handleFuncTranslationReturn is called on a TLB hit,
1124 * when a TLB miss returns or when a page fault returns.
1125 * It updates LRU, inserts the TLB entry on a miss
1126 * depending on the allocation policy and does the required
1127 * protection checks. It does NOT create a new packet to
1128 * update the packet's addr; this is done in hsail-gpu code.
1131 GpuTLB::handleFuncTranslationReturn(PacketPtr pkt
, tlbOutcome tlb_outcome
)
1133 TranslationState
*sender_state
=
1134 safe_cast
<TranslationState
*>(pkt
->senderState
);
1136 ThreadContext
*tc
= sender_state
->tc
;
1137 Mode mode
= sender_state
->tlbMode
;
1138 Addr vaddr
= pkt
->req
->getVaddr();
1140 TlbEntry
*local_entry
, *new_entry
;
1142 if (tlb_outcome
== TLB_HIT
) {
1143 DPRINTF(GPUTLB
, "Functional Translation Done - TLB hit for addr "
1146 local_entry
= sender_state
->tlbEntry
;
1148 DPRINTF(GPUTLB
, "Functional Translation Done - TLB miss for addr "
1152 * We are returning either from a page walk or from a hit at a
1153 * lower TLB level. The senderState should be "carrying" a pointer
1154 * to the correct TLBEntry.
1156 new_entry
= sender_state
->tlbEntry
;
1158 local_entry
= new_entry
;
1160 if (allocationPolicy
) {
1161 Addr virt_page_addr
= roundDown(vaddr
, TheISA::PageBytes
);
1163 DPRINTF(GPUTLB
, "allocating entry w/ addr %#x\n",
1166 local_entry
= insert(virt_page_addr
, *new_entry
);
1169 assert(local_entry
);
1172 DPRINTF(GPUTLB
, "Entry found with vaddr %#x, doing protection checks "
1173 "while paddr was %#x.\n", local_entry
->vaddr
,
1174 local_entry
->paddr
);
1177 * Do paging checks if it's a normal functional access. If it's for a
1178 * prefetch, then sometimes you can try to prefetch something that
1179 * won't pass protection. We don't actually want to fault because there
1180 * is no demand access to deem this a violation. Just put it in the
1181 * TLB and it will fault if indeed a future demand access touches it in
1184 * This feature could be used to explore security issues around
1185 * speculative memory accesses.
1187 if (!sender_state
->prefetch
&& sender_state
->tlbEntry
)
1188 pagingProtectionChecks(tc
, pkt
, local_entry
, mode
);
1190 int page_size
= local_entry
->size();
1191 Addr paddr
= local_entry
->paddr
| (vaddr
& (page_size
- 1));
1192 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, paddr
);
1194 pkt
->req
->setPaddr(paddr
);
1196 if (local_entry
->uncacheable
)
1197 pkt
->req
->setFlags(Request::UNCACHEABLE
);
1200 // This is used for atomic translations. Need to
1201 // make it all happen during the same cycle.
//
// Functional translation handler of the CPU-side port.
// It reads the TranslationState attached to pkt->senderState, looks the
// request up in the TLB and then, on the paths visible below, either forwards
// the packet through memSidePort[0], consults the process page table, or
// copies the entry found in this TLB. A TlbEntry is allocated into
// sender_state->tlbEntry (nullptr is stored on the "Prefetch failed" path)
// and the packet is finally passed to tlb->handleFuncTranslationReturn().
// NOTE(review): this listing drops several original lines (braces,
// conditions, else arms, early returns, trailing call arguments), so the
// exact branch structure cannot be confirmed from the text below.
1203 GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt
)
// Recover the translation state carried by the packet.
1205 TranslationState
*sender_state
=
1206 safe_cast
<TranslationState
*>(pkt
->senderState
);
1208 ThreadContext
*tc
= sender_state
->tc
;
// update_stats is false for prefetch requests.
1209 bool update_stats
= !sender_state
->prefetch
;
// Round the virtual address down to a page boundary.
// NOTE(review): the second roundDown argument is on a line missing here.
1211 Addr virt_page_addr
= roundDown(pkt
->req
->getVaddr(),
// Record the access to this page in the footprint table.
// NOTE(review): presumably guarded by a condition on a missing line - confirm.
1215 tlb
->updatePageFootprint(virt_page_addr
);
1217 // do the TLB lookup without updating the stats
// NOTE(review): update_stats is forwarded to tlbLookup, so whether stats are
// touched depends on that flag - the comment above looks stale; confirm.
1218 bool success
= tlb
->tlbLookup(pkt
->req
, tc
, update_stats
);
1219 tlbOutcome tlb_outcome
= success
? TLB_HIT
: TLB_MISS
;
1221 // functional mode means no coalescing
1222 // global metrics are the same as the local metrics
// NOTE(review): the conditions selecting which of the three counters below
// is incremented are not present in this listing.
1224 tlb
->globalNumTLBAccesses
++;
// hitLevel is set to the current number of elements in reqCnt
// (presumably the level of the hierarchy that hit - confirm).
1227 sender_state
->hitLevel
= sender_state
->reqCnt
.size();
1228 tlb
->globalNumTLBHits
++;
1234 tlb
->globalNumTLBMisses
++;
1235 if (tlb
->hasMemSidePort
) {
1236 // there is a TLB below -> propagate down the TLB hierarchy
1237 tlb
->memSidePort
[0]->sendFunctional(pkt
);
1238 // If no valid translation from a prefetch, then just return
1239 if (sender_state
->prefetch
&& !pkt
->req
->hasPaddr())
1242 // Need to access the page table and update the TLB
1243 DPRINTF(GPUTLB
, "Doing a page walk for address %#x\n",
// Translate through the page table of the process owning this thread context.
1246 Process
*p
= tc
->getProcessPtr();
1248 Addr vaddr
= pkt
->req
->getVaddr();
// Sanity check: the page table's own alignment must agree with
// virt_page_addr computed above.
1250 Addr alignedVaddr
= p
->pTable
->pageAlign(vaddr
);
1251 assert(alignedVaddr
== virt_page_addr
);
1254 const EmulationPageTable::Entry
*pte
=
1255 p
->pTable
->lookup(vaddr
);
// No entry and not an Execute access: let the process try to fix up the
// fault, and if that succeeds look the address up a second time.
1256 if (!pte
&& sender_state
->tlbMode
!= BaseTLB::Execute
&&
1257 p
->fixupFault(vaddr
)) {
1258 pte
= p
->pTable
->lookup(vaddr
);
// Non-prefetch requests: build the TlbEntry from the page table entry.
1261 if (!sender_state
->prefetch
) {
1262 // no PageFaults are permitted after
1263 // the second page table lookup
1266 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n", alignedVaddr
,
1269 sender_state
->tlbEntry
=
1270 new TlbEntry(p
->pid(), virt_page_addr
,
1271 pte
->paddr
, false, false);
1273 // If this was a prefetch, then do the normal thing if it
1274 // was a successful translation. Otherwise, send an empty
1275 // TLB entry back so that it can be figured out as empty
1276 // and handled accordingly.
1278 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n", alignedVaddr
,
1281 sender_state
->tlbEntry
=
1282 new TlbEntry(p
->pid(), virt_page_addr
,
1283 pte
->paddr
, false, false);
1285 DPRINTF(GPUPrefetch
, "Prefetch failed %#x\n",
// Failed prefetch: hand back an empty entry.
1288 sender_state
->tlbEntry
= nullptr;
// Hit path: the entry found in this TLB is copied (vaddr/paddr) into a
// freshly allocated TlbEntry for the requester.
1295 DPRINTF(GPUPrefetch
, "Functional Hit for vaddr %#x\n",
1296 tlb
->lookup(pkt
->req
->getVaddr()));
1298 TlbEntry
*entry
= tlb
->lookup(pkt
->req
->getVaddr(),
1303 auto p
= sender_state
->tc
->getProcessPtr();
1304 sender_state
->tlbEntry
=
1305 new TlbEntry(p
->pid(), entry
->vaddr
, entry
->paddr
,
1308 // This is the function that would populate pkt->req with the paddr of
1309 // the translation. But if no translation happens (i.e Prefetch fails)
1310 // then the early returns in the above code will keep this function
// (the remainder of the sentence above is on a line missing from this listing)
1312 tlb
->handleFuncTranslationReturn(pkt
, tlb_outcome
);
// Request-retry handler of the CPU-side port. A retry is not expected on
// this port, so the handler unconditionally calls panic().
1316 GpuTLB::CpuSidePort::recvReqRetry()
1318 // The CPUSidePort never sends anything but replies. No retries
1320 panic("recvReqRetry called");
// Address-range query of the CPU-side port. The visible body only declares a
// default-constructed AddrRangeList.
// NOTE(review): the return statement is missing from this listing;
// presumably the (empty) list is returned - confirm.
1324 GpuTLB::CpuSidePort::getAddrRanges() const
1326 // currently not checked by the master
1327 AddrRangeList ranges
;
1333 * MemSidePort receives the packet back.
1334 * We need to call the handleTranslationReturn
1335 * and propagate up the hierarchy.
// Timing response handler of the memory-side port: finds the TLBEvent stored
// for the response's virtual page in tlb->translationReturnEvent, marks it
// MISS_RETURN and schedules it one clock period after the current tick.
// NOTE(review): the return statement is missing from this listing.
1338 GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt
)
// Page-align the virtual address (second roundDown argument is on a line
// missing from this listing).
1340 Addr virt_page_addr
= roundDown(pkt
->req
->getVaddr(),
1343 DPRINTF(GPUTLB
, "MemSidePort recvTiming for virt_page_addr %#x\n",
// Event looked up by page address; the assert checks it was created for the
// same page.
1346 TLBEvent
*tlb_event
= tlb
->translationReturnEvent
[virt_page_addr
];
1348 assert(virt_page_addr
== tlb_event
->getTLBEventVaddr());
1350 tlb_event
->updateOutcome(MISS_RETURN
);
1351 tlb
->schedule(tlb_event
, curTick()+tlb
->clockPeriod());
// Request-retry handler of the memory-side port. Retries are not expected to
// reach the TLB, so the handler unconditionally calls panic().
1357 GpuTLB::MemSidePort::recvReqRetry()
1359 // No retries should reach the TLB. The retries
1360 // should only reach the TLBCoalescer.
1361 panic("recvReqRetry called");
// Drains cleanupQueue: for each queued address the TLBEvent stored in
// translationReturnEvent is deleted and its map entry erased. Afterwards a
// retry request is sent on every CPU-side port.
// NOTE(review): the enclosing function's signature is missing from this
// listing, as are some body lines - presumably including the one that removes
// the front element from cleanupQueue and the outstanding-request update
// announced by the comment below; confirm against the full file.
1367 while (!cleanupQueue
.empty()) {
1368 Addr cleanup_addr
= cleanupQueue
.front();
// Free the event that was registered for this address.
1372 TLBEvent
* old_tlb_event
= translationReturnEvent
[cleanup_addr
];
1373 delete old_tlb_event
;
1374 translationReturnEvent
.erase(cleanup_addr
);
1376 // update number of outstanding requests
1380 /** the higher level coalescer should retry if it has
1381 * any pending requests.
1383 for (int i
= 0; i
< cpuSidePort
.size(); ++i
) {
1384 cpuSidePort
[i
]->sendRetryReq();
// Records one access to virt_page_addr in TLBFootprint.
// A zero-initialised AccessInfo is inserted for the page; ret.second reports
// whether the page was newly inserted. The time since lastTimeAccessed is
// added to totalReuseDistance, accessesPerPage is incremented and
// lastTimeAccessed is set to the current tick. When accessDistance is set,
// the current localNumTLBAccesses value is appended to the page's
// localTLBAccesses.
// NOTE(review): lines between the first_page_access test and the reuse
// distance update are missing from this listing; presumably the update sits
// in the else arm (i.e. only for pages seen before) - confirm.
1389 GpuTLB::updatePageFootprint(Addr virt_page_addr
)
1392 std::pair
<AccessPatternTable::iterator
, bool> ret
;
// Template entry used only if the page is not yet in the table.
1394 AccessInfo tmp_access_info
;
1395 tmp_access_info
.lastTimeAccessed
= 0;
1396 tmp_access_info
.accessesPerPage
= 0;
1397 tmp_access_info
.totalReuseDistance
= 0;
1398 tmp_access_info
.sumDistance
= 0;
1399 tmp_access_info
.meanDistance
= 0;
// insert() leaves an existing entry untouched and returns an iterator to it.
1401 ret
= TLBFootprint
.insert(
1402 AccessPatternTable::value_type(virt_page_addr
, tmp_access_info
));
1404 bool first_page_access
= ret
.second
;
1406 if (first_page_access
) {
// NOTE(review): accessed_before is an int while it holds a difference of
// curTick() values - check whether this can truncate for long simulations.
1409 int accessed_before
;
1410 accessed_before
= curTick() - ret
.first
->second
.lastTimeAccessed
;
1411 ret
.first
->second
.totalReuseDistance
+= accessed_before
;
1414 ret
.first
->second
.accessesPerPage
++;
1415 ret
.first
->second
.lastTimeAccessed
= curTick();
// Keep the raw access counter per access; exitCallback() later turns these
// samples into per-page access distances.
1417 if (accessDistance
) {
1418 ret
.first
->second
.localTLBAccesses
1419 .push_back(localNumTLBAccesses
.value());
// Exit-time reporting of the page footprint collected in TLBFootprint.
// When accessDistance is set, a CSV stream is created through simout and one
// row per page (page address, max, mean and stddev of the access distance) is
// written. For every page the quotient totalReuseDistance / accessesPerPage
// is accumulated and finally divided by the number of pages. The map is
// cleared at the end.
// NOTE(review): several original lines are missing from this listing (e.g.
// the left-hand sides of some assignments and part of the first loop body).
1424 GpuTLB::exitCallback()
1426 std::ostream
*page_stat_file
= nullptr;
1428 if (accessDistance
) {
1430 // print per page statistics to a separate file (.csv format)
1431 // simout is the gem5 output directory (default is m5out or the one
1432 // specified with -d)
1433 page_stat_file
= simout
.create(name().c_str())->stream();
// CSV header row (the stream expression it is written to is on a missing line).
1437 << "page,max_access_distance,mean_access_distance, "
1438 << "stddev_distance" << std::endl
;
1441 // update avg. reuse distance footprint
1442 unsigned int sum_avg_reuse_distance_per_page
= 0;
1444 // iterate through all pages seen by this TLB
1445 for (auto &iter
: TLBFootprint
) {
// NOTE(review): divides by accessesPerPage without a visible zero check;
// updatePageFootprint increments it on every recorded access - confirm that
// no entry can reach this point with a zero count.
1446 sum_avg_reuse_distance_per_page
+= iter
.second
.totalReuseDistance
/
1447 iter
.second
.accessesPerPage
;
1449 if (accessDistance
) {
// NOTE(review): indexes localTLBAccesses[0] without a visible emptiness check.
1450 unsigned int tmp
= iter
.second
.localTLBAccesses
[0];
1451 unsigned int prev
= tmp
;
// First pass: turn the stored counter samples into distances and sum them.
// NOTE(review): lines of this loop body are missing from the listing
// (presumably the update of tmp between iterations) - confirm.
1453 for (int i
= 0; i
< iter
.second
.localTLBAccesses
.size(); ++i
) {
1458 prev
= iter
.second
.localTLBAccesses
[i
];
1459 // update the localTLBAccesses value
1460 // with the actual difference
1461 iter
.second
.localTLBAccesses
[i
] -= tmp
;
1462 // compute the sum of AccessDistance per page
1463 // used later for mean
1464 iter
.second
.sumDistance
+=
1465 iter
.second
.localTLBAccesses
[i
];
1468 iter
.second
.meanDistance
=
1469 iter
.second
.sumDistance
/ iter
.second
.accessesPerPage
;
1471 // compute std_dev and max (we need a second round because we
1472 // need to know the mean value)
1473 unsigned int max_distance
= 0;
1474 unsigned int stddev_distance
= 0;
// Second pass: track the maximum and accumulate squared deviations.
1476 for (int i
= 0; i
< iter
.second
.localTLBAccesses
.size(); ++i
) {
1477 unsigned int tmp_access_distance
=
1478 iter
.second
.localTLBAccesses
[i
];
1480 if (tmp_access_distance
> max_distance
) {
1481 max_distance
= tmp_access_distance
;
// Deviation from the mean (the declaration of diff is on a missing line).
// NOTE(review): stddev_distance is unsigned, so if diff is unsigned as well a
// sample below the mean would wrap around - confirm the type of diff.
1485 tmp_access_distance
- iter
.second
.meanDistance
;
1486 stddev_distance
+= pow(diff
, 2);
// Square root of the mean squared deviation (assignment target is on a
// missing line).
1491 sqrt(stddev_distance
/iter
.second
.accessesPerPage
);
// One CSV row per page: hex page address, then decimal statistics.
1493 if (page_stat_file
) {
1494 *page_stat_file
<< std::hex
<< iter
.first
<< ",";
1495 *page_stat_file
<< std::dec
<< max_distance
<< ",";
1496 *page_stat_file
<< std::dec
<< iter
.second
.meanDistance
1498 *page_stat_file
<< std::dec
<< stddev_distance
;
1499 *page_stat_file
<< std::endl
;
1502 // erase the localTLBAccesses array
1503 iter
.second
.localTLBAccesses
.clear();
// Average over all pages; skipped for an empty table to avoid dividing by
// zero (the assignment target is on a missing line).
1507 if (!TLBFootprint
.empty()) {
1509 sum_avg_reuse_distance_per_page
/ TLBFootprint
.size();
1512 //clear the TLBFootprint map
1513 TLBFootprint
.clear();
1515 } // namespace X86ISA
// Factory method of the X86GPUTLBParams parameter object: allocates a new
// X86ISA::GpuTLB constructed from this parameter object and returns it.
1518 X86GPUTLBParams::create()
1520 return new X86ISA::GpuTLB(this);