2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
36 #include "gpu-compute/gpu_tlb.hh"
41 #include "arch/x86/faults.hh"
42 #include "arch/x86/insts/microldstop.hh"
43 #include "arch/x86/pagetable.hh"
44 #include "arch/x86/pagetable_walker.hh"
45 #include "arch/x86/regs/misc.hh"
46 #include "arch/x86/regs/msr.hh"
47 #include "arch/x86/x86_traits.hh"
48 #include "base/bitfield.hh"
49 #include "base/logging.hh"
50 #include "base/output.hh"
51 #include "base/trace.hh"
52 #include "cpu/base.hh"
53 #include "cpu/thread_context.hh"
54 #include "debug/GPUPrefetch.hh"
55 #include "debug/GPUTLB.hh"
56 #include "mem/packet_access.hh"
57 #include "mem/page_table.hh"
58 #include "mem/request.hh"
59 #include "sim/process.hh"
60 #include "sim/pseudo_inst.hh"
65 GpuTLB::GpuTLB(const Params
*p
)
66 : ClockedObject(p
), configAddress(0), size(p
->size
),
67 cleanupEvent([this]{ cleanup(); }, name(), false,
69 exitEvent([this]{ exitCallback(); }, name())
72 assert(assoc
<= size
);
74 allocationPolicy
= p
->allocationPolicy
;
75 hasMemSidePort
= false;
76 accessDistance
= p
->accessDistance
;
77 clock
= p
->clk_domain
->clockPeriod();
79 tlb
.assign(size
, TlbEntry());
81 freeList
.resize(numSets
);
82 entryList
.resize(numSets
);
84 for (int set
= 0; set
< numSets
; ++set
) {
85 for (int way
= 0; way
< assoc
; ++way
) {
86 int x
= set
* assoc
+ way
;
87 freeList
[set
].push_back(&tlb
.at(x
));
94 * @warning: the set-associative version assumes you have a
95 * fixed page size of 4KB.
96 * If the page size is greather than 4KB (as defined in the
97 * TheISA::PageBytes), then there are various issues w/ the current
98 * implementation (you'd have the same 8KB page being replicated in
101 setMask
= numSets
- 1;
103 maxCoalescedReqs
= p
->maxOutstandingReqs
;
105 // Do not allow maxCoalescedReqs to be more than the TLB associativity
106 if (maxCoalescedReqs
> assoc
) {
107 maxCoalescedReqs
= assoc
;
108 cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc
);
112 hitLatency
= p
->hitLatency
;
113 missLatency1
= p
->missLatency1
;
114 missLatency2
= p
->missLatency2
;
116 // create the slave ports based on the number of connected ports
117 for (size_t i
= 0; i
< p
->port_slave_connection_count
; ++i
) {
118 cpuSidePort
.push_back(new CpuSidePort(csprintf("%s-port%d",
119 name(), i
), this, i
));
122 // create the master ports based on the number of connected ports
123 for (size_t i
= 0; i
< p
->port_master_connection_count
; ++i
) {
124 memSidePort
.push_back(new MemSidePort(csprintf("%s-port%d",
125 name(), i
), this, i
));
129 // fixme: this is never called?
132 // make sure all the hash-maps are empty
133 assert(translationReturnEvent
.empty());
137 GpuTLB::getPort(const std::string
&if_name
, PortID idx
)
139 if (if_name
== "slave") {
140 if (idx
>= static_cast<PortID
>(cpuSidePort
.size())) {
141 panic("TLBCoalescer::getPort: unknown index %d\n", idx
);
144 return *cpuSidePort
[idx
];
145 } else if (if_name
== "master") {
146 if (idx
>= static_cast<PortID
>(memSidePort
.size())) {
147 panic("TLBCoalescer::getPort: unknown index %d\n", idx
);
150 hasMemSidePort
= true;
152 return *memSidePort
[idx
];
154 panic("TLBCoalescer::getPort: unknown port %s\n", if_name
);
159 GpuTLB::insert(Addr vpn
, TlbEntry
&entry
)
161 TlbEntry
*newEntry
= nullptr;
164 * vpn holds the virtual page address
165 * The least significant bits are simply masked
167 int set
= (vpn
>> TheISA::PageShift
) & setMask
;
169 if (!freeList
[set
].empty()) {
170 newEntry
= freeList
[set
].front();
171 freeList
[set
].pop_front();
173 newEntry
= entryList
[set
].back();
174 entryList
[set
].pop_back();
178 newEntry
->vaddr
= vpn
;
179 entryList
[set
].push_front(newEntry
);
184 GpuTLB::EntryList::iterator
185 GpuTLB::lookupIt(Addr va
, bool update_lru
)
187 int set
= (va
>> TheISA::PageShift
) & setMask
;
193 auto entry
= entryList
[set
].begin();
194 for (; entry
!= entryList
[set
].end(); ++entry
) {
195 int page_size
= (*entry
)->size();
197 if ((*entry
)->vaddr
<= va
&& (*entry
)->vaddr
+ page_size
> va
) {
198 DPRINTF(GPUTLB
, "Matched vaddr %#x to entry starting at %#x "
199 "with size %#x.\n", va
, (*entry
)->vaddr
, page_size
);
202 entryList
[set
].push_front(*entry
);
203 entryList
[set
].erase(entry
);
204 entry
= entryList
[set
].begin();
215 GpuTLB::lookup(Addr va
, bool update_lru
)
217 int set
= (va
>> TheISA::PageShift
) & setMask
;
219 auto entry
= lookupIt(va
, update_lru
);
221 if (entry
== entryList
[set
].end())
228 GpuTLB::invalidateAll()
230 DPRINTF(GPUTLB
, "Invalidating all entries.\n");
232 for (int i
= 0; i
< numSets
; ++i
) {
233 while (!entryList
[i
].empty()) {
234 TlbEntry
*entry
= entryList
[i
].front();
235 entryList
[i
].pop_front();
236 freeList
[i
].push_back(entry
);
242 GpuTLB::setConfigAddress(uint32_t addr
)
244 configAddress
= addr
;
248 GpuTLB::invalidateNonGlobal()
250 DPRINTF(GPUTLB
, "Invalidating all non global entries.\n");
252 for (int i
= 0; i
< numSets
; ++i
) {
253 for (auto entryIt
= entryList
[i
].begin();
254 entryIt
!= entryList
[i
].end();) {
255 if (!(*entryIt
)->global
) {
256 freeList
[i
].push_back(*entryIt
);
257 entryList
[i
].erase(entryIt
++);
266 GpuTLB::demapPage(Addr va
, uint64_t asn
)
269 int set
= (va
>> TheISA::PageShift
) & setMask
;
270 auto entry
= lookupIt(va
, false);
272 if (entry
!= entryList
[set
].end()) {
273 freeList
[set
].push_back(*entry
);
274 entryList
[set
].erase(entry
);
284 localMiscRegAccess(bool read
, MiscRegIndex regNum
,
285 ThreadContext
*tc
, PacketPtr pkt
)
288 RegVal data
= htole(tc
->readMiscReg(regNum
));
289 // Make sure we don't trot off the end of data.
290 pkt
->setData((uint8_t *)&data
);
292 RegVal data
= htole(tc
->readMiscRegNoEffect(regNum
));
293 tc
->setMiscReg(regNum
, letoh(data
));
298 } // anonymous namespace
301 GpuTLB::translateInt(bool read
, const RequestPtr
&req
, ThreadContext
*tc
)
303 DPRINTF(GPUTLB
, "Addresses references internal memory.\n");
304 Addr vaddr
= req
->getVaddr();
305 Addr prefix
= (vaddr
>> 3) & IntAddrPrefixMask
;
307 if (prefix
== IntAddrPrefixCPUID
) {
308 panic("CPUID memory space not yet implemented!\n");
309 } else if (prefix
== IntAddrPrefixMSR
) {
310 vaddr
= (vaddr
>> 3) & ~IntAddrPrefixMask
;
313 if (!msrAddrToIndex(regNum
, vaddr
))
314 return std::make_shared
<GeneralProtection
>(0);
316 req
->setLocalAccessor(
317 [read
,regNum
,vaddr
](ThreadContext
*tc
, PacketPtr pkt
)
319 return localMiscRegAccess(read
, regNum
, tc
, pkt
);
324 } else if (prefix
== IntAddrPrefixIO
) {
325 // TODO If CPL > IOPL or in virtual mode, check the I/O permission
326 // bitmap in the TSS.
328 Addr IOPort
= vaddr
& ~IntAddrPrefixMask
;
329 // Make sure the address fits in the expected 16 bit IO address
331 assert(!(IOPort
& ~0xFFFF));
332 if (IOPort
== 0xCF8 && req
->getSize() == 4) {
333 req
->setLocalAccessor(
334 [read
](ThreadContext
*tc
, PacketPtr pkt
)
336 return localMiscRegAccess(
337 read
, MISCREG_PCI_CONFIG_ADDRESS
, tc
, pkt
);
340 } else if ((IOPort
& ~mask(2)) == 0xCFC) {
341 req
->setFlags(Request::UNCACHEABLE
| Request::STRICT_ORDER
);
343 tc
->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS
);
344 if (bits(configAddress
, 31, 31)) {
345 req
->setPaddr(PhysAddrPrefixPciConfig
|
346 mbits(configAddress
, 30, 2) |
349 req
->setPaddr(PhysAddrPrefixIO
| IOPort
);
352 req
->setFlags(Request::UNCACHEABLE
| Request::STRICT_ORDER
);
353 req
->setPaddr(PhysAddrPrefixIO
| IOPort
);
357 panic("Access to unrecognized internal address space %#x.\n",
363 * TLB_lookup will only perform a TLB lookup returning true on a TLB hit
364 * and false on a TLB miss.
365 * Many of the checks about different modes have been converted to
366 * assertions, since these parts of the code are not really used.
367 * On a hit it will update the LRU stack.
370 GpuTLB::tlbLookup(const RequestPtr
&req
,
371 ThreadContext
*tc
, bool update_stats
)
373 bool tlb_hit
= false;
375 uint32_t flags
= req
->getFlags();
376 int seg
= flags
& SegmentFlagMask
;
379 assert(seg
!= SEGMENT_REG_MS
);
380 Addr vaddr
= req
->getVaddr();
381 DPRINTF(GPUTLB
, "TLB Lookup for vaddr %#x.\n", vaddr
);
382 HandyM5Reg m5Reg
= tc
->readMiscRegNoEffect(MISCREG_M5_REG
);
385 DPRINTF(GPUTLB
, "In protected mode.\n");
386 // make sure we are in 64-bit mode
387 assert(m5Reg
.mode
== LongMode
);
389 // If paging is enabled, do the translation.
391 DPRINTF(GPUTLB
, "Paging enabled.\n");
392 //update LRU stack on a hit
393 TlbEntry
*entry
= lookup(vaddr
, true);
399 // functional tlb access for memory initialization
400 // i.e., memory seeding or instr. seeding -> don't update
405 localNumTLBAccesses
++;
419 GpuTLB::translate(const RequestPtr
&req
, ThreadContext
*tc
,
420 Translation
*translation
, Mode mode
,
421 bool &delayedResponse
, bool timing
, int &latency
)
423 uint32_t flags
= req
->getFlags();
424 int seg
= flags
& SegmentFlagMask
;
425 bool storeCheck
= flags
& (StoreCheck
<< FlagShift
);
427 // If this is true, we're dealing with a request
428 // to a non-memory address space.
429 if (seg
== SEGMENT_REG_MS
) {
430 return translateInt(mode
== Mode::Read
, req
, tc
);
433 delayedResponse
= false;
434 Addr vaddr
= req
->getVaddr();
435 DPRINTF(GPUTLB
, "Translating vaddr %#x.\n", vaddr
);
437 HandyM5Reg m5Reg
= tc
->readMiscRegNoEffect(MISCREG_M5_REG
);
439 // If protected mode has been enabled...
441 DPRINTF(GPUTLB
, "In protected mode.\n");
442 // If we're not in 64-bit mode, do protection/limit checks
443 if (m5Reg
.mode
!= LongMode
) {
444 DPRINTF(GPUTLB
, "Not in long mode. Checking segment "
447 // Check for a null segment selector.
448 if (!(seg
== SEGMENT_REG_TSG
|| seg
== SYS_SEGMENT_REG_IDTR
||
449 seg
== SEGMENT_REG_HS
|| seg
== SEGMENT_REG_LS
)
450 && !tc
->readMiscRegNoEffect(MISCREG_SEG_SEL(seg
))) {
451 return std::make_shared
<GeneralProtection
>(0);
454 bool expandDown
= false;
455 SegAttr attr
= tc
->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg
));
457 if (seg
>= SEGMENT_REG_ES
&& seg
<= SEGMENT_REG_HS
) {
458 if (!attr
.writable
&& (mode
== BaseTLB::Write
||
460 return std::make_shared
<GeneralProtection
>(0);
462 if (!attr
.readable
&& mode
== BaseTLB::Read
)
463 return std::make_shared
<GeneralProtection
>(0);
465 expandDown
= attr
.expandDown
;
469 Addr base
= tc
->readMiscRegNoEffect(MISCREG_SEG_BASE(seg
));
470 Addr limit
= tc
->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg
));
471 // This assumes we're not in 64 bit mode. If we were, the
472 // default address size is 64 bits, overridable to 32.
474 bool sizeOverride
= (flags
& (AddrSizeFlagBit
<< FlagShift
));
475 SegAttr csAttr
= tc
->readMiscRegNoEffect(MISCREG_CS_ATTR
);
477 if ((csAttr
.defaultSize
&& sizeOverride
) ||
478 (!csAttr
.defaultSize
&& !sizeOverride
)) {
482 Addr offset
= bits(vaddr
- base
, size
- 1, 0);
483 Addr endOffset
= offset
+ req
->getSize() - 1;
486 DPRINTF(GPUTLB
, "Checking an expand down segment.\n");
487 warn_once("Expand down segments are untested.\n");
489 if (offset
<= limit
|| endOffset
<= limit
)
490 return std::make_shared
<GeneralProtection
>(0);
492 if (offset
> limit
|| endOffset
> limit
)
493 return std::make_shared
<GeneralProtection
>(0);
497 // If paging is enabled, do the translation.
499 DPRINTF(GPUTLB
, "Paging enabled.\n");
500 // The vaddr already has the segment base applied.
501 TlbEntry
*entry
= lookup(vaddr
);
502 localNumTLBAccesses
++;
507 latency
= missLatency1
;
511 fatal("GpuTLB doesn't support full-system mode\n");
513 DPRINTF(GPUTLB
, "Handling a TLB miss for address %#x "
514 "at pc %#x.\n", vaddr
, tc
->instAddr());
516 Process
*p
= tc
->getProcessPtr();
517 const EmulationPageTable::Entry
*pte
=
518 p
->pTable
->lookup(vaddr
);
520 if (!pte
&& mode
!= BaseTLB::Execute
) {
521 // penalize a "page fault" more
523 latency
+= missLatency2
;
525 if (p
->fixupFault(vaddr
))
526 pte
= p
->pTable
->lookup(vaddr
);
530 return std::make_shared
<PageFault
>(vaddr
, true,
534 Addr alignedVaddr
= p
->pTable
->pageAlign(vaddr
);
536 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n",
537 alignedVaddr
, pte
->paddr
);
539 TlbEntry
gpuEntry(p
->pid(), alignedVaddr
,
540 pte
->paddr
, false, false);
541 entry
= insert(alignedVaddr
, gpuEntry
);
544 DPRINTF(GPUTLB
, "Miss was serviced.\n");
550 latency
= hitLatency
;
554 // Do paging protection checks.
555 bool inUser
= (m5Reg
.cpl
== 3 &&
556 !(flags
& (CPL0FlagBit
<< FlagShift
)));
558 CR0 cr0
= tc
->readMiscRegNoEffect(MISCREG_CR0
);
559 bool badWrite
= (!entry
->writable
&& (inUser
|| cr0
.wp
));
561 if ((inUser
&& !entry
->user
) || (mode
== BaseTLB::Write
&&
563 // The page must have been present to get into the TLB in
564 // the first place. We'll assume the reserved bits are
565 // fine even though we're not checking them.
566 return std::make_shared
<PageFault
>(vaddr
, true, mode
,
570 if (storeCheck
&& badWrite
) {
571 // This would fault if this were a write, so return a page
572 // fault that reflects that happening.
573 return std::make_shared
<PageFault
>(vaddr
, true,
579 DPRINTF(GPUTLB
, "Entry found with paddr %#x, doing protection "
580 "checks.\n", entry
->paddr
);
582 int page_size
= entry
->size();
583 Addr paddr
= entry
->paddr
| (vaddr
& (page_size
- 1));
584 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, paddr
);
585 req
->setPaddr(paddr
);
587 if (entry
->uncacheable
)
588 req
->setFlags(Request::UNCACHEABLE
);
590 //Use the address which already has segmentation applied.
591 DPRINTF(GPUTLB
, "Paging disabled.\n");
592 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, vaddr
);
593 req
->setPaddr(vaddr
);
597 DPRINTF(GPUTLB
, "In real mode.\n");
598 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, vaddr
);
599 req
->setPaddr(vaddr
);
602 // Check for an access to the local APIC
604 LocalApicBase localApicBase
=
605 tc
->readMiscRegNoEffect(MISCREG_APIC_BASE
);
607 Addr baseAddr
= localApicBase
.base
* PageBytes
;
608 Addr paddr
= req
->getPaddr();
610 if (baseAddr
<= paddr
&& baseAddr
+ PageBytes
> paddr
) {
611 // Force the access to be uncacheable.
612 req
->setFlags(Request::UNCACHEABLE
);
613 req
->setPaddr(x86LocalAPICAddress(tc
->contextId(),
622 GpuTLB::translateAtomic(const RequestPtr
&req
, ThreadContext
*tc
,
623 Mode mode
, int &latency
)
625 bool delayedResponse
;
627 return GpuTLB::translate(req
, tc
, nullptr, mode
, delayedResponse
, false,
632 GpuTLB::translateTiming(const RequestPtr
&req
, ThreadContext
*tc
,
633 Translation
*translation
, Mode mode
, int &latency
)
635 bool delayedResponse
;
638 Fault fault
= GpuTLB::translate(req
, tc
, translation
, mode
,
639 delayedResponse
, true, latency
);
641 if (!delayedResponse
)
642 translation
->finish(fault
, req
, tc
, mode
);
653 GpuTLB::serialize(CheckpointOut
&cp
) const
658 GpuTLB::unserialize(CheckpointIn
&cp
)
665 ClockedObject::regStats();
668 .name(name() + ".local_TLB_accesses")
669 .desc("Number of TLB accesses")
673 .name(name() + ".local_TLB_hits")
674 .desc("Number of TLB hits")
678 .name(name() + ".local_TLB_misses")
679 .desc("Number of TLB misses")
683 .name(name() + ".local_TLB_miss_rate")
684 .desc("TLB miss rate")
688 .name(name() + ".access_cycles")
689 .desc("Cycles spent accessing this TLB level")
693 .name(name() + ".page_table_cycles")
694 .desc("Cycles spent accessing the page table")
697 localTLBMissRate
= 100 * localNumTLBMisses
/ localNumTLBAccesses
;
700 .name(name() + ".unique_pages")
701 .desc("Number of unique pages touched")
705 .name(name() + ".local_cycles")
706 .desc("Number of cycles spent in queue for all incoming reqs")
710 .name(name() + ".local_latency")
711 .desc("Avg. latency over incoming coalesced reqs")
714 localLatency
= localCycles
/ localNumTLBAccesses
;
717 .name(name() + ".global_TLB_accesses")
718 .desc("Number of TLB accesses")
722 .name(name() + ".global_TLB_hits")
723 .desc("Number of TLB hits")
727 .name(name() + ".global_TLB_misses")
728 .desc("Number of TLB misses")
732 .name(name() + ".global_TLB_miss_rate")
733 .desc("TLB miss rate")
736 globalTLBMissRate
= 100 * globalNumTLBMisses
/ globalNumTLBAccesses
;
739 .name(name() + ".avg_reuse_distance")
740 .desc("avg. reuse distance over all pages (in ticks)")
746 * Do the TLB lookup for this coalesced request and schedule
747 * another event <TLB access latency> cycles later.
751 GpuTLB::issueTLBLookup(PacketPtr pkt
)
754 assert(pkt
->senderState
);
756 Addr virt_page_addr
= roundDown(pkt
->req
->getVaddr(),
759 TranslationState
*sender_state
=
760 safe_cast
<TranslationState
*>(pkt
->senderState
);
762 bool update_stats
= !sender_state
->prefetch
;
763 ThreadContext
* tmp_tc
= sender_state
->tc
;
765 DPRINTF(GPUTLB
, "Translation req. for virt. page addr %#x\n",
768 int req_cnt
= sender_state
->reqCnt
.back();
771 accessCycles
-= (curTick() * req_cnt
);
772 localCycles
-= curTick();
773 updatePageFootprint(virt_page_addr
);
774 globalNumTLBAccesses
+= req_cnt
;
777 tlbOutcome lookup_outcome
= TLB_MISS
;
778 const RequestPtr
&tmp_req
= pkt
->req
;
780 // Access the TLB and figure out if it's a hit or a miss.
781 bool success
= tlbLookup(tmp_req
, tmp_tc
, update_stats
);
784 lookup_outcome
= TLB_HIT
;
785 // Put the entry in SenderState
786 TlbEntry
*entry
= lookup(tmp_req
->getVaddr(), false);
789 auto p
= sender_state
->tc
->getProcessPtr();
790 sender_state
->tlbEntry
=
791 new TlbEntry(p
->pid(), entry
->vaddr
, entry
->paddr
,
795 // the reqCnt has an entry per level, so its size tells us
796 // which level we are in
797 sender_state
->hitLevel
= sender_state
->reqCnt
.size();
798 globalNumTLBHits
+= req_cnt
;
802 globalNumTLBMisses
+= req_cnt
;
806 * We now know the TLB lookup outcome (if it's a hit or a miss), as well
807 * as the TLB access latency.
809 * We create and schedule a new TLBEvent which will help us take the
810 * appropriate actions (e.g., update TLB on a hit, send request to lower
811 * level TLB on a miss, or start a page walk if this was the last-level
814 TLBEvent
*tlb_event
=
815 new TLBEvent(this, virt_page_addr
, lookup_outcome
, pkt
);
817 if (translationReturnEvent
.count(virt_page_addr
)) {
818 panic("Virtual Page Address %#x already has a return event\n",
822 translationReturnEvent
[virt_page_addr
] = tlb_event
;
825 DPRINTF(GPUTLB
, "schedule translationReturnEvent @ curTick %d\n",
826 curTick() + this->ticks(hitLatency
));
828 schedule(tlb_event
, curTick() + this->ticks(hitLatency
));
831 GpuTLB::TLBEvent::TLBEvent(GpuTLB
* _tlb
, Addr _addr
, tlbOutcome tlb_outcome
,
833 : Event(CPU_Tick_Pri
), tlb(_tlb
), virtPageAddr(_addr
),
834 outcome(tlb_outcome
), pkt(_pkt
)
839 * Do Paging protection checks. If we encounter a page fault, then
840 * an assertion is fired.
843 GpuTLB::pagingProtectionChecks(ThreadContext
*tc
, PacketPtr pkt
,
844 TlbEntry
* tlb_entry
, Mode mode
)
846 HandyM5Reg m5Reg
= tc
->readMiscRegNoEffect(MISCREG_M5_REG
);
847 uint32_t flags
= pkt
->req
->getFlags();
848 bool storeCheck
= flags
& (StoreCheck
<< FlagShift
);
850 // Do paging protection checks.
851 bool inUser
= (m5Reg
.cpl
== 3 && !(flags
& (CPL0FlagBit
<< FlagShift
)));
852 CR0 cr0
= tc
->readMiscRegNoEffect(MISCREG_CR0
);
854 bool badWrite
= (!tlb_entry
->writable
&& (inUser
|| cr0
.wp
));
856 if ((inUser
&& !tlb_entry
->user
) ||
857 (mode
== BaseTLB::Write
&& badWrite
)) {
858 // The page must have been present to get into the TLB in
859 // the first place. We'll assume the reserved bits are
860 // fine even though we're not checking them.
861 panic("Page fault detected");
864 if (storeCheck
&& badWrite
) {
865 // This would fault if this were a write, so return a page
866 // fault that reflects that happening.
867 panic("Page fault detected");
872 * handleTranslationReturn is called on a TLB hit,
873 * when a TLB miss returns or when a page fault returns.
874 * The latter calls handelHit with TLB miss as tlbOutcome.
877 GpuTLB::handleTranslationReturn(Addr virt_page_addr
, tlbOutcome tlb_outcome
,
882 Addr vaddr
= pkt
->req
->getVaddr();
884 TranslationState
*sender_state
=
885 safe_cast
<TranslationState
*>(pkt
->senderState
);
887 ThreadContext
*tc
= sender_state
->tc
;
888 Mode mode
= sender_state
->tlbMode
;
890 TlbEntry
*local_entry
, *new_entry
;
892 if (tlb_outcome
== TLB_HIT
) {
893 DPRINTF(GPUTLB
, "Translation Done - TLB Hit for addr %#x\n", vaddr
);
894 local_entry
= sender_state
->tlbEntry
;
896 DPRINTF(GPUTLB
, "Translation Done - TLB Miss for addr %#x\n",
899 // We are returning either from a page walk or from a hit at a lower
900 // TLB level. The senderState should be "carrying" a pointer to the
902 new_entry
= sender_state
->tlbEntry
;
904 local_entry
= new_entry
;
906 if (allocationPolicy
) {
907 DPRINTF(GPUTLB
, "allocating entry w/ addr %#x\n",
910 local_entry
= insert(virt_page_addr
, *new_entry
);
917 * At this point the packet carries an up-to-date tlbEntry pointer
918 * in its senderState.
919 * Next step is to do the paging protection checks.
921 DPRINTF(GPUTLB
, "Entry found with vaddr %#x, doing protection checks "
922 "while paddr was %#x.\n", local_entry
->vaddr
,
925 pagingProtectionChecks(tc
, pkt
, local_entry
, mode
);
926 int page_size
= local_entry
->size();
927 Addr paddr
= local_entry
->paddr
| (vaddr
& (page_size
- 1));
928 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, paddr
);
930 // Since this packet will be sent through the cpu side slave port,
931 // it must be converted to a response pkt if it is not one already
932 if (pkt
->isRequest()) {
933 pkt
->makeTimingResponse();
936 pkt
->req
->setPaddr(paddr
);
938 if (local_entry
->uncacheable
) {
939 pkt
->req
->setFlags(Request::UNCACHEABLE
);
942 //send packet back to coalescer
943 cpuSidePort
[0]->sendTimingResp(pkt
);
944 //schedule cleanup event
945 cleanupQueue
.push(virt_page_addr
);
947 // schedule this only once per cycle.
948 // The check is required because we might have multiple translations
949 // returning the same cycle
950 // this is a maximum priority event and must be on the same cycle
951 // as the cleanup event in TLBCoalescer to avoid a race with
952 // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
953 if (!cleanupEvent
.scheduled())
954 schedule(cleanupEvent
, curTick());
958 * Here we take the appropriate actions based on the result of the
962 GpuTLB::translationReturn(Addr virtPageAddr
, tlbOutcome outcome
,
965 DPRINTF(GPUTLB
, "Triggered TLBEvent for addr %#x\n", virtPageAddr
);
967 assert(translationReturnEvent
[virtPageAddr
]);
970 TranslationState
*tmp_sender_state
=
971 safe_cast
<TranslationState
*>(pkt
->senderState
);
973 int req_cnt
= tmp_sender_state
->reqCnt
.back();
974 bool update_stats
= !tmp_sender_state
->prefetch
;
977 if (outcome
== TLB_HIT
) {
978 handleTranslationReturn(virtPageAddr
, TLB_HIT
, pkt
);
981 accessCycles
+= (req_cnt
* curTick());
982 localCycles
+= curTick();
985 } else if (outcome
== TLB_MISS
) {
987 DPRINTF(GPUTLB
, "This is a TLB miss\n");
989 accessCycles
+= (req_cnt
*curTick());
990 localCycles
+= curTick();
993 if (hasMemSidePort
) {
994 // the one cyle added here represent the delay from when we get
995 // the reply back till when we propagate it to the coalescer
998 accessCycles
+= (req_cnt
* 1);
1003 * There is a TLB below. Send the coalesced request.
1004 * We actually send the very first packet of all the
1005 * pending packets for this virtual page address.
1007 if (!memSidePort
[0]->sendTimingReq(pkt
)) {
1008 DPRINTF(GPUTLB
, "Failed sending translation request to "
1009 "lower level TLB for addr %#x\n", virtPageAddr
);
1011 memSidePort
[0]->retries
.push_back(pkt
);
1013 DPRINTF(GPUTLB
, "Sent translation request to lower level "
1014 "TLB for addr %#x\n", virtPageAddr
);
1017 //this is the last level TLB. Start a page walk
1018 DPRINTF(GPUTLB
, "Last level TLB - start a page walk for "
1019 "addr %#x\n", virtPageAddr
);
1022 pageTableCycles
-= (req_cnt
*curTick());
1024 TLBEvent
*tlb_event
= translationReturnEvent
[virtPageAddr
];
1026 tlb_event
->updateOutcome(PAGE_WALK
);
1027 schedule(tlb_event
, curTick() + ticks(missLatency2
));
1029 } else if (outcome
== PAGE_WALK
) {
1031 pageTableCycles
+= (req_cnt
*curTick());
1033 // Need to access the page table and update the TLB
1034 DPRINTF(GPUTLB
, "Doing a page walk for address %#x\n",
1037 TranslationState
*sender_state
=
1038 safe_cast
<TranslationState
*>(pkt
->senderState
);
1040 Process
*p
= sender_state
->tc
->getProcessPtr();
1041 Addr vaddr
= pkt
->req
->getVaddr();
1043 Addr alignedVaddr
= p
->pTable
->pageAlign(vaddr
);
1044 assert(alignedVaddr
== virtPageAddr
);
1046 const EmulationPageTable::Entry
*pte
= p
->pTable
->lookup(vaddr
);
1047 if (!pte
&& sender_state
->tlbMode
!= BaseTLB::Execute
&&
1048 p
->fixupFault(vaddr
)) {
1049 pte
= p
->pTable
->lookup(vaddr
);
1053 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n", alignedVaddr
,
1056 sender_state
->tlbEntry
=
1057 new TlbEntry(p
->pid(), virtPageAddr
, pte
->paddr
, false,
1060 sender_state
->tlbEntry
= nullptr;
1063 handleTranslationReturn(virtPageAddr
, TLB_MISS
, pkt
);
1064 } else if (outcome
== MISS_RETURN
) {
1065 /** we add an extra cycle in the return path of the translation
1066 * requests in between the various TLB levels.
1068 handleTranslationReturn(virtPageAddr
, TLB_MISS
, pkt
);
1070 panic("Unexpected TLB outcome %d", outcome
);
1075 GpuTLB::TLBEvent::process()
1077 tlb
->translationReturn(virtPageAddr
, outcome
, pkt
);
1081 GpuTLB::TLBEvent::description() const
1083 return "trigger translationDoneEvent";
1087 GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome
)
1093 GpuTLB::TLBEvent::getTLBEventVaddr()
1095 return virtPageAddr
;
1099 * recvTiming receives a coalesced timing request from a TLBCoalescer
1100 * and it calls issueTLBLookup()
1101 * It only rejects the packet if we have exceeded the max
1102 * outstanding number of requests for the TLB
1105 GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt
)
1107 if (tlb
->outstandingReqs
< tlb
->maxCoalescedReqs
) {
1108 tlb
->issueTLBLookup(pkt
);
1109 // update number of outstanding translation requests
1110 tlb
->outstandingReqs
++;
1113 DPRINTF(GPUTLB
, "Reached maxCoalescedReqs number %d\n",
1114 tlb
->outstandingReqs
);
1120 * handleFuncTranslationReturn is called on a TLB hit,
1121 * when a TLB miss returns or when a page fault returns.
1122 * It updates LRU, inserts the TLB entry on a miss
1123 * depending on the allocation policy and does the required
1124 * protection checks. It does NOT create a new packet to
1125 * update the packet's addr; this is done in hsail-gpu code.
1128 GpuTLB::handleFuncTranslationReturn(PacketPtr pkt
, tlbOutcome tlb_outcome
)
1130 TranslationState
*sender_state
=
1131 safe_cast
<TranslationState
*>(pkt
->senderState
);
1133 ThreadContext
*tc
= sender_state
->tc
;
1134 Mode mode
= sender_state
->tlbMode
;
1135 Addr vaddr
= pkt
->req
->getVaddr();
1137 TlbEntry
*local_entry
, *new_entry
;
1139 if (tlb_outcome
== TLB_HIT
) {
1140 DPRINTF(GPUTLB
, "Functional Translation Done - TLB hit for addr "
1143 local_entry
= sender_state
->tlbEntry
;
1145 DPRINTF(GPUTLB
, "Functional Translation Done - TLB miss for addr "
1148 // We are returning either from a page walk or from a hit at a lower
1149 // TLB level. The senderState should be "carrying" a pointer to the
1150 // correct TLBEntry.
1151 new_entry
= sender_state
->tlbEntry
;
1153 local_entry
= new_entry
;
1155 if (allocationPolicy
) {
1156 Addr virt_page_addr
= roundDown(vaddr
, TheISA::PageBytes
);
1158 DPRINTF(GPUTLB
, "allocating entry w/ addr %#x\n",
1161 local_entry
= insert(virt_page_addr
, *new_entry
);
1164 assert(local_entry
);
1167 DPRINTF(GPUTLB
, "Entry found with vaddr %#x, doing protection checks "
1168 "while paddr was %#x.\n", local_entry
->vaddr
,
1169 local_entry
->paddr
);
1172 * Do paging checks if it's a normal functional access. If it's for a
1173 * prefetch, then sometimes you can try to prefetch something that
1174 * won't pass protection. We don't actually want to fault becuase there
1175 * is no demand access to deem this a violation. Just put it in the
1176 * TLB and it will fault if indeed a future demand access touches it in
1179 * This feature could be used to explore security issues around
1180 * speculative memory accesses.
1182 if (!sender_state
->prefetch
&& sender_state
->tlbEntry
)
1183 pagingProtectionChecks(tc
, pkt
, local_entry
, mode
);
1185 int page_size
= local_entry
->size();
1186 Addr paddr
= local_entry
->paddr
| (vaddr
& (page_size
- 1));
1187 DPRINTF(GPUTLB
, "Translated %#x -> %#x.\n", vaddr
, paddr
);
1189 pkt
->req
->setPaddr(paddr
);
1191 if (local_entry
->uncacheable
)
1192 pkt
->req
->setFlags(Request::UNCACHEABLE
);
1195 // This is used for atomic translations. Need to
1196 // make it all happen during the same cycle.
// Functional (atomic-mode) translation entry point on the CPU-side port.
// NOTE(review): this chunk is a lossy extraction — several original lines
// (braces, if-conditions, trailing arguments) are missing between the
// numbered fragments below; the surviving tokens are kept byte-identical
// and only comments are added.
1198 GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt
)
// Recover the translation bookkeeping the sender attached to the packet.
1200 TranslationState
*sender_state
=
1201 safe_cast
<TranslationState
*>(pkt
->senderState
);
1203 ThreadContext
*tc
= sender_state
->tc
;
// Prefetch translations do not count towards the TLB statistics.
1204 bool update_stats
= !sender_state
->prefetch
;
// Page-align the request's vaddr (the page-size argument to roundDown
// is on a missing line — presumably the TLB page size; TODO confirm).
1206 Addr virt_page_addr
= roundDown(pkt
->req
->getVaddr(),
// Record this access in the per-page footprint statistics.
1210 tlb
->updatePageFootprint(virt_page_addr
);
1212 // do the TLB lookup without updating the stats
1213 bool success
= tlb
->tlbLookup(pkt
->req
, tc
, update_stats
);
1214 tlbOutcome tlb_outcome
= success
? TLB_HIT
: TLB_MISS
;
1216 // functional mode means no coalescing
1217 // global metrics are the same as the local metrics
1219 tlb
->globalNumTLBAccesses
++;
// On a hit, hitLevel records how deep in the TLB hierarchy the request
// travelled (the size of the per-level request-count vector).
1222 sender_state
->hitLevel
= sender_state
->reqCnt
.size();
1223 tlb
->globalNumTLBHits
++;
// Miss path (the enclosing if/else structure is on missing lines).
1229 tlb
->globalNumTLBMisses
++;
1230 if (tlb
->hasMemSidePort
) {
1231 // there is a TLB below -> propagate down the TLB hierarchy
1232 tlb
->memSidePort
[0]->sendFunctional(pkt
);
1233 // If no valid translation from a prefetch, then just return
1234 if (sender_state
->prefetch
&& !pkt
->req
->hasPaddr())
// Last-level TLB: walk the page table ourselves.
1237 // Need to access the page table and update the TLB
1238 DPRINTF(GPUTLB
, "Doing a page walk for address %#x\n",
1241 Process
*p
= tc
->getProcessPtr();
1243 Addr vaddr
= pkt
->req
->getVaddr();
1245 Addr alignedVaddr
= p
->pTable
->pageAlign(vaddr
);
1246 assert(alignedVaddr
== virt_page_addr
);
// First page-table lookup; on failure (non-execute access) try to fix
// up the fault (e.g. demand allocation) and look up again.
1249 const EmulationPageTable::Entry
*pte
=
1250 p
->pTable
->lookup(vaddr
);
1251 if (!pte
&& sender_state
->tlbMode
!= BaseTLB::Execute
&&
1252 p
->fixupFault(vaddr
)) {
1253 pte
= p
->pTable
->lookup(vaddr
);
1256 if (!sender_state
->prefetch
) {
1257 // no PageFaults are permitted after
1258 // the second page table lookup
1261 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n", alignedVaddr
,
// Hand a fresh TLB entry for this page back to the sender.
1264 sender_state
->tlbEntry
=
1265 new TlbEntry(p
->pid(), virt_page_addr
,
1266 pte
->paddr
, false, false);
1268 // If this was a prefetch, then do the normal thing if it
1269 // was a successful translation. Otherwise, send an empty
1270 // TLB entry back so that it can be figured out as empty and
1271 // handled accordingly.
1273 DPRINTF(GPUTLB
, "Mapping %#x to %#x\n", alignedVaddr
,
1276 sender_state
->tlbEntry
=
1277 new TlbEntry(p
->pid(), virt_page_addr
,
1278 pte
->paddr
, false, false);
// Failed prefetch: signal "no translation" with a null entry.
1280 DPRINTF(GPUPrefetch
, "Prefetch failed %#x\n",
1283 sender_state
->tlbEntry
= nullptr;
// Hit path: re-read the entry from this TLB and copy it for the sender
// (further lookup arguments are on missing lines).
1290 DPRINTF(GPUPrefetch
, "Functional Hit for vaddr %#x\n",
1291 tlb
->lookup(pkt
->req
->getVaddr()));
1293 TlbEntry
*entry
= tlb
->lookup(pkt
->req
->getVaddr(),
1298 auto p
= sender_state
->tc
->getProcessPtr();
1299 sender_state
->tlbEntry
=
1300 new TlbEntry(p
->pid(), entry
->vaddr
, entry
->paddr
,
1303 // This is the function that would populate pkt->req with the paddr of
1304 // the translation. But if no translation happens (i.e Prefetch fails)
1305 // then the early returns in the above code will keep this function
1307 tlb
->handleFuncTranslationReturn(pkt
, tlb_outcome
);
1311 GpuTLB::CpuSidePort::recvReqRetry()
1313 // The CPUSidePort never sends anything but replies. No retries
1315 panic("recvReqRetry called");
1319 GpuTLB::CpuSidePort::getAddrRanges() const
1321 // currently not checked by the master
1322 AddrRangeList ranges
;
1328 * MemSidePort receives the packet back.
1329 * We need to call the handleTranslationReturn
1330 * and propagate up the hierarchy.
// NOTE(review): lossy extraction — braces, the return statement, and some
// trailing arguments are on missing lines; code tokens kept byte-identical.
1333 GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt
)
// Page-align the request's vaddr (page-size argument is on a missing line).
1335 Addr virt_page_addr
= roundDown(pkt
->req
->getVaddr(),
1338 DPRINTF(GPUTLB
, "MemSidePort recvTiming for virt_page_addr %#x\n",
// Find the pending translation-return event registered for this page.
1341 TLBEvent
*tlb_event
= tlb
->translationReturnEvent
[virt_page_addr
];
1343 assert(virt_page_addr
== tlb_event
->getTLBEventVaddr());
// The lower level answered the miss: mark the event and schedule it one
// TLB cycle from now to propagate the result back up.
1345 tlb_event
->updateOutcome(MISS_RETURN
);
1346 tlb
->schedule(tlb_event
, curTick()+tlb
->ticks(1));
1352 GpuTLB::MemSidePort::recvReqRetry()
1354 // No retries should reach the TLB. The retries
1355 // should only reach the TLBCoalescer.
1356 panic("recvReqRetry called");
// Interior of the TLB cleanup routine (the enclosing signature line is
// not visible in this chunk — presumably GpuTLB::cleanup(); TODO confirm).
// Drains cleanupQueue, destroying each page's completed translation-return
// event, then asks the CPU-side ports to retry.
1362 while (!cleanupQueue
.empty()) {
1363 Addr cleanup_addr
= cleanupQueue
.front();
// Remove and free the finished event for this page so a new miss on the
// same page can allocate a fresh one.
1367 TLBEvent
* old_tlb_event
= translationReturnEvent
[cleanup_addr
];
1368 delete old_tlb_event
;
1369 translationReturnEvent
.erase(cleanup_addr
);
1371 // update number of outstanding requests
1375 /** the higher level coalescer should retry if it has
1376 * any pending requests.
1378 for (int i
= 0; i
< cpuSidePort
.size(); ++i
) {
1379 cpuSidePort
[i
]->sendRetryReq();
// Record one access to virt_page_addr in the per-page footprint table
// (TLBFootprint). Tracks reuse distance, access count, last access time
// and — when accessDistance is enabled — the access-distance trace.
// NOTE(review): lossy extraction — braces, the insert()'s second argument
// tail and the else-branch structure are on missing lines; code tokens
// are kept byte-identical.
1384 GpuTLB::updatePageFootprint(Addr virt_page_addr
)
1387 std::pair
<AccessPatternTable::iterator
, bool> ret
;
// Zero-initialized record used when this page is seen for the first time.
1389 AccessInfo tmp_access_info
;
1390 tmp_access_info
.lastTimeAccessed
= 0;
1391 tmp_access_info
.accessesPerPage
= 0;
1392 tmp_access_info
.totalReuseDistance
= 0;
1393 tmp_access_info
.sumDistance
= 0;
1394 tmp_access_info
.meanDistance
= 0;
// map::insert returns {iterator, inserted?}; ret.second tells us whether
// this is the first access to the page.
1396 ret
= TLBFootprint
.insert(AccessPatternTable::value_type(virt_page_addr
,
1399 bool first_page_access
= ret
.second
;
1401 if (first_page_access
) {
// Re-access path (the branch split around here is on missing lines):
// reuse distance is the time elapsed since the page's last access.
1404 int accessed_before
;
1405 accessed_before
= curTick() - ret
.first
->second
.lastTimeAccessed
;
1406 ret
.first
->second
.totalReuseDistance
+= accessed_before
;
// Common bookkeeping for every access.
1409 ret
.first
->second
.accessesPerPage
++;
1410 ret
.first
->second
.lastTimeAccessed
= curTick();
// Optionally log the running local access count; exitCallback() later
// converts these samples into access distances.
1412 if (accessDistance
) {
1413 ret
.first
->second
.localTLBAccesses
1414 .push_back(localNumTLBAccesses
.value());
// End-of-simulation callback: post-processes the TLBFootprint table,
// computing per-page reuse/access-distance statistics and optionally
// dumping them to a per-TLB .csv file, then clears the table.
// NOTE(review): lossy extraction — braces, several statement tails and
// the stddev/avg assignments are on missing lines; code tokens are kept
// byte-identical.
1419 GpuTLB::exitCallback()
1421 std::ostream
*page_stat_file
= nullptr;
1423 if (accessDistance
) {
1425 // print per page statistics to a separate file (.csv format)
1426 // simout is the gem5 output directory (default is m5out or the one
1427 // specified with -d)
1428 page_stat_file
= simout
.create(name().c_str())->stream();
// CSV header row.
1431 *page_stat_file
<< "page,max_access_distance,mean_access_distance, "
1432 << "stddev_distance" << std::endl
;
1435 // update avg. reuse distance footprint
1436 AccessPatternTable::iterator iter
, iter_begin
, iter_end
;
1437 unsigned int sum_avg_reuse_distance_per_page
= 0;
1439 // iterate through all pages seen by this TLB
1440 for (iter
= TLBFootprint
.begin(); iter
!= TLBFootprint
.end(); iter
++) {
// Accumulate each page's average reuse distance (total / accesses).
1441 sum_avg_reuse_distance_per_page
+= iter
->second
.totalReuseDistance
/
1442 iter
->second
.accessesPerPage
;
1444 if (accessDistance
) {
// localTLBAccesses holds running access counts; convert them in place
// into distances relative to the page's first access (tmp).
1445 unsigned int tmp
= iter
->second
.localTLBAccesses
[0];
1446 unsigned int prev
= tmp
;
1448 for (int i
= 0; i
< iter
->second
.localTLBAccesses
.size(); ++i
) {
1453 prev
= iter
->second
.localTLBAccesses
[i
];
1454 // update the localTLBAccesses value
1455 // with the actual difference
1456 iter
->second
.localTLBAccesses
[i
] -= tmp
;
1457 // compute the sum of AccessDistance per page
1458 // used later for mean
1459 iter
->second
.sumDistance
+=
1460 iter
->second
.localTLBAccesses
[i
];
1463 iter
->second
.meanDistance
=
1464 iter
->second
.sumDistance
/ iter
->second
.accessesPerPage
;
1466 // compute std_dev and max (we need a second round because we
1467 // need to know the mean value)
1468 unsigned int max_distance
= 0;
1469 unsigned int stddev_distance
= 0;
1471 for (int i
= 0; i
< iter
->second
.localTLBAccesses
.size(); ++i
) {
1472 unsigned int tmp_access_distance
=
1473 iter
->second
.localTLBAccesses
[i
];
1475 if (tmp_access_distance
> max_distance
) {
1476 max_distance
= tmp_access_distance
;
// Squared deviation from the mean (the 'diff' declaration is on a
// missing line); stddev itself is sqrt(sum/accesses) below.
1480 tmp_access_distance
- iter
->second
.meanDistance
;
1481 stddev_distance
+= pow(diff
, 2);
1486 sqrt(stddev_distance
/iter
->second
.accessesPerPage
);
// Emit one CSV row per page: page (hex), max, mean, stddev.
1488 if (page_stat_file
) {
1489 *page_stat_file
<< std::hex
<< iter
->first
<< ",";
1490 *page_stat_file
<< std::dec
<< max_distance
<< ",";
1491 *page_stat_file
<< std::dec
<< iter
->second
.meanDistance
1493 *page_stat_file
<< std::dec
<< stddev_distance
;
1494 *page_stat_file
<< std::endl
;
1497 // erase the localTLBAccesses array
1498 iter
->second
.localTLBAccesses
.clear();
// Overall average across pages (the assignment's left-hand side is on a
// missing line); guard against division by zero on an empty footprint.
1502 if (!TLBFootprint
.empty()) {
1504 sum_avg_reuse_distance_per_page
/ TLBFootprint
.size();
1507 //clear the TLBFootprint map
1508 TLBFootprint
.clear();
1510 } // namespace X86ISA
1513 X86GPUTLBParams::create()
1515 return new X86ISA::GpuTLB(this);