2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
36 #ifndef __GPU_TLB_HH__
37 #define __GPU_TLB_HH__
45 #include "arch/generic/tlb.hh"
46 #include "arch/x86/pagetable.hh"
47 #include "arch/x86/pagetable_walker.hh"
48 #include "arch/x86/regs/segment.hh"
49 #include "base/callback.hh"
50 #include "base/logging.hh"
51 #include "base/statistics.hh"
52 #include "gpu-compute/compute_unit.hh"
53 #include "mem/port.hh"
54 #include "mem/request.hh"
55 #include "params/X86GPUTLB.hh"
56 #include "sim/clocked_object.hh"
57 #include "sim/sim_object.hh"
65 class GpuTLB : public ClockedObject
70 typedef std::list<TlbEntry*> EntryList;
72 uint32_t configAddress;
74 // TLB clock: will inherit clock from shader's clock period in terms
75 // of number of ticks of curTime (aka global simulation clock)
76 // The assignment of TLB clock from shader clock is done in the python
81 // clock related functions ; maps to-and-from Simulation ticks and
// Returns SimClock::Frequency divided by the TLB clock period `clock`.
// NOTE(review): there is no guard against `clock` being zero on this line;
// confirm `clock` is always assigned a non-zero period before this is called.
83 Tick frequency() const { return SimClock::Frequency / clock; }
// Converts a number of TLB cycles into simulation ticks by multiplying the
// cycle count by the clock period `clock`.
// @param numCycles number of TLB cycles to convert (signed int).
86 ticks(int numCycles) const
// The (Tick) cast binds to `clock` only; the multiplication then happens
// between that converted value and numCycles.
88 return (Tick)clock * numCycles;
// Returns the current simulation tick (curTick()) divided by the TLB clock
// period `clock`, i.e. the current time expressed in TLB cycles.
// NOTE(review): the result is a cycle count but is typed as Tick.
91 Tick curCycle() const { return curTick() / clock; }
// Converts a tick value into TLB cycles by dividing it by the clock period
// `clock`.
// @param val value in simulation ticks.
// NOTE(review): the result is a cycle count but is typed as Tick.
92 Tick tickToCycles(Tick val) const { return val / clock;}
94 typedef X86GPUTLBParams Params;
95 GpuTLB(const Params *p);
98 typedef enum BaseTLB::Mode Mode;
103 virtual ~Translation() { }
106 * Signal that the translation has been delayed due to a hw page
109 virtual void markDelayed() = 0;
112 * The memory for this object may be dynamically allocated, and it
113 * may be responsible for cleaning itself up which will happen in
114 * this function. Once it's called the object is no longer valid.
116 virtual void finish(Fault fault, const RequestPtr &req,
117 ThreadContext *tc, Mode mode) = 0;
121 TlbEntry *lookup(Addr va, bool update_lru=true);
122 void setConfigAddress(uint32_t addr);
125 EntryList::iterator lookupIt(Addr va, bool update_lru=true);
130 void invalidateAll();
131 void invalidateNonGlobal();
132 void demapPage(Addr va, uint64_t asn);
140 * true if this is a fully-associative TLB
146 * Allocation Policy: true if we always allocate on a hit, false
147 * otherwise. Default is true.
149 bool allocationPolicy;
152 * if true, then this is not the last level TLB
157 * Print out accessDistance stats. One stat file
162 std::vector<TlbEntry> tlb;
165 * It's a per-set list. As long as we have not reached
166 * the full capacity of the given set, grab an entry from
169 std::vector<EntryList> freeList;
172 * An entryList per set is the equivalent of an LRU stack;
173 * it's used to guide replacement decisions. The head of the list
174 * contains the MRU TLB entry of the given set. If the freeList
175 * for this set is empty, the last element of the list
176 * is evicted (i.e., dropped on the floor).
178 std::vector<EntryList> entryList;
180 Fault translateInt(const RequestPtr &req, ThreadContext *tc);
182 Fault translate(const RequestPtr &req, ThreadContext *tc,
183 Translation *translation, Mode mode, bool &delayedResponse,
184 bool timing, int &latency);
187 // latencies for a TLB hit, miss and page fault
192 // local_stats are as seen from the TLB
193 // without taking into account coalescing
194 Stats::Scalar localNumTLBAccesses;
195 Stats::Scalar localNumTLBHits;
196 Stats::Scalar localNumTLBMisses;
197 Stats::Formula localTLBMissRate;
199 // global_stats are as seen from the
200 // CU's perspective taking into account
201 // all coalesced requests.
202 Stats::Scalar globalNumTLBAccesses;
203 Stats::Scalar globalNumTLBHits;
204 Stats::Scalar globalNumTLBMisses;
205 Stats::Formula globalTLBMissRate;
207 // from the CU perspective (global)
208 Stats::Scalar accessCycles;
209 // from the CU perspective (global)
210 Stats::Scalar pageTableCycles;
211 Stats::Scalar numUniquePages;
212 // from the perspective of this TLB
213 Stats::Scalar localCycles;
214 // from the perspective of this TLB
215 Stats::Formula localLatency;
216 // I take the avg. per page and then
217 // the avg. over all pages.
218 Stats::Scalar avgReuseDistance;
221 void updatePageFootprint(Addr virt_page_addr);
222 void printAccessPattern();
225 Fault translateAtomic(const RequestPtr &req, ThreadContext *tc,
226 Mode mode, int &latency);
228 void translateTiming(const RequestPtr &req, ThreadContext *tc,
229 Translation *translation, Mode mode,
232 Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
233 Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
235 TlbEntry *insert(Addr vpn, TlbEntry &entry);
238 virtual void serialize(CheckpointOut& cp) const;
239 virtual void unserialize(CheckpointIn& cp);
240 void issueTranslation();
// Outcome tag for a translation. It is taken by the TLBEvent constructor and
// by the translation-return handlers declared in this class.
// NOTE(review): MISS_RETURN is not described by any comment visible in this
// header; confirm its meaning against the implementation.
241 enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
242 bool tlbLookup(const RequestPtr &req,
243 ThreadContext *tc, bool update_stats);
245 void handleTranslationReturn(Addr addr, tlbOutcome outcome,
248 void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
250 void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
251 TlbEntry *tlb_entry, Mode mode);
253 void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry,
254 Addr phys_page_addr);
256 void issueTLBLookup(PacketPtr pkt);
258 // CpuSidePort is the TLB Port closer to the CPU/CU side
259 class CpuSidePort : public SlavePort
// Constructor: forwards the name and owning TLB to SlavePort and records the
// TLB pointer and the port index in the `tlb` and `index` members.
262 CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
264 : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
270 virtual bool recvTimingReq(PacketPtr pkt);
// Atomic requests are not processed here: the inline body ignores pkt and
// always returns 0.
271 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
272 virtual void recvFunctional(PacketPtr pkt);
// Range changes are ignored (empty inline body).
273 virtual void recvRangeChange() { }
274 virtual void recvReqRetry();
// A response retry is treated as a fatal condition: the inline body panics.
275 virtual void recvRespRetry() { panic("recvRespRetry called"); }
276 virtual AddrRangeList getAddrRanges() const;
280 * MemSidePort is the TLB Port closer to the memory side
281 * If this is a last level TLB then this port will not be connected.
283 * Future action item: if we ever do real page walks, then this port
284 * should be connected to a RubyPort.
286 class MemSidePort : public MasterPort
// Constructor: forwards the name and owning TLB to MasterPort and records the
// TLB pointer and the port index in the `tlb` and `index` members.
289 MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
291 : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
// Queue of packets held by this port.
// NOTE(review): presumably packets waiting to be re-sent after a failed
// send; confirm against recvReqRetry() in the implementation.
293 std::deque<PacketPtr> retries;
299 virtual bool recvTimingResp(PacketPtr pkt);
// Atomic and functional accesses and range changes are not handled on this
// port: recvAtomic always returns 0 and the other two bodies are empty.
300 virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
301 virtual void recvFunctional(PacketPtr pkt) { }
302 virtual void recvRangeChange() { }
303 virtual void recvReqRetry();
306 // TLB ports on the cpu Side
307 std::vector<CpuSidePort*> cpuSidePort;
308 // TLB ports on the memory side
309 std::vector<MemSidePort*> memSidePort;
311 Port &getPort(const std::string &if_name,
312 PortID idx=InvalidPortID) override;
315 * TLB TranslationState: this is currently somewhat of a bastardization of
316 * the usage of SenderState, whereby the receiver of a packet is not
317 * usually supposed to need to look at the contents of the senderState,
318 * you're really only supposed to look at what you pushed on, pop it
319 * off, and send it back.
321 * However, since there is state that we want to pass to the TLBs using
322 * the send/recv Timing/Functional/etc. APIs, which don't allow for new
323 * arguments, we need a common TLB senderState to pass between TLBs,
324 * both "forwards" and "backwards."
326 * So, basically, the rule is that any packet received by a TLB port
327 * (cpuside OR memside) must be safely castable to a TranslationState.
// Sender state attached to packets that travel between TLB ports; it carries
// the per-translation information listed in the members below.
330 struct TranslationState : public Packet::SenderState
332 // TLB mode, read or write
334 // Thread context associated with this req
338 * TLB entry to be populated and passed back and filled in
339 * previous TLBs. Equivalent to the data cache concept of
343 // Is this a TLB prefetch request?
345 // When was the req for this translation issued
347 // Remember where this came from
348 std::vector<SlavePort*>ports;
350 // keep track of #uncoalesced reqs per packet per TLB level;
351 // reqCnt per level >= reqCnt higher level
352 std::vector<int> reqCnt;
353 // TLB level this packet hit in; 0 if it hit in the page table
// `saved`: the sender state handed to the constructor through _saved
// (nullptr when the argument is omitted).
355 Packet::SenderState *saved;
// Constructor: stores the mode, thread context, prefetch flag and saved
// sender state. tlbEntry starts as nullptr; issueTime and hitLevel start
// at 0. `ports` and `reqCnt` are not in the initializer list, so they are
// default-constructed (empty vectors).
// @param tlb_mode  translation mode stored in tlbMode.
// @param _tc       thread context stored in tc.
// @param _prefetch stored in prefetch; defaults to false.
// @param _saved    stored in saved; defaults to nullptr.
357 TranslationState(Mode tlb_mode, ThreadContext *_tc,
358 bool _prefetch=false,
359 Packet::SenderState *_saved=nullptr)
360 : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
361 prefetch(_prefetch), issueTime(0),
362 hitLevel(0),saved(_saved) { }
365 // maximum number of permitted coalesced requests per cycle
366 int maxCoalescedReqs;
368 // Current number of outstanding coalesced requests.
369 // Should be <= maxCoalescedReqs
373 * A TLBEvent is scheduled after the TLB lookup and helps us take the
374 * appropriate actions:
375 * (e.g., update TLB on a hit,
376 * send request to lower level TLB on a miss,
377 * or start a page walk if this was the last-level TLB).
379 void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
// Event type constructed from a GpuTLB pointer, an address and a tlbOutcome
// (see the constructor below).
382 class TLBEvent : public Event
388 * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
// NOTE(review): the tlbOutcome enum also declares MISS_RETURN, which the
// line above does not list; confirm whether it is a valid value here.
394 TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
// NOTE(review): presumably overrides Event::description(); it is not marked
// `override` here, so confirm against the Event base class.
398 const char *description() const;
400 // updateOutcome updates the tlbOutcome of a TLBEvent
401 void updateOutcome(tlbOutcome _outcome);
// Accessor returning an Addr.
// NOTE(review): by its name this is the virtual address given to the
// constructor; confirm in the implementation.
402 Addr getTLBEventVaddr();
// Table of TLBEvent pointers indexed by address.
// NOTE(review): the key is presumably the virtual page address that
// translationReturn() receives as virtPageAddr; confirm. The map stores raw
// pointers, so ownership and deletion of the events must be handled
// elsewhere; verify in the implementation.
405 std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
407 // this FIFO queue keeps track of the virt. page addresses
408 // that are pending cleanup
409 std::queue<Addr> cleanupQueue;
411 // the cleanupEvent is scheduled after a TLBEvent triggers in order to
412 // free memory and do the required clean-up
415 EventFunctionWrapper cleanupEvent;
418 * This hash map will use the virtual page address as a key
419 * and will keep track of total number of accesses per page
424 unsigned int lastTimeAccessed; // last access to this page
425 unsigned int accessesPerPage;
426 // need to divide it by accessesPerPage at the end
427 unsigned int totalReuseDistance;
430 * The field below will help us compute the access distance,
431 * that is the number of (coalesced) TLB accesses that
432 * happened in between each access to this page
434 * localTLBAccesses[x] is the value of localTLBNumAccesses
435 * when the page <Addr> was accessed for the <x>th time
437 std::vector<unsigned int> localTLBAccesses;
438 unsigned int sumDistance;
439 unsigned int meanDistance;
442 typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
443 AccessPatternTable TLBFootprint;
445 // Called at the end of simulation to dump page access stats.
448 EventFunctionWrapper exitEvent;
452 #endif // __GPU_TLB_HH__