/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#ifndef __GPU_TLB_HH__
#define __GPU_TLB_HH__

#include <deque>
#include <fstream>
#include <list>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>

#include "arch/generic/tlb.hh"
#include "arch/x86/pagetable.hh"
#include "arch/x86/pagetable_walker.hh"
#include "arch/x86/regs/segment.hh"
#include "base/callback.hh"
#include "base/logging.hh"
#include "base/statistics.hh"
#include "gpu-compute/compute_unit.hh"
#include "mem/mem_object.hh"
#include "mem/port.hh"
#include "mem/request.hh"
#include "params/X86GPUTLB.hh"
#include "sim/sim_object.hh"

class BaseTLB;
class Packet;
class ThreadContext;

namespace X86ISA
{
    class GpuTLB : public MemObject
    {
      protected:
        friend class Walker;

        typedef std::list<TlbEntry*> EntryList;

        uint32_t configAddress;

        // TLB clock: inherits its period from the shader's clock,
        // expressed as a number of ticks of curTick() (the global
        // simulation clock). The assignment of the TLB clock from the
        // shader clock is done in the python config files.
        int clock;

      public:
        // Clock-related functions; map between simulation ticks and
        // object cycles.
        Tick frequency() const { return SimClock::Frequency / clock; }

        Tick
        ticks(int numCycles) const
        {
            return (Tick)clock * numCycles;
        }

        Tick curCycle() const { return curTick() / clock; }
        Tick tickToCycles(Tick val) const { return val / clock; }

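        /*
         * Worked example (illustrative only; assumes gem5's default
         * global tick rate of 1 THz, i.e. 1 tick = 1 ps): if the shader
         * hands this TLB a 1 GHz clock, then clock == 1000 ticks per
         * cycle, so ticks(4) == 4000, tickToCycles(4000) == 4, and
         * frequency() == 1e12 / 1000 == 1 GHz.
         */
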
        typedef X86GPUTLBParams Params;
        GpuTLB(const Params *p);
        ~GpuTLB();

        typedef enum BaseTLB::Mode Mode;

        class Translation
        {
          public:
            virtual ~Translation() { }

            /**
             * Signal that the translation has been delayed due to a hw
             * page table walk.
             */
            virtual void markDelayed() = 0;

            /**
             * The memory for this object may be dynamically allocated,
             * and it may be responsible for cleaning itself up, which
             * will happen in this function. Once it's called, the object
             * is no longer valid.
             */
            virtual void finish(Fault fault, const RequestPtr &req,
                                ThreadContext *tc, Mode mode) = 0;
        };
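
        /*
         * Minimal sketch of a client of this interface (hypothetical,
         * for illustration only): a requester implements the two hooks
         * and hands the object to translateTiming().
         *
         *   class MyTranslation : public GpuTLB::Translation
         *   {
         *     public:
         *       void markDelayed() override { delayed = true; }
         *
         *       void finish(Fault fault, const RequestPtr &req,
         *                   ThreadContext *tc, GpuTLB::Mode mode) override
         *       {
         *           // act on the completed translation, then clean up
         *           delete this;
         *       }
         *
         *     private:
         *       bool delayed = false;
         *   };
         */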

        void dumpAll();
        TlbEntry *lookup(Addr va, bool update_lru=true);
        void setConfigAddress(uint32_t addr);

      protected:
        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
        Walker *walker;

      public:
        Walker *getWalker();
        void invalidateAll();
        void invalidateNonGlobal();
        void demapPage(Addr va, uint64_t asn);

      protected:
        int size;
        int assoc;
        int numSets;

        /**
         * true if this is a fully-associative TLB
         */
        bool FA;
        Addr setMask;
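
        /*
         * Sketch of the set-index computation (illustrative, not a
         * definitive copy of the implementation in gpu_tlb.cc): the set
         * is derived from the virtual page number, e.g.
         *
         *   Addr vpn = va >> PageShift;
         *   int set = FA ? 0 : (vpn & setMask);
         */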

        /**
         * Allocation Policy: true if we always allocate on a hit, false
         * otherwise. Default is true.
         */
        bool allocationPolicy;

        /**
         * if true, then this is not the last level TLB
         */
        bool hasMemSidePort;

        /**
         * Print out accessDistance stats. One stat file
         * per TLB.
         */
        bool accessDistance;

        std::vector<TlbEntry> tlb;

        /*
         * It's a per-set list. As long as we have not reached
         * the full capacity of the given set, grab an entry from
         * the freeList.
         */
        std::vector<EntryList> freeList;

        /**
         * An entryList per set is the equivalent of an LRU stack;
         * it's used to guide replacement decisions. The head of the list
         * contains the MRU TLB entry of the given set. If the freeList
         * for this set is empty, the last element of the list
         * is evicted (i.e., dropped on the floor).
         */
        std::vector<EntryList> entryList;
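
        /*
         * Illustrative sketch of the LRU bookkeeping implied above
         * (hypothetical code, not copied from the implementation): on a
         * hit, the entry is moved to the head of its set's list.
         *
         *   EntryList &set_list = entryList[set];
         *   auto it = std::find(set_list.begin(), set_list.end(), entry);
         *   if (it != set_list.end())
         *       set_list.splice(set_list.begin(), set_list, it);
         */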

        Fault translateInt(const RequestPtr &req, ThreadContext *tc);

        Fault translate(const RequestPtr &req, ThreadContext *tc,
                        Translation *translation, Mode mode,
                        bool &delayedResponse, bool timing, int &latency);

      public:
        // latencies for a TLB hit, miss and page fault
        int hitLatency;
        int missLatency1;
        int missLatency2;

        // local_stats are as seen from the TLB
        // without taking into account coalescing
        Stats::Scalar localNumTLBAccesses;
        Stats::Scalar localNumTLBHits;
        Stats::Scalar localNumTLBMisses;
        Stats::Formula localTLBMissRate;
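
        /*
         * The miss-rate formulas are bound in regStats(); a plausible
         * definition (a sketch mirroring the usual gem5 stats idiom,
         * not necessarily the exact code) is:
         *
         *   localTLBMissRate = 100 * localNumTLBMisses
         *                    / localNumTLBAccesses;
         */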

        // global_stats are as seen from the
        // CU's perspective taking into account
        // all coalesced requests.
        Stats::Scalar globalNumTLBAccesses;
        Stats::Scalar globalNumTLBHits;
        Stats::Scalar globalNumTLBMisses;
        Stats::Formula globalTLBMissRate;

        // from the CU perspective (global)
        Stats::Scalar accessCycles;
        // from the CU perspective (global)
        Stats::Scalar pageTableCycles;
        Stats::Scalar numUniquePages;
        // from the perspective of this TLB
        Stats::Scalar localCycles;
        // from the perspective of this TLB
        Stats::Formula localLatency;
        // the reuse distance is averaged per page first,
        // then averaged over all pages
        Stats::Scalar avgReuseDistance;

        void regStats();
        void updatePageFootprint(Addr virt_page_addr);
        void printAccessPattern();

        Fault translateAtomic(const RequestPtr &req, ThreadContext *tc,
                              Mode mode, int &latency);

        void translateTiming(const RequestPtr &req, ThreadContext *tc,
                             Translation *translation, Mode mode,
                             int &latency);

        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);

        TlbEntry *insert(Addr vpn, TlbEntry &entry);

        // Checkpointing
        virtual void serialize(CheckpointOut &cp) const;
        virtual void unserialize(CheckpointIn &cp);
        void issueTranslation();
        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
        bool tlbLookup(const RequestPtr &req,
                       ThreadContext *tc, bool update_stats);

        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
                                     PacketPtr pkt);

        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);

        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
                                    TlbEntry *tlb_entry, Mode mode);

        void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry,
                                 Addr phys_page_addr);

        void issueTLBLookup(PacketPtr pkt);

        // CpuSidePort is the TLB Port closer to the CPU/CU side
        class CpuSidePort : public SlavePort
        {
          public:
            CpuSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingReq(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt);
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
            virtual void recvRespRetry() { panic("recvRespRetry called"); }
            virtual AddrRangeList getAddrRanges() const;
        };

        /**
         * MemSidePort is the TLB Port closer to the memory side.
         * If this is a last-level TLB, then this port will not be
         * connected.
         *
         * Future action item: if we ever do real page walks, then this
         * port should be connected to a RubyPort.
         */
        class MemSidePort : public MasterPort
        {
          public:
            MemSidePort(const std::string &_name, GpuTLB *gpu_TLB,
                        PortID _index)
                : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }

            std::deque<PacketPtr> retries;

          protected:
            GpuTLB *tlb;
            int index;

            virtual bool recvTimingResp(PacketPtr pkt);
            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
            virtual void recvFunctional(PacketPtr pkt) { }
            virtual void recvRangeChange() { }
            virtual void recvReqRetry();
        };

        // TLB ports on the cpu side
        std::vector<CpuSidePort*> cpuSidePort;
        // TLB ports on the memory side
        std::vector<MemSidePort*> memSidePort;

        BaseMasterPort &getMasterPort(const std::string &if_name,
                                      PortID idx=InvalidPortID);

        BaseSlavePort &getSlavePort(const std::string &if_name,
                                    PortID idx=InvalidPortID);

        /**
         * TLB TranslationState: this is currently somewhat of a
         * bastardization of the usage of SenderState. Normally, the
         * receiver of a packet is not supposed to look at the contents
         * of senderState; you are only supposed to pop off what you
         * pushed on and send it back.
         *
         * However, since there is state that we want to pass to the TLBs
         * using the send/recv Timing/Functional/etc. APIs, which don't
         * allow for new arguments, we need a common TLB senderState to
         * pass between TLBs, both "forwards" and "backwards."
         *
         * So, basically, the rule is that any packet received by a TLB
         * port (cpu-side OR mem-side) must be safely castable to a
         * TranslationState.
         */

        struct TranslationState : public Packet::SenderState
        {
            // TLB mode, read or write
            Mode tlbMode;
            // Thread context associated with this req
            ThreadContext *tc;

            /*
             * TLB entry to be populated and passed back; it is used to
             * fill in previous (upper-level) TLBs. Equivalent to the
             * data cache concept of "data return."
             */
            TlbEntry *tlbEntry;
            // Is this a TLB prefetch request?
            bool prefetch;
            // When was the req for this translation issued
            uint64_t issueTime;
            // Remember where this came from
            std::vector<SlavePort*> ports;

            // keep track of #uncoalesced reqs per packet per TLB level;
            // reqCnt per level >= reqCnt of the higher level
            std::vector<int> reqCnt;
            // TLB level this packet hit in; 0 if it hit in the page table
            int hitLevel;
            Packet::SenderState *saved;

            TranslationState(Mode tlb_mode, ThreadContext *_tc,
                             bool _prefetch=false,
                             Packet::SenderState *_saved=nullptr)
                : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
                  prefetch(_prefetch), issueTime(0),
                  hitLevel(0), saved(_saved) { }
        };
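
        /*
         * Usage sketch (hypothetical, for illustration): the sender
         * attaches the state before issuing, and a TLB port recovers it
         * on receipt, per the castability rule stated above.
         *
         *   pkt->senderState =
         *       new TranslationState(BaseTLB::Read, tc, false,
         *                            pkt->senderState);
         *   ...
         *   auto *state =
         *       safe_cast<TranslationState*>(pkt->senderState);
         */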

        // maximum number of permitted coalesced requests per cycle
        int maxCoalescedReqs;

        // Current number of outstanding coalesced requests.
        // Should be <= maxCoalescedReqs
        int outstandingReqs;

        /**
         * A TLBEvent is scheduled after the TLB lookup and helps us take
         * the appropriate actions (e.g., update the TLB on a hit, send a
         * request to the lower-level TLB on a miss, or start a page walk
         * if this was the last-level TLB).
         */
        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                               PacketPtr pkt);

        class TLBEvent : public Event
        {
          private:
            GpuTLB *tlb;
            Addr virtPageAddr;
            /**
             * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
             */
            tlbOutcome outcome;
            PacketPtr pkt;

          public:
            TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
                     PacketPtr _pkt);

            void process();
            const char *description() const;

            // updateOutcome updates the tlbOutcome of a TLBEvent
            void updateOutcome(tlbOutcome _outcome);
            Addr getTLBEventVaddr();
        };
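
        /*
         * Scheduling sketch (hypothetical; the real code lives in
         * gpu_tlb.cc): after a lookup, an event is created per virtual
         * page, recorded in translationReturnEvent, and scheduled one
         * hit latency ahead, e.g.
         *
         *   TLBEvent *e = new TLBEvent(this, virt_page_addr,
         *                              lookup_outcome, pkt);
         *   translationReturnEvent[virt_page_addr] = e;
         *   schedule(e, curTick() + ticks(hitLatency));
         */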

        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;

        // this FIFO queue keeps track of the virt. page addresses
        // that are pending cleanup
        std::queue<Addr> cleanupQueue;

        // the cleanupEvent is scheduled after a TLBEvent triggers in
        // order to free memory and do the required clean-up
        void cleanup();

        EventFunctionWrapper cleanupEvent;

        /**
         * This hash map uses the virtual page address as a key and
         * keeps track of the total number of accesses per page.
         */
        struct AccessInfo
        {
            unsigned int lastTimeAccessed; // last access to this page
            unsigned int accessesPerPage;
            // need to divide it by accessesPerPage at the end
            unsigned int totalReuseDistance;

            /**
             * The field below will help us compute the access distance,
             * that is, the number of (coalesced) TLB accesses that
             * happened in between each access to this page.
             *
             * localTLBAccesses[x] is the value of localTLBNumAccesses
             * when the page <Addr> was accessed for the <x>th time.
             */
            std::vector<unsigned int> localTLBAccesses;
            unsigned int sumDistance;
            unsigned int meanDistance;
        };

        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
        AccessPatternTable TLBFootprint;
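
        /*
         * Sketch of how the access distance could be derived from
         * localTLBAccesses (illustrative only, assuming at least two
         * recorded accesses; the real computation is in the .cc file):
         *
         *   AccessInfo &info = TLBFootprint[virt_page_addr];
         *   for (size_t x = 1; x < info.localTLBAccesses.size(); ++x)
         *       info.sumDistance += info.localTLBAccesses[x]
         *                         - info.localTLBAccesses[x - 1];
         *   info.meanDistance =
         *       info.sumDistance / (info.localTLBAccesses.size() - 1);
         */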

        // Called at the end of simulation to dump page access stats.
        void exitCallback();

        EventFunctionWrapper exitEvent;
    };
}

#endif // __GPU_TLB_HH__