src/gpu-compute/gpu_tlb.hh

   1 /*
   2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Authors: Lisa Hsu
  34  */
  35
  36 #ifndef __GPU_TLB_HH__
  37 #define __GPU_TLB_HH__
  38
  39 #include <fstream>
  40 #include <list>
  41 #include <queue>
  42 #include <string>
  43 #include <vector>
  44
  45 #include "arch/generic/tlb.hh"
  46 #include "arch/x86/pagetable.hh"
  47 #include "arch/x86/pagetable_walker.hh"
  48 #include "arch/x86/regs/segment.hh"
  49 #include "base/callback.hh"
  50 #include "base/logging.hh"
  51 #include "base/statistics.hh"
  52 #include "gpu-compute/compute_unit.hh"
  53 #include "mem/port.hh"
  54 #include "mem/request.hh"
  55 #include "params/X86GPUTLB.hh"
  56 #include "sim/clocked_object.hh"
  57 #include "sim/sim_object.hh"
  58
   59 class BaseTLB;       // named below as BaseTLB::Mode
   60 class Packet;        // named below as Packet * and Packet::SenderState
   61 class ThreadContext; // only used through pointers in this header
  62
  63 namespace X86ISA
  64 {
  65     class GpuTLB : public ClockedObject
  66     {
   67       protected:
              // Walker is a friend, so it can reach the protected members of
              // this class.
   68         friend class Walker;
   69
              // List of TlbEntry pointers; used below for the per-set free lists
              // and LRU stacks, and as the iterator type returned by lookupIt().
   70         typedef std::list<TlbEntry*> EntryList;
   71
              // Presumably written by setConfigAddress(); its use is not visible
              // in this header.
   72         uint32_t configAddress;
   73
   74         // TLB clock: inherited from the shader's clock period, expressed as
   75         // a number of ticks of curTime (aka the global simulation clock).
   76         // The assignment of TLB clock from shader clock is done in the python
   77         // config files.
              // The clock helpers declared right after this member divide by this
              // value, so it must be non-zero before any of them is called.
   78         int clock;
  79
  80       public:
  81         // clock related functions ; maps to-and-from Simulation ticks and
  82         // object clocks.
  83         Tick frequency() const { return SimClock::Frequency / clock; }
  84
  85         Tick
  86         ticks(int numCycles) const
  87         {
  88             return (Tick)clock * numCycles;
  89         }
  90
  91         Tick curCycle() const { return curTick() / clock; }
  92         Tick tickToCycles(Tick val) const { return val / clock;}
  93
   94         typedef X86GPUTLBParams Params;
              // The constructor takes a pointer to the parameter struct aliased
              // above; constructor and destructor are only declared here.
   95         GpuTLB(const Params *p);
   96         ~GpuTLB();
   97
              // Local shorthand for BaseTLB::Mode, used by the translation
              // declarations in this class.
   98         typedef enum BaseTLB::Mode Mode;
  99
  100         class Translation
  101         {
                  // Abstract completion interface for a translation: both
                  // markDelayed() and finish() are pure virtual. Pointers to this
                  // type are accepted by translate() and translateTiming().
  102           public:
  103             virtual ~Translation() { }
  104
  105             /**
  106              * Signal that the translation has been delayed due to a hw page
  107              * table walk.
  108              */
  109             virtual void markDelayed() = 0;
  110
  111             /**
  112              * The memory for this object may be dynamically allocated, and it
  113              * may be responsible for cleaning itself up which will happen in
  114              * this function. Once it's called the object is no longer valid.
                   *
                   * @param fault Fault value reported for this translation.
                   * @param req   Request that was being translated.
                   * @param tc    Thread context passed along with the request.
                   * @param mode  Access mode (BaseTLB::Mode) of the translation.
  115              */
  116             virtual void finish(Fault fault, const RequestPtr &req,
  117                                 ThreadContext *tc, Mode mode) = 0;
  118         };
  119
              // Only declarations are visible in this header; notes marked
              // "presumably" need confirming against the implementation file.
  120         void dumpAll();
              // Looks up the entry for virtual address va. update_lru defaults
              // to true; presumably it controls whether a hit is promoted in the
              // per-set LRU list.
  121         TlbEntry *lookup(Addr va, bool update_lru=true);
              // Presumably stores addr in the configAddress member.
  122         void setConfigAddress(uint32_t addr);
  123
  124       protected:
              // Same arguments as lookup(), but returns an EntryList iterator
              // instead of a TlbEntry pointer.
  125         EntryList::iterator lookupIt(Addr va, bool update_lru=true);
              // Walker pointer; getWalker() presumably returns it.
  126         Walker *walker;
  127
  128       public:
  129         Walker *getWalker();
              // Invalidation entry points; which entries each one drops is
              // not visible here beyond what the names suggest.
  130         void invalidateAll();
  131         void invalidateNonGlobal();
              // Presumably removes the mapping for va. How asn is used is not
              // visible in this header.
  132         void demapPage(Addr va, uint64_t asn);
 133
  134       protected:
              // TLB geometry. Presumably size is the total number of entries,
              // assoc the associativity and numSets the number of sets; the
              // values are assigned outside this header.
  135         int size;
  136         int assoc;
  137         int numSets;
  138
  139         /**
  140          *  true if this is a fully-associative TLB
  141          */
  142         bool FA;
              // Address mask; presumably selects the set index -- confirm where
              // it is computed.
  143         Addr setMask;
  144
  145         /**
  146          * Allocation Policy: true if we always allocate on a hit, false
  147          * otherwise. Default is true.
  148          */
  149         bool allocationPolicy;
  150
  151         /**
  152          * if true, then this is not the last level TLB
  153          */
  154         bool hasMemSidePort;
  155
  156         /**
  157          * Print out accessDistance stats. One stat file
  158          * per TLB.
  159          */
  160         bool accessDistance;
  161
              // Storage for the TlbEntry objects themselves. freeList and
              // entryList below hold TlbEntry pointers (EntryList), presumably
              // pointing into this vector.
  162         std::vector<TlbEntry> tlb;
  163
  164         /*
  165          * It's a per-set list. As long as we have not reached
  166          * the full capacity of the given set, grab an entry from
  167          * the freeList.
  168          */
  169         std::vector<EntryList> freeList;
  170
  171         /**
  172          * An entryList per set is the equivalent of an LRU stack;
  173          * it's used to guide replacement decisions. The head of the list
  174          * contains the MRU TLB entry of the given set. If the freeList
  175          * for this set is empty, the last element of the list
  176          * is evicted (i.e., dropped on the floor).
  177          */
  178         std::vector<EntryList> entryList;
  179
              // Returns a Fault for the given request and thread context. What
              // the "Int" suffix stands for is not visible in this header.
  180         Fault translateInt(const RequestPtr &req, ThreadContext *tc);
  181
              // Full-argument translation routine; translateAtomic() and
              // translateTiming() take subsets of these arguments and presumably
              // forward to it. delayedResponse and latency are non-const
              // references, so the callee can write results back through them.
  182         Fault translate(const RequestPtr &req, ThreadContext *tc,
  183                 Translation *translation, Mode mode, bool &delayedResponse,
  184                 bool timing, int &latency);
 185
  186       public:
  187         // latencies for a TLB hit, miss and page fault
              // NOTE(review): which of missLatency1/missLatency2 is the miss
              // latency and which the page-fault latency is not visible here.
  188         int hitLatency;
  189         int missLatency1;
  190         int missLatency2;
  191
  192         // local_stats are as seen from the TLB
  193         // without taking into account coalescing
  194         Stats::Scalar localNumTLBAccesses;
  195         Stats::Scalar localNumTLBHits;
  196         Stats::Scalar localNumTLBMisses;
              // Formula stat; presumably misses over accesses -- the expression
              // is defined outside this header.
  197         Stats::Formula localTLBMissRate;
  198
  199         // global_stats are as seen from the
  200         // CU's perspective taking into account
  201         // all coalesced requests.
  202         Stats::Scalar globalNumTLBAccesses;
  203         Stats::Scalar globalNumTLBHits;
  204         Stats::Scalar globalNumTLBMisses;
              // Global counterpart of localTLBMissRate (expression not shown).
  205         Stats::Formula globalTLBMissRate;
  206
  207         // from the CU perspective (global)
  208         Stats::Scalar accessCycles;
  209         // from the CU perspective (global)
  210         Stats::Scalar pageTableCycles;
              // Presumably the number of distinct pages recorded through
              // updatePageFootprint() -- confirm.
  211         Stats::Scalar numUniquePages;
  212         // from the perspective of this TLB
  213         Stats::Scalar localCycles;
  214         // from the perspective of this TLB
  215         Stats::Formula localLatency;
  216         // Average reuse distance: the average is taken per page first and
  217         // then over all pages.
  218         Stats::Scalar avgReuseDistance;
  219
              // Presumably registers the Stats members declared above; only the
              // declaration is visible here.
  220         void regStats();
              // Page-footprint bookkeeping helpers (see AccessInfo and
              // TLBFootprint further down in this class).
  221         void updatePageFootprint(Addr virt_page_addr);
  222         void printAccessPattern();
  223
  224
              // Atomic-mode translation entry point. latency is a non-const
              // reference, so a latency can be reported back to the caller.
  225         Fault translateAtomic(const RequestPtr &req, ThreadContext *tc,
  226                               Mode mode, int &latency);
  227
              // Timing-mode counterpart: returns void and takes a Translation
              // object (presumably the result is delivered through
              // Translation::finish() -- confirm).
  228         void translateTiming(const RequestPtr &req, ThreadContext *tc,
  229                              Translation *translation, Mode mode,
  230                              int &latency);
  231
              // MMU register access hooks; both return a Tick.
  232         Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
  233         Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
  234
              // Inserts entry for virtual page number vpn and returns a pointer
              // to a TlbEntry (presumably the stored copy -- confirm).
  235         TlbEntry *insert(Addr vpn, TlbEntry &entry);
  236
  237         // Checkpointing
              // NOTE(review): getPort() in this class is marked override while
              // these virtuals are not; if they override base-class virtuals,
              // marking them override would be consistent -- confirm first.
  238         virtual void serialize(CheckpointOut& cp) const;
  239         virtual void unserialize(CheckpointIn& cp);
  240         void issueTranslation();
              // Outcome codes taken by the translation-return handlers below and
              // stored in TLBEvent.
  241         enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
              // Returns a bool (presumably hit/miss); update_stats presumably
              // controls whether the stat counters are updated.
  242         bool tlbLookup(const RequestPtr &req,
  243                        ThreadContext *tc, bool update_stats);
  244
  245         void handleTranslationReturn(Addr addr, tlbOutcome outcome,
  246                                      PacketPtr pkt);
  247
              // Variant taking only the packet and outcome; by its name, used
              // for functional accesses.
  248         void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
  249
  250         void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
  251                                     TlbEntry *tlb_entry, Mode mode);
  252
  253         void updatePhysAddresses(Addr virt_page_addr, TlbEntry *tlb_entry,
  254                                  Addr phys_page_addr);
  255
  256         void issueTLBLookup(PacketPtr pkt);
 257
  258         // CpuSidePort is the TLB port closer to the CPU/CU side.
  259         class CpuSidePort : public SlavePort
  260         {
  261           public:
                  // Forwards _name and the owning TLB to SlavePort and records
                  // the owning TLB pointer and this port's index.
  262             CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
  263                         PortID _index)
  264                 : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
  265
  266           protected:
  267             GpuTLB *tlb; // owning TLB, set in the constructor
  268             int index;   // port index, set in the constructor
  269
                  // recvTimingReq(), recvFunctional(), recvReqRetry() and
                  // getAddrRanges() are only declared here.
  270             virtual bool recvTimingReq(PacketPtr pkt);
                  // Ignores pkt and always returns 0.
  271             virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
  272             virtual void recvFunctional(PacketPtr pkt);
                  // Range changes are ignored (empty body).
  273             virtual void recvRangeChange() { }
  274             virtual void recvReqRetry();
                  // Not expected on this port: panics if it is ever called.
  275             virtual void recvRespRetry() { panic("recvRespRetry called"); }
  276             virtual AddrRangeList getAddrRanges() const;
  277         };
 278
  279         /**
  280          * MemSidePort is the TLB port closer to the memory side.
  281          * If this is a last level TLB then this port will not be connected.
  282          *
  283          * Future action item: if we ever do real page walks, then this port
  284          * should be connected to a RubyPort.
  285          */
  286         class MemSidePort : public MasterPort
  287         {
  288           public:
                  // Forwards _name and the owning TLB to MasterPort and records
                  // the owning TLB pointer and this port's index.
  289             MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
  290                         PortID _index)
  291                 : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
  292
                  // Publicly accessible packet queue; presumably packets waiting
                  // to be re-sent on a retry -- confirm in recvReqRetry().
                  // NOTE(review): <deque> is not included directly by this header.
  293             std::deque<PacketPtr> retries;
  294
  295           protected:
  296             GpuTLB *tlb; // owning TLB, set in the constructor
  297             int index;   // port index, set in the constructor
  298
                  // recvTimingResp() and recvReqRetry() are only declared here.
  299             virtual bool recvTimingResp(PacketPtr pkt);
                  // Ignores pkt and always returns 0.
  300             virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
                  // Functional accesses and range changes are no-ops.
  301             virtual void recvFunctional(PacketPtr pkt) { }
  302             virtual void recvRangeChange() { }
  303             virtual void recvReqRetry();
  304         };
 305
  306         // TLB ports on the CPU side (raw pointers; where they are allocated
              // and freed is not visible in this header)
  307         std::vector<CpuSidePort*> cpuSidePort;
  308         // TLB ports on the memory side (raw pointers, as above)
  309         std::vector<MemSidePort*> memSidePort;
  310
              // Overrides the base-class port lookup. idx defaults to
              // InvalidPortID; presumably if_name and idx select an element of
              // the two vectors above -- confirm in the implementation.
  311         Port &getPort(const std::string &if_name,
  312                       PortID idx=InvalidPortID) override;
 313
  314         /**
  315          * TLB TranslationState: this is currently something of an abuse of
  316          * the usage of SenderState, whereby the receiver of a packet is not
  317          * usually supposed to need to look at the contents of the senderState,
  318          * you're really only supposed to look at what you pushed on, pop it
  319          * off, and send it back.
  320          *
  321          * However, since there is state that we want to pass to the TLBs using
  322          * the send/recv Timing/Functional/etc. APIs, which don't allow for new
  323          * arguments, we need a common TLB senderState to pass between TLBs,
  324          * both "forwards" and "backwards."
  325          *
  326          * So, basically, the rule is that any packet received by a TLB port
  327          * (cpuside OR memside) must be safely castable to a TranslationState.
  328          */
  329
  330         struct TranslationState : public Packet::SenderState
  331         {
  332             // TLB mode, read or write
  333             Mode tlbMode;
  334             // Thread context associated with this req
  335             ThreadContext *tc;
  336
  337             /*
  338             * TLB entry to be populated and passed back and filled in
  339             * previous TLBs.  Equivalent to the data cache concept of
  340             * "data return."
  341             */
  342             TlbEntry *tlbEntry;
  343             // Is this a TLB prefetch request?
  344             bool prefetch;
  345             // When was the req for this translation issued
  346             uint64_t issueTime;
  347             // Remember where this came from
  348             std::vector<SlavePort*>ports;
  349
  350             // keep track of #uncoalesced reqs per packet per TLB level;
  351             // reqCnt per level >= reqCnt higher level
  352             std::vector<int> reqCnt;
  353             // TLB level this packet hit in; 0 if it hit in the page table
  354             int hitLevel;
                  // Sender state handed to the constructor (nullptr by default);
                  // presumably the packet's previous sender state, kept so it can
                  // be restored later -- confirm at the call sites.
  355             Packet::SenderState *saved;
  356
                  // Stores the mode, thread context, prefetch flag and saved
                  // sender state. tlbEntry starts as nullptr, issueTime and
                  // hitLevel as 0; ports and reqCnt start out empty.
  357             TranslationState(Mode tlb_mode, ThreadContext *_tc,
  358                              bool _prefetch=false,
  359                              Packet::SenderState *_saved=nullptr)
  360                 : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
  361                   prefetch(_prefetch), issueTime(0),
  362                   hitLevel(0),saved(_saved) { }
  363         };
 364
  365         // maximum number of permitted coalesced requests per cycle
  366         int maxCoalescedReqs;
  367
  368         // Current number of outstanding coalesced requests.
  369         // Should be <= maxCoalescedReqs
  370         int outstandingReqs;
 371
  372         /**
  373          * A TLBEvent is scheduled after the TLB lookup and helps us take the
  374          * appropriate actions:
  375          *  (e.g., update TLB on a hit,
  376          *  send request to lower level TLB on a miss,
  377          *  or start a page walk if this was the last-level TLB).
               *
               * translationReturn() takes the same (virtual page address,
               * outcome, packet) triple that a TLBEvent stores; presumably it is
               * what TLBEvent::process() invokes -- confirm in the
               * implementation file.
  378          */
  379         void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
  380                                PacketPtr pkt);
 381
  382         class TLBEvent : public Event
  383         {
                      // Event carrying the owning TLB, a virtual page address,
                      // a lookup outcome and the packet being translated.
  384             private:
  385                 GpuTLB *tlb;
  386                 Addr virtPageAddr;
  387                 /**
  388                  * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
                       *
                       * NOTE(review): tlbOutcome also defines MISS_RETURN; whether
                       * an event can carry it is not visible in this header.
  389                  */
  390                 tlbOutcome outcome;
  391                 PacketPtr pkt;
  392
  393             public:
  394                 TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
  395                         PacketPtr _pkt);
  396
                      // Event callbacks; only declared here.
  397                 void process();
  398                 const char *description() const;
  399
  400                 // updateOutcome updates the tlbOutcome of a TLBEvent
  401                 void updateOutcome(tlbOutcome _outcome);
                      // Presumably returns virtPageAddr.
  402                 Addr getTLBEventVaddr();
  403         };
  404
              // Maps an address to the TLBEvent pointer associated with it;
              // presumably keyed by virtual page address, like cleanupQueue.
              // NOTE(review): std::unordered_map is used here, but this header
              // does not include <unordered_map> directly.
  405         std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
  406
  407         // this FIFO queue keeps track of the virt. page addresses
  408         // that are pending cleanup
  409         std::queue<Addr> cleanupQueue;
  410
  411         // the cleanupEvent is scheduled after a TLBEvent triggers in order to
  412         // free memory and do the required clean-up
  413         void cleanup();
  414
              // Event wrapper declared next to cleanup(); presumably bound to it
              // in the constructor.
  415         EventFunctionWrapper cleanupEvent;
 416
  417         /**
  418          * Per-page record type for the TLBFootprint hash map, which uses the
  419          * virtual page address as a key and keeps track of the total number
               * of accesses per page.
  420          */
 421
 422         struct AccessInfo
 423         {
 424             unsigned int lastTimeAccessed; // last access to this page
 425             unsigned int accessesPerPage;
 426             // need to divide it by accessesPerPage at the end
 427             unsigned int totalReuseDistance;
 428
 429             /**
 430              * The field below will help us compute the access distance,
 431              * that is the number of (coalesced) TLB accesses that
 432              * happened in between each access to this page
 433              *
 434              * localTLBAccesses[x] is the value of localTLBNumAccesses
 435              * when the page <Addr> was accessed for the <x>th time
 436              */
 437             std::vector<unsigned int> localTLBAccesses;
 438             unsigned int sumDistance;
 439             unsigned int meanDistance;
 440         };
  441
              // Hash map type from an address key (the virtual page address) to
              // that page's AccessInfo record.
  442         typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
              // The per-page access records kept by this TLB.
  443         AccessPatternTable TLBFootprint;
  444
  445         // Called at the end of simulation to dump page access stats.
  446         void exitCallback();
  447
              // Event wrapper declared next to exitCallback(); presumably bound
              // to it in the constructor.
  448         EventFunctionWrapper exitEvent;
 449     };
 450 }
 451
 452 #endif // __GPU_TLB_HH__