/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__

#include <iostream>
#include <unordered_map>

#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
#include "mem/ruby/protocol/SequencerRequestType.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "mem/token_port.hh"

class DataBlock;
class CacheMsg;
class MachineID;
class CacheMemory;

class RubyGPUCoalescerParams;

// List of packets that belong to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;

class UncoalescedTable
{
  public:
    UncoalescedTable(GPUCoalescer *gc);
    ~UncoalescedTable() {}

    void insertPacket(PacketPtr pkt);
    bool packetAvailable();
    void printRequestTable(std::stringstream& ss);

    // Modify the packets-remaining map. init sets the value iff the seqNum
    // has not been seen before; get/set act as a regular getter/setter.
    void initPacketsRemaining(InstSeqNum seqNum, int count);
    int getPacketsRemaining(InstSeqNum seqNum);
    void setPacketsRemaining(InstSeqNum seqNum, int count);

    // Returns a pointer to the list of packets corresponding to an
    // instruction in the instruction map or nullptr if there are no
    // instructions at the offset.
    PerInstPackets* getInstPackets(int offset);
    void updateResources();
    bool areRequestsDone(const InstSeqNum instSeqNum);

    // Check if a packet hasn't been removed from instMap for too long.
    // Panics if a deadlock is detected; otherwise returns nothing.
    void checkDeadlock(Tick threshold);

  private:
    GPUCoalescer *coalescer;

    // Maps an instruction's unique sequence number to a queue of packets
    // which need responses. This data structure assumes the sequence number
    // is monotonically increasing (which is true for the CU class) so that
    // packets can be issued in age order (see the iteration sketch after
    // this class).
    std::map<InstSeqNum, PerInstPackets> instMap;

    std::map<InstSeqNum, int> instPktsRemaining;
};
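
// Because instMap is a std::map keyed by InstSeqNum, in-order iteration over
// it visits instructions oldest-first, which is how age order can be honored
// when draining packets. A minimal sketch of such a walk (illustrative only;
// everything except instMap, PerInstPackets, and PacketPtr is hypothetical):
//
//     for (auto &entry : instMap) {
//         PerInstPackets &pktList = entry.second;
//         if (!pktList.empty()) {
//             PacketPtr pkt = pktList.front();
//             // try to coalesce/issue 'pkt' before any younger instruction
//         }
//     }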

class CoalescedRequest
{
  public:
    CoalescedRequest(uint64_t _seqNum)
        : seqNum(_seqNum), issueTime(Cycles(0)),
          rubyType(RubyRequestType_NULL)
    {}
    ~CoalescedRequest() {}

    void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
    void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
    void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
    void setRubyType(RubyRequestType type) { rubyType = type; }

    uint64_t getSeqNum() const { return seqNum; }
    PacketPtr getFirstPkt() const { return pkts[0]; }
    Cycles getIssueTime() const { return issueTime; }
    RubyRequestType getRubyType() const { return rubyType; }
    std::vector<PacketPtr>& getPackets() { return pkts; }

  private:
    uint64_t seqNum;
    Cycles issueTime;
    RubyRequestType rubyType;
    std::vector<PacketPtr> pkts;
};

// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requestor to mark
// that this instruction is complete.
class PendingWriteInst
{
  public:
    PendingWriteInst()
        : numPendingStores(0),
          originalPort(nullptr),
          gpuDynInstPtr(nullptr)
    {}

    ~PendingWriteInst()
    {}

    void
    addPendingReq(RubyPort::MemResponsePort* port, GPUDynInstPtr inst,
                  bool usingRubyTester)
    {
        assert(port);
        originalPort = port;

        if (!usingRubyTester) {
            gpuDynInstPtr = inst;
        }

        numPendingStores++;
    }

    // return true if no more acks are expected
    bool
    receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        numPendingStores--;
        return numPendingStores == 0;
    }

    // ack the original requestor that this write instruction is complete
    void
    ackWriteCompletion(bool usingRubyTester)
    {
        assert(numPendingStores == 0);

        // make a response packet
        PacketPtr pkt = new Packet(std::make_shared<Request>(),
                                   MemCmd::WriteCompleteResp);

        if (!usingRubyTester) {
            assert(gpuDynInstPtr);
            ComputeUnit::DataPort::SenderState* ss =
                new ComputeUnit::DataPort::SenderState
                    (gpuDynInstPtr, 0, nullptr);
            pkt->senderState = ss;
        }

        // send the ack response to the requestor
        originalPort->sendTimingResp(pkt);
    }

    int
    getNumPendingStores() {
        return numPendingStores;
    }

  private:
    // the number of stores waiting for writeCompleteCallback
    int numPendingStores;
    // The original port that sent one of the packets associated with this
    // write instruction. We may have more than one packet per instruction,
    // which implies multiple ports per instruction. However, we need only
    // one of the ports to call back the CU. Therefore, here we keep track
    // of the port that sent the first packet of this instruction.
    RubyPort::MemResponsePort* originalPort;
    // Similar to originalPort, this gpuDynInstPtr is set only for
    // the first packet of this instruction.
    GPUDynInstPtr gpuDynInstPtr;
};
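
// A minimal sketch of the intended PendingWriteInst lifecycle, assuming one
// entry per write instruction keyed by its sequence number (pendingWriteInsts
// is the GPUCoalescer member declared below; the surrounding control flow is
// illustrative, not part of this interface):
//
//     PendingWriteInst &inst = pendingWriteInsts[seqNum];
//     inst.addPendingReq(port, gpuDynInst, usingRubyTester); // per request
//     ...
//     // on each writeCompleteCallback coming back from Ruby:
//     if (inst.receiveWriteCompleteAck()) {
//         // the last outstanding store of this instruction has completed
//         inst.ackWriteCompletion(usingRubyTester);
//         pendingWriteInsts.erase(seqNum);
//     }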

class GPUCoalescer : public RubyPort
{
  public:
    class GMTokenPort : public TokenResponsePort
    {
      public:
        GMTokenPort(const std::string& name, ClockedObject *owner,
                    PortID id = InvalidPortID)
            : TokenResponsePort(name, owner, id)
        { }
        ~GMTokenPort() { }

      protected:
        Tick recvAtomic(PacketPtr) { return Tick(0); }
        void recvFunctional(PacketPtr) { }
        bool recvTimingReq(PacketPtr) { return false; }
        AddrRangeList getAddrRanges() const
        {
            AddrRangeList ranges;
            return ranges;
        }
    };

    typedef RubyGPUCoalescerParams Params;
    GPUCoalescer(const Params &);
    ~GPUCoalescer();

    Port &getPort(const std::string &if_name,
                  PortID idx = InvalidPortID) override;

    // Public Methods
    void wakeup(); // Used only for deadlock detection
    void printRequestTable(std::stringstream& ss);

    void printProgress(std::ostream& out) const;
    void resetStats() override;
    void collateStats();
    void regStats() override;

    // Each store request needs two callbacks:
    // (1) writeCallback is called when the store is received and processed
    // by the TCP. This writeCallback does not guarantee the store is
    // actually completed at its destination cache or memory. writeCallback
    // helps release hardware resources (e.g., its entry in coalescedTable)
    // allocated for the store so that subsequent requests will not be
    // blocked unnecessarily due to hardware resource constraints.
    // (2) writeCompleteCallback is called when the store is fully completed
    // at its destination cache or memory. writeCompleteCallback guarantees
    // that the store is fully completed. This callback will decrement
    // hardware counters in the CU. (A sketch of the expected sequence
    // follows the writeCompleteCallback declaration below.)
    void writeCallback(Addr address, DataBlock& data);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data,
                       Cycles initialRequestTime,
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime,
                       bool isRegion);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data,
                       Cycles initialRequestTime,
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime);

    void writeCompleteCallback(Addr address,
                               uint64_t instSeqNum,
                               MachineType mach);
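
    // A minimal sketch of the two-step store completion described above.
    // The call sites and argument names are illustrative; only the two
    // callbacks themselves are part of this interface:
    //
    //     // TCP has accepted the store: free its coalescedTable entry so
    //     // subsequent requests are not blocked on it.
    //     coalescer->writeCallback(lineAddr, dataBlk);
    //     ...
    //     // the store is complete at its destination cache/memory: let the
    //     // CU decrement its counters for instruction instSeqNum.
    //     coalescer->writeCompleteCallback(lineAddr, instSeqNum, machType);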

    void readCallback(Addr address, DataBlock& data);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data,
                      Cycles initialRequestTime,
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data,
                      Cycles initialRequestTime,
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime,
                      bool isRegion);

    /* atomics need their own callback because the data
       might be const coming from SLICC */
    virtual void atomicCallback(Addr address,
                                MachineType mach,
                                const DataBlock& data);

    RequestStatus makeRequest(PacketPtr pkt) override;
    int outstandingCount() const override { return m_outstanding_count; }

    bool
    isDeadlockEventScheduled() const override
    {
        return deadlockCheckEvent.scheduled();
    }

    void
    descheduleDeadlockEvent() override
    {
        deschedule(deadlockCheckEvent);
    }

    bool empty() const;

    void print(std::ostream& out) const;

    void evictionCallback(Addr address);
    void completeIssue();

    void insertKernel(int wavefront_id, PacketPtr pkt);

    GMTokenPort& getGMTokenPort() { return gmTokenPort; }

    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }

    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
    Stats::Histogram& getTypeLatencyHist(uint32_t t)
    { return *m_typeLatencyHist[t]; }

    Stats::Histogram& getMissLatencyHist()
    { return m_missLatencyHist; }
    Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
    { return *m_missTypeLatencyHist[t]; }

    Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
    { return *m_missMachLatencyHist[t]; }

    Stats::Histogram&
    getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
    { return *m_missTypeMachLatencyHist[r][t]; }

    Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
    { return *m_IssueToInitialDelayHist[t]; }

    Stats::Histogram&
    getInitialToForwardDelayHist(const MachineType t) const
    { return *m_InitialToForwardDelayHist[t]; }

    Stats::Histogram&
    getForwardRequestToFirstResponseHist(const MachineType t) const
    { return *m_ForwardToFirstResponseDelayHist[t]; }

    Stats::Histogram&
    getFirstResponseToCompletionDelayHist(const MachineType t) const
    { return *m_FirstResponseToCompletionDelayHist[t]; }

  protected:
    bool tryCacheAccess(Addr addr, RubyRequestType type,
                        Addr pc, RubyAccessMode access_mode,
                        int size, DataBlock*& data_ptr);

    // Since the following two issue functions are protocol-specific,
    // they must be implemented in a derived coalescer (a sketch follows
    // the declarations below).
    virtual void issueRequest(CoalescedRequest* crequest) = 0;
    virtual void issueMemSyncRequest(PacketPtr pkt) {}
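
    // A minimal sketch of what a protocol-specific coalescer is expected to
    // provide; the class name and the body comments are hypothetical, only
    // the two overridden signatures come from this interface:
    //
    //     class MyProtocolCoalescer : public GPUCoalescer
    //     {
    //       public:
    //         void issueRequest(CoalescedRequest* crequest) override
    //         {
    //             // translate crequest into a protocol request and enqueue
    //             // it to the controller's mandatory queue
    //         }
    //         void issueMemSyncRequest(PacketPtr pkt) override
    //         {
    //             // issue a protocol-level memory fence/sync, if supported
    //         }
    //     };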

    void kernelCallback(int wavefront_id);

    void hitCallback(CoalescedRequest* crequest,
                     MachineType mach,
                     DataBlock& data,
                     bool success,
                     Cycles initialRequestTime,
                     Cycles forwardRequestTime,
                     Cycles firstResponseTime,
                     bool isRegion);
    void recordMissLatency(CoalescedRequest* crequest,
                           MachineType mach,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool success, bool isRegion);
    void completeHitCallback(std::vector<PacketPtr> & mylist);

    virtual RubyRequestType getRequestType(PacketPtr pkt);

    GPUDynInstPtr getDynInst(PacketPtr pkt) const;

    // Attempt to remove a packet from the uncoalescedTable and coalesce it
    // with a previous request from the same instruction. If there is no
    // previous request and the maximum number of outstanding requests has
    // not been reached, a new coalesced request is created and added to
    // the "target" list of the coalescedTable.
    bool coalescePacket(PacketPtr pkt);
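
    // A minimal sketch of the coalescing decision described above, assuming
    // requests are grouped at cache-line granularity. makeLineAddress comes
    // from Ruby's Address.hh; curSeqNum and the exact control flow are
    // illustrative:
    //
    //     Addr line = makeLineAddress(pkt->getAddr());
    //     auto it = coalescedTable.find(line);
    //     if (it != coalescedTable.end() &&
    //         it->second.back()->getSeqNum() == curSeqNum) {
    //         it->second.back()->insertPacket(pkt);   // coalesce
    //     } else if (m_outstanding_count < m_max_outstanding_requests) {
    //         auto *creq = new CoalescedRequest(curSeqNum);
    //         creq->insertPacket(pkt);
    //         coalescedTable[line].push_back(creq);   // new target
    //     } else {
    //         return false;                           // retry later
    //     }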

    EventFunctionWrapper issueEvent;

  protected:
    int m_max_outstanding_requests;
    Cycles m_deadlock_threshold;

    CacheMemory* m_dataCache_ptr;
    CacheMemory* m_instCache_ptr;

    // coalescingWindow is the maximum number of instructions that are
    // allowed to be coalesced in a single cycle.
    int coalescingWindow;

    // The uncoalescedTable contains several "columns" which hold memory
    // request packets for an instruction. The maximum size is the number
    // of columns * the wavefront size.
    UncoalescedTable uncoalescedTable;

    // An MSHR-like structure for holding coalesced requests. The requests
    // in this table may or may not be outstanding in the memory hierarchy.
    // The maximum size is equal to the maximum number of outstanding
    // requests for a CU (typically the number of blocks in the TCP). If
    // there are duplicates of an address, they are serviced in age order.
    std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
    // Map of instruction sequence number to coalesced requests that get
    // created in coalescePacket, used in completeIssue to send the fully
    // coalesced request
    std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;

    // A map between an instruction sequence number and PendingWriteInst.
    // This is used to do a final callback for each write when it is
    // completely done in the memory system.
    std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;

    // Global outstanding request count, across all request tables
    int m_outstanding_count;
    bool m_deadlock_check_scheduled;
    std::unordered_map<int, PacketPtr> kernelEndList;
    std::vector<int> newKernelEnds;

    int m_store_waiting_on_load_cycles;
    int m_store_waiting_on_store_cycles;
    int m_load_waiting_on_store_cycles;
    int m_load_waiting_on_load_cycles;

    bool m_runningGarnetStandalone;

    EventFunctionWrapper deadlockCheckEvent;
    bool assumingRfOCoherence;

    // TODO - Need to update the following stats once the VIPER protocol
    // is re-integrated.
    // // m5 style stats for TCP hit/miss counts
    // Stats::Scalar GPU_TCPLdHits;
    // Stats::Scalar GPU_TCPLdTransfers;
    // Stats::Scalar GPU_TCCLdHits;
    // Stats::Scalar GPU_LdMiss;
    //
    // Stats::Scalar GPU_TCPStHits;
    // Stats::Scalar GPU_TCPStTransfers;
    // Stats::Scalar GPU_TCCStHits;
    // Stats::Scalar GPU_StMiss;
    //
    // Stats::Scalar CP_TCPLdHits;
    // Stats::Scalar CP_TCPLdTransfers;
    // Stats::Scalar CP_TCCLdHits;
    // Stats::Scalar CP_LdMiss;
    //
    // Stats::Scalar CP_TCPStHits;
    // Stats::Scalar CP_TCPStTransfers;
    // Stats::Scalar CP_TCCStHits;
    // Stats::Scalar CP_StMiss;

    //! Histogram for number of outstanding requests per cycle.
    Stats::Histogram m_outstandReqHist;

    //! Histogram for holding latency profile of all requests.
    Stats::Histogram m_latencyHist;
    std::vector<Stats::Histogram *> m_typeLatencyHist;

    //! Histogram for holding latency profile of all requests that
    //! miss in the controller connected to this sequencer.
    Stats::Histogram m_missLatencyHist;
    std::vector<Stats::Histogram *> m_missTypeLatencyHist;

    //! Histograms for profiling the latencies for requests that
    //! required external messages.
    std::vector<Stats::Histogram *> m_missMachLatencyHist;
    std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;

    //! Histograms for recording the breakdown of miss latency
    std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
    std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;

    // TODO - Need to update the following stats once the VIPER protocol
    // is re-integrated.
    // Stats::Distribution numHopDelays;
    // Stats::Distribution tcpToTccDelay;
    // Stats::Distribution tccToSdDelay;
    // Stats::Distribution sdToSdDelay;
    // Stats::Distribution sdToTccDelay;
    // Stats::Distribution tccToTcpDelay;
    //
    // Stats::Average avgTcpToTcc;
    // Stats::Average avgTccToSd;
    // Stats::Average avgSdToSd;
    // Stats::Average avgSdToTcc;
    // Stats::Average avgTccToTcp;

  private:
    // The token port is used to send/receive tokens to/from the GPU's global
    // memory pipeline across the port boundary. There is one per <wave size>
    // data ports in the CU.
    GMTokenPort gmTokenPort;

    // Private copy constructor and assignment operator
    GPUCoalescer(const GPUCoalescer& obj);
    GPUCoalescer& operator=(const GPUCoalescer& obj);
};

inline std::ostream&
operator<<(std::ostream& out, const GPUCoalescer& obj)
{
    obj.print(out);
    out << std::flush;
    return out;
}

#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__