2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Author: Sooraj Puthoor
36 #ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
37 #define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
40 #include <unordered_map>
42 #include "base/statistics.hh"
43 #include "mem/protocol/HSAScope.hh"
44 #include "mem/protocol/HSASegment.hh"
45 #include "mem/protocol/PrefetchBit.hh"
46 #include "mem/protocol/RubyAccessMode.hh"
47 #include "mem/protocol/RubyRequestType.hh"
48 #include "mem/protocol/SequencerRequestType.hh"
49 #include "mem/request.hh"
50 #include "mem/ruby/common/Address.hh"
51 #include "mem/ruby/common/Consumer.hh"
52 #include "mem/ruby/system/Sequencer.hh"
// Forward declaration of the SimObject parameter class consumed by the
// GPUCoalescer constructor below.
59 class RubyGPUCoalescerParams;
// Translate a gem5 memory Request's flags into the HSA memory scope /
// segment enums used by the Ruby protocol (enums come from
// mem/protocol/HSAScope.hh and HSASegment.hh included above).
// Definitions live in the corresponding .cc file.
61 HSAScope reqScopeToHSAScope(Request* req);
62 HSASegment reqSegmentToHSASegment(Request* req);
// One outstanding (already issued) memory request tracked by the
// coalescer's read/write request tables.
// NOTE(review): this copy of the header is missing lines here (braces and
// the 'pkt'/'issue_time' member declarations that the constructor's
// initializer list references) -- compare against the upstream file.
64 struct GPUCoalescerRequest
// Ruby-level request type (load/store/atomic/...) for this entry.
67 RubyRequestType m_type;
70 GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
72 : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
// Descriptor for one not-yet-issued request awaiting coalescing: the gem5
// packet plus its primary and secondary Ruby request types (see the
// comment above CoalescingTable further down in this header).
// NOTE(review): the enclosing 'struct RequestDesc' line and its 'pkt'
// member declaration are not visible in this copy of the header.
79 RequestDesc(PacketPtr pkt, RubyRequestType p_type, RubyRequestType s_type)
80 : pkt(pkt), primaryType(p_type), secondaryType(s_type)
// Default descriptor: no packet, NULL request types.
84 RequestDesc() : pkt(nullptr), primaryType(RubyRequestType_NULL),
85 secondaryType(RubyRequestType_NULL)
90 RubyRequestType primaryType;
91 RubyRequestType secondaryType;
// Debug/trace pretty-printer for a tracked request (defined in the .cc).
94 std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
// GPUCoalescer: a RubyPort that accepts memory requests from GPU compute
// units, coalesces same-cache-line requests into a single Ruby request,
// and issues them to the Ruby memory system.  It plays the role for the
// GPU that Sequencer (mem/ruby/system/Sequencer.hh) plays for the CPU.
// NOTE(review): this copy of the header is missing a number of original
// lines (braces, public/protected/private specifiers, some parameter
// lines); the comments below describe only what is visible here.
96 class GPUCoalescer : public RubyPort
99 typedef RubyGPUCoalescerParams Params;
100 GPUCoalescer(const Params *);
// Periodic check for stuck outstanding requests (see deadlockCheckEvent).
104 void wakeup(); // Used only for deadlock detection
106 void printProgress(std::ostream& out) const;
// Completion callbacks invoked by the (SLICC-generated) cache controller
// when a write finishes.  The overloads carrying Cycles parameters also
// receive the issue/forward/first-response timestamps used to build the
// latency histograms declared at the bottom of this class.
111 void writeCallback(Addr address, DataBlock& data);
113 void writeCallback(Addr address,
117 void writeCallback(Addr address,
120 Cycles initialRequestTime,
121 Cycles forwardRequestTime,
122 Cycles firstResponseTime,
125 void writeCallback(Addr address,
128 Cycles initialRequestTime,
129 Cycles forwardRequestTime,
130 Cycles firstResponseTime);
// Same overload pattern as writeCallback, for completed reads.
132 void readCallback(Addr address, DataBlock& data);
134 void readCallback(Addr address,
138 void readCallback(Addr address,
141 Cycles initialRequestTime,
142 Cycles forwardRequestTime,
143 Cycles firstResponseTime);
145 void readCallback(Addr address,
148 Cycles initialRequestTime,
149 Cycles forwardRequestTime,
150 Cycles firstResponseTime,
152 /* atomics need their own callback because the data
153 might be const coming from SLICC */
154 void atomicCallback(Addr address,
156 const DataBlock& data);
// Stat bookkeeping for command-processor (CP) read/write completions.
158 void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
159 void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
161 // Alternate implementations in VIPER Coalescer
162 virtual RequestStatus makeRequest(PacketPtr pkt);
164 int outstandingCount() const { return m_outstanding_count; }
167 isDeadlockEventScheduled() const
169 return deadlockCheckEvent.scheduled();
173 descheduleDeadlockEvent()
175 deschedule(deadlockCheckEvent);
180 void print(std::ostream& out) const;
181 void checkCoherence(Addr address);
// Remove a completed request from its request table.
184 void removeRequest(GPUCoalescerRequest* request);
185 void evictionCallback(Addr address);
// Drains reqCoalescer/newRequests and issues the coalesced requests.
186 void completeIssue();
// Hold a kernel-end packet for the given wavefront until all of its
// outstanding memory requests have completed (see kernelEndList).
188 void insertKernel(int wavefront_id, PacketPtr pkt);
190 void recordRequestType(SequencerRequestType requestType);
// Accessors used by the protocol/stats machinery for the histograms
// declared at the bottom of this class.
191 Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
193 Stats::Histogram& getLatencyHist() { return m_latencyHist; }
194 Stats::Histogram& getTypeLatencyHist(uint32_t t)
195 { return *m_typeLatencyHist[t]; }
197 Stats::Histogram& getMissLatencyHist()
198 { return m_missLatencyHist; }
199 Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
200 { return *m_missTypeLatencyHist[t]; }
202 Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
203 { return *m_missMachLatencyHist[t]; }
206 getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
207 { return *m_missTypeMachLatencyHist[r][t]; }
209 Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
210 { return *m_IssueToInitialDelayHist[t]; }
213 getInitialToForwardDelayHist(const MachineType t) const
214 { return *m_InitialToForwardDelayHist[t]; }
217 getForwardRequestToFirstResponseHist(const MachineType t) const
218 { return *m_ForwardToFirstResponseDelayHist[t]; }
221 getFirstResponseToCompletionDelayHist(const MachineType t) const
222 { return *m_FirstResponseToCompletionDelayHist[t]; }
224 // Changed to protected to enable inheritance by VIPER Coalescer
// Probe the cache without issuing a Ruby request; on a hit data_ptr is
// pointed at the cached block.
226 bool tryCacheAccess(Addr addr, RubyRequestType type,
227 Addr pc, RubyAccessMode access_mode,
228 int size, DataBlock*& data_ptr);
229 // Alternate implementations in VIPER Coalescer
230 virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
// NOTE(review): parameter name 'wavfront_id' is misspelled (should be
// 'wavefront_id', cf. insertKernel above); harmless in a declaration
// since only the definition's name matters.
232 void kernelCallback(int wavfront_id);
234 void hitCallback(GPUCoalescerRequest* request,
238 Cycles initialRequestTime,
239 Cycles forwardRequestTime,
240 Cycles firstResponseTime,
// Record the latency breakdown of a request that missed; 'isRegion'
// presumably relates to region-coherence protocols -- TODO confirm
// against the .cc file.
242 void recordMissLatency(GPUCoalescerRequest* request,
244 Cycles initialRequestTime,
245 Cycles forwardRequestTime,
246 Cycles firstResponseTime,
247 bool success, bool isRegion);
248 void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
// Synthesize a packet for the given address.
249 PacketPtr mapAddrToPkt(Addr address);
252 RequestStatus getRequestStatus(PacketPtr pkt,
253 RubyRequestType request_type);
254 bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
// Load-linked / store-conditional handling for this address.
256 bool handleLlsc(Addr address, GPUCoalescerRequest* request);
// Event whose handler performs the deferred issue (completeIssue).
258 EventFunctionWrapper issueEvent;
261 // Changed to protected to enable inheritance by VIPER Coalescer
263 int m_max_outstanding_requests;
// Threshold (in cycles) after which an un-completed request is treated
// as a deadlock by the wakeup() check.
264 int m_deadlock_threshold;
266 CacheMemory* m_dataCache_ptr;
267 CacheMemory* m_instCache_ptr;
269 // The cache access latency for this GPU data cache. This is assessed at the
270 // beginning of each access. This should be very similar to the
271 // implementation in Sequencer() as this is very much like a Sequencer
272 Cycles m_data_cache_hit_latency;
274 // We need to track both the primary and secondary request types.
275 // The secondary request type comprises a subset of RubyRequestTypes that
276 // are understood by the L1 Controller. A primary request type can be any
278 typedef std::unordered_map<Addr, std::vector<RequestDesc>> CoalescingTable;
// Per-line lists of not-yet-issued requests awaiting coalescing, plus
// the addresses newly touched since the last issue pass.
279 CoalescingTable reqCoalescer;
280 std::vector<Addr> newRequests;
// Outstanding (already issued) requests, keyed by address.
282 typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
283 RequestTable m_writeRequestTable;
284 RequestTable m_readRequestTable;
285 // Global outstanding request count, across all request tables
286 int m_outstanding_count;
287 bool m_deadlock_check_scheduled;
// Kernel-end packets held (by wavefront id) until that wavefront's
// memory requests drain; see insertKernel()/kernelCallback().
288 std::unordered_map<int, PacketPtr> kernelEndList;
289 std::vector<int> newKernelEnds;
// Counters for cycles spent stalled behind a prior conflicting access.
291 int m_store_waiting_on_load_cycles;
292 int m_store_waiting_on_store_cycles;
293 int m_load_waiting_on_store_cycles;
294 int m_load_waiting_on_load_cycles;
296 bool m_runningGarnetStandalone;
298 EventFunctionWrapper deadlockCheckEvent;
// If set, release-for-ownership-style coherence is assumed -- TODO
// confirm the exact RfO semantics against the .cc file.
299 bool assumingRfOCoherence;
301 // m5 style stats for TCP hit/miss counts
// TCP/TCC presumably follow AMD naming for the GPU L1/L2 caches --
// verify against the protocol files.
302 Stats::Scalar GPU_TCPLdHits;
303 Stats::Scalar GPU_TCPLdTransfers;
304 Stats::Scalar GPU_TCCLdHits;
305 Stats::Scalar GPU_LdMiss;
307 Stats::Scalar GPU_TCPStHits;
308 Stats::Scalar GPU_TCPStTransfers;
309 Stats::Scalar GPU_TCCStHits;
310 Stats::Scalar GPU_StMiss;
// Same counters, for command-processor (CP) initiated accesses.
312 Stats::Scalar CP_TCPLdHits;
313 Stats::Scalar CP_TCPLdTransfers;
314 Stats::Scalar CP_TCCLdHits;
315 Stats::Scalar CP_LdMiss;
317 Stats::Scalar CP_TCPStHits;
318 Stats::Scalar CP_TCPStTransfers;
319 Stats::Scalar CP_TCCStHits;
320 Stats::Scalar CP_StMiss;
322 //! Histogram for number of outstanding requests per cycle.
323 Stats::Histogram m_outstandReqHist;
325 //! Histogram for holding latency profile of all requests.
326 Stats::Histogram m_latencyHist;
327 std::vector<Stats::Histogram *> m_typeLatencyHist;
329 //! Histogram for holding latency profile of all requests that
330 //! miss in the controller connected to this sequencer.
331 Stats::Histogram m_missLatencyHist;
332 std::vector<Stats::Histogram *> m_missTypeLatencyHist;
334 //! Histograms for profiling the latencies for requests that
335 //! required external messages.
336 std::vector<Stats::Histogram *> m_missMachLatencyHist;
337 std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
339 //! Histograms for recording the breakdown of miss latency
340 std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
341 std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
342 std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
343 std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
346 // Private copy constructor and assignment operator
// Declared but never defined: copying a coalescer is disallowed
// (pre-C++11 idiom; '= delete' would be the modern equivalent).
347 GPUCoalescer(const GPUCoalescer& obj);
348 GPUCoalescer& operator=(const GPUCoalescer& obj);
// Stream insertion for GPUCoalescer.
// NOTE(review): the surrounding definition (return type line and the
// function body) is not visible in this copy of the header.
352 operator<<(std::ostream& out, const GPUCoalescer& obj)
359 #endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__