2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
34 #ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
35 #define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
38 #include <unordered_map>
40 #include "base/statistics.hh"
41 #include "gpu-compute/gpu_dyn_inst.hh"
42 #include "gpu-compute/misc.hh"
43 #include "mem/request.hh"
44 #include "mem/ruby/common/Address.hh"
45 #include "mem/ruby/common/Consumer.hh"
46 #include "mem/ruby/protocol/PrefetchBit.hh"
47 #include "mem/ruby/protocol/RubyAccessMode.hh"
48 #include "mem/ruby/protocol/RubyRequestType.hh"
49 #include "mem/ruby/protocol/SequencerRequestType.hh"
50 #include "mem/ruby/system/Sequencer.hh"
51 #include "mem/token_port.hh"
58 class RubyGPUCoalescerParams;
60 // List of packets that belong to a specific instruction.
61 typedef std::list<PacketPtr> PerInstPackets;
// The UncoalescedTable buffers individual memory request packets (one per
// work-item) that have arrived from the CU but have not yet been merged
// into CoalescedRequests. Packets are grouped by instruction sequence
// number. NOTE(review): braces/access specifiers appear elided in this
// view of the file; tokens below are kept byte-identical.
63 class UncoalescedTable
66 UncoalescedTable(GPUCoalescer *gc);
67 ~UncoalescedTable() {}
// Add a new uncoalesced packet, keyed by its instruction's seq number.
69 void insertPacket(PacketPtr pkt);
// True if at least one packet is buffered and available for coalescing.
70 bool packetAvailable();
// Dump the table contents into ss (used for deadlock diagnostics).
71 void printRequestTable(std::stringstream& ss);
73 // Modify packets remaining map. Init sets value iff the seqNum has not
74 // yet been seen before. get/set act as a regular getter/setter.
75 void initPacketsRemaining(InstSeqNum seqNum, int count);
76 int getPacketsRemaining(InstSeqNum seqNum);
77 void setPacketsRemaining(InstSeqNum seqNum, int count);
79 // Returns a pointer to the list of packets corresponding to an
80 // instruction in the instruction map or nullptr if there are no
81 // instructions at the offset.
82 PerInstPackets* getInstPackets(int offset);
// Release table resources for instructions whose requests are done.
83 void updateResources();
// True when no packets remain outstanding for the given instruction.
84 bool areRequestsDone(const InstSeqNum instSeqNum);
86 // Check if a packet hasn't been removed from instMap in too long.
87 // Panics if a deadlock is detected and returns nothing otherwise.
88 void checkDeadlock(Tick threshold);
// Back-pointer to the owning coalescer (non-owning).
91 GPUCoalescer *coalescer;
93 // Maps an instruction's unique sequence number to a queue of packets
94 // which need responses. This data structure assumes the sequence number
95 // is monotonically increasing (which is true for CU class) in order to
96 // issue packets in age order.
97 std::map<InstSeqNum, PerInstPackets> instMap;
// Per-instruction count of packets still expected; see
// initPacketsRemaining/getPacketsRemaining/setPacketsRemaining above.
99 std::map<InstSeqNum, int> instPktsRemaining;
// A CoalescedRequest is a single merged Ruby request representing one or
// more packets (from the same instruction) to the same cache line. It
// records the issue time and the RubyRequestType it was issued as.
// NOTE(review): braces/access specifiers appear elided in this view of
// the file; tokens below are kept byte-identical.
102 class CoalescedRequest
105 CoalescedRequest(uint64_t _seqNum)
106 : seqNum(_seqNum), issueTime(Cycles(0)),
107 rubyType(RubyRequestType_NULL)
109 ~CoalescedRequest() {}
// Append another packet merged into this coalesced request.
111 void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
112 void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
113 void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
114 void setRubyType(RubyRequestType type) { rubyType = type; }
116 uint64_t getSeqNum() const { return seqNum; }
// First packet merged into this request; pkts must be non-empty.
117 PacketPtr getFirstPkt() const { return pkts[0]; }
118 Cycles getIssueTime() const { return issueTime; }
119 RubyRequestType getRubyType() const { return rubyType; }
120 std::vector<PacketPtr>& getPackets() { return pkts; }
// Ruby request type this request was issued as (NULL until set).
125 RubyRequestType rubyType;
// All packets coalesced into this request (non-owning pointers).
126 std::vector<PacketPtr> pkts;
129 // PendingWriteInst tracks the number of outstanding Ruby requests
130 // per write instruction. Once all requests associated with one instruction
131 // are completely done in Ruby, we call back the requestor to mark
132 // that this instruction is complete.
// NOTE(review): braces/access specifiers and some declarations appear
// elided in this view of the file; tokens below are kept byte-identical.
133 class PendingWriteInst
137 : numPendingStores(0),
138 originalPort(nullptr),
139 gpuDynInstPtr(nullptr)
// Register one more pending store for this instruction; the first call
// also records the responding port (and, outside the Ruby tester, the
// dynamic instruction pointer) used later for the completion ack.
146 addPendingReq(RubyPort::MemResponsePort* port, GPUDynInstPtr inst,
147 bool usingRubyTester)
152 if (!usingRubyTester) {
153 gpuDynInstPtr = inst;
159 // return true if no more ack is expected
161 receiveWriteCompleteAck()
163 assert(numPendingStores > 0);
165 return (numPendingStores == 0) ? true : false;
168 // ack the original requestor that this write instruction is complete
170 ackWriteCompletion(bool usingRubyTester)
// Must only be called once every pending store has been acked.
172 assert(numPendingStores == 0);
174 // make a response packet
175 PacketPtr pkt = new Packet(std::make_shared<Request>(),
176 MemCmd::WriteCompleteResp);
178 if (!usingRubyTester) {
179 assert(gpuDynInstPtr);
// Attach the CU sender state so the CU can identify which dynamic
// instruction this write-complete response belongs to.
180 ComputeUnit::DataPort::SenderState* ss =
181 new ComputeUnit::DataPort::SenderState
182 (gpuDynInstPtr, 0, nullptr);
183 pkt->senderState = ss;
186 // send the ack response to the requestor
187 originalPort->sendTimingResp(pkt);
191 getNumPendingStores() {
192 return numPendingStores;
196 // the number of stores waiting for writeCompleteCallback
197 int numPendingStores;
198 // The original port that sent one of packets associated with this
199 // write instruction. We may have more than one packet per instruction,
200 // which implies multiple ports per instruction. However, we need
201 // only 1 of the ports to call back the CU. Therefore, here we keep
202 // track of the port that sent the first packet of this instruction.
203 RubyPort::MemResponsePort* originalPort;
204 // similar to the originalPort, this gpuDynInstPtr is set only for
205 // the first packet of this instruction.
206 GPUDynInstPtr gpuDynInstPtr;
// GPUCoalescer is the Ruby port for a GPU compute unit. It receives
// per-lane memory packets, merges them per cache line (uncoalescedTable ->
// coalescedTable), issues Ruby requests, and routes the various Ruby
// callbacks (read/write/atomic/eviction) back to the CU. Protocol-specific
// issue logic lives in derived coalescers (issueRequest is pure virtual).
// NOTE(review): braces/access specifiers and some declarations appear
// elided in this view of the file; tokens below are kept byte-identical.
209 class GPUCoalescer : public RubyPort
// Port used to exchange flow-control tokens with the GPU's global
// memory pipeline; it never carries real memory traffic, so all the
// receive handlers below are stubs.
212 class GMTokenPort : public TokenResponsePort
215 GMTokenPort(const std::string& name, ClockedObject *owner,
216 PortID id = InvalidPortID)
217 : TokenResponsePort(name, owner, id)
222 Tick recvAtomic(PacketPtr) { return Tick(0); }
223 void recvFunctional(PacketPtr) { }
224 bool recvTimingReq(PacketPtr) { return false; }
225 AddrRangeList getAddrRanges() const
227 AddrRangeList ranges;
232 typedef RubyGPUCoalescerParams Params;
233 GPUCoalescer(const Params &);
236 Port &getPort(const std::string &if_name,
237 PortID idx = InvalidPortID) override;
240 void wakeup(); // Used only for deadlock detection
241 void printRequestTable(std::stringstream& ss);
243 void printProgress(std::ostream& out) const;
244 void resetStats() override;
246 void regStats() override;
248 // each store request needs two callbacks:
249 // (1) writeCallback is called when the store is received and processed
250 // by TCP. This writeCallback does not guarantee the store is actually
251 // completed at its destination cache or memory. writeCallback helps
252 // release hardware resources (e.g., its entry in coalescedTable)
253 // allocated for the store so that subsequent requests will not be
254 // blocked unnecessarily due to hardware resource constraints.
255 // (2) writeCompleteCallback is called when the store is fully completed
256 // at its destination cache or memory. writeCompleteCallback
257 // guarantees that the store is fully completed. This callback
258 // will decrement hardware counters in CU
259 void writeCallback(Addr address, DataBlock& data);
261 void writeCallback(Addr address,
265 void writeCallback(Addr address,
268 Cycles initialRequestTime,
269 Cycles forwardRequestTime,
270 Cycles firstResponseTime,
273 void writeCallback(Addr address,
276 Cycles initialRequestTime,
277 Cycles forwardRequestTime,
278 Cycles firstResponseTime);
280 void writeCompleteCallback(Addr address,
// Read-completion callbacks, overloaded like writeCallback to carry
// optional per-hop latency timestamps for miss-latency profiling.
284 void readCallback(Addr address, DataBlock& data);
286 void readCallback(Addr address,
290 void readCallback(Addr address,
293 Cycles initialRequestTime,
294 Cycles forwardRequestTime,
295 Cycles firstResponseTime);
297 void readCallback(Addr address,
300 Cycles initialRequestTime,
301 Cycles forwardRequestTime,
302 Cycles firstResponseTime,
305 /* atomics need their own callback because the data
306 might be const coming from SLICC */
307 virtual void atomicCallback(Addr address,
309 const DataBlock& data);
311 RequestStatus makeRequest(PacketPtr pkt) override;
312 int outstandingCount() const override { return m_outstanding_count; }
315 isDeadlockEventScheduled() const override
317 return deadlockCheckEvent.scheduled();
321 descheduleDeadlockEvent() override
323 deschedule(deadlockCheckEvent);
328 void print(std::ostream& out) const;
330 void evictionCallback(Addr address);
// Drain coalescedReqs and send ready coalesced requests into Ruby.
331 void completeIssue();
// Track a kernel-end packet so it can be acked once the wavefront's
// outstanding requests drain; see kernelCallback.
333 void insertKernel(int wavefront_id, PacketPtr pkt);
335 GMTokenPort& getGMTokenPort() { return gmTokenPort; }
// Accessors for the stats histograms below (indexed by request type
// and/or machine type).
337 Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
339 Stats::Histogram& getLatencyHist() { return m_latencyHist; }
340 Stats::Histogram& getTypeLatencyHist(uint32_t t)
341 { return *m_typeLatencyHist[t]; }
343 Stats::Histogram& getMissLatencyHist()
344 { return m_missLatencyHist; }
345 Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
346 { return *m_missTypeLatencyHist[t]; }
348 Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
349 { return *m_missMachLatencyHist[t]; }
352 getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
353 { return *m_missTypeMachLatencyHist[r][t]; }
355 Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
356 { return *m_IssueToInitialDelayHist[t]; }
359 getInitialToForwardDelayHist(const MachineType t) const
360 { return *m_InitialToForwardDelayHist[t]; }
363 getForwardRequestToFirstResponseHist(const MachineType t) const
364 { return *m_ForwardToFirstResponseDelayHist[t]; }
367 getFirstResponseToCompletionDelayHist(const MachineType t) const
368 { return *m_FirstResponseToCompletionDelayHist[t]; }
371 bool tryCacheAccess(Addr addr, RubyRequestType type,
372 Addr pc, RubyAccessMode access_mode,
373 int size, DataBlock*& data_ptr);
375 // since the two following issue functions are protocol-specific,
376 // they must be implemented in a derived coalescer
377 virtual void issueRequest(CoalescedRequest* crequest) = 0;
378 virtual void issueMemSyncRequest(PacketPtr pkt) {}
// Ack a buffered kernel-end packet for the given wavefront.
380 void kernelCallback(int wavefront_id);
382 void hitCallback(CoalescedRequest* crequest,
386 Cycles initialRequestTime,
387 Cycles forwardRequestTime,
388 Cycles firstResponseTime,
390 void recordMissLatency(CoalescedRequest* crequest,
392 Cycles initialRequestTime,
393 Cycles forwardRequestTime,
394 Cycles firstResponseTime,
395 bool success, bool isRegion);
396 void completeHitCallback(std::vector<PacketPtr> & mylist);
// Map a packet's command to the RubyRequestType it should be issued as.
398 virtual RubyRequestType getRequestType(PacketPtr pkt);
400 GPUDynInstPtr getDynInst(PacketPtr pkt) const;
402 // Attempt to remove a packet from the uncoalescedTable and coalesce
403 // with a previous request from the same instruction. If there is no
404 // previous instruction and the max number of outstanding requests has
405 // not been reached, a new coalesced request is created and added to the
406 // "target" list of the coalescedTable.
407 bool coalescePacket(PacketPtr pkt);
// Event that triggers completeIssue.
409 EventFunctionWrapper issueEvent;
412 int m_max_outstanding_requests;
// How long a request may sit unserviced before wakeup panics.
413 Cycles m_deadlock_threshold;
415 CacheMemory* m_dataCache_ptr;
416 CacheMemory* m_instCache_ptr;
418 // coalescingWindow is the maximum number of instructions that are
419 // allowed to be coalesced in a single cycle.
420 int coalescingWindow;
422 // The uncoalescedTable contains several "columns" which hold memory
423 // request packets for an instruction. The maximum size is the number of
424 // columns * the wavefront size.
425 UncoalescedTable uncoalescedTable;
427 // An MSHR-like struct for holding coalesced requests. The requests in
428 // this table may or may not be outstanding in the memory hierarchy. The
429 // maximum size is equal to the maximum outstanding requests for a CU
430 // (typically the number of blocks in TCP). If there are duplicates of
431 // an address, they are serviced in age order.
432 std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
433 // Map of instruction sequence number to coalesced requests that get
434 // created in coalescePacket, used in completeIssue to send the fully
436 std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;
438 // a map btw an instruction sequence number and PendingWriteInst
439 // this is used to do a final call back for each write when it is
440 // completely done in the memory system
441 std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;
443 // Global outstanding request count, across all request tables
444 int m_outstanding_count;
445 bool m_deadlock_check_scheduled;
// Kernel-end packets awaiting ack, keyed by wavefront id; see
// insertKernel/kernelCallback.
446 std::unordered_map<int, PacketPtr> kernelEndList;
447 std::vector<int> newKernelEnds;
449 int m_store_waiting_on_load_cycles;
450 int m_store_waiting_on_store_cycles;
451 int m_load_waiting_on_store_cycles;
452 int m_load_waiting_on_load_cycles;
454 bool m_runningGarnetStandalone;
// Periodic event driving wakeup's deadlock detection.
456 EventFunctionWrapper deadlockCheckEvent;
457 bool assumingRfOCoherence;
459 // TODO - Need to update the following stats once the VIPER protocol
461 // // m5 style stats for TCP hit/miss counts
462 // Stats::Scalar GPU_TCPLdHits;
463 // Stats::Scalar GPU_TCPLdTransfers;
464 // Stats::Scalar GPU_TCCLdHits;
465 // Stats::Scalar GPU_LdMiss;
467 // Stats::Scalar GPU_TCPStHits;
468 // Stats::Scalar GPU_TCPStTransfers;
469 // Stats::Scalar GPU_TCCStHits;
470 // Stats::Scalar GPU_StMiss;
472 // Stats::Scalar CP_TCPLdHits;
473 // Stats::Scalar CP_TCPLdTransfers;
474 // Stats::Scalar CP_TCCLdHits;
475 // Stats::Scalar CP_LdMiss;
477 // Stats::Scalar CP_TCPStHits;
478 // Stats::Scalar CP_TCPStTransfers;
479 // Stats::Scalar CP_TCCStHits;
480 // Stats::Scalar CP_StMiss;
482 //! Histogram for number of outstanding requests per cycle.
483 Stats::Histogram m_outstandReqHist;
485 //! Histogram for holding latency profile of all requests.
486 Stats::Histogram m_latencyHist;
487 std::vector<Stats::Histogram *> m_typeLatencyHist;
489 //! Histogram for holding latency profile of all requests that
490 //! miss in the controller connected to this sequencer.
491 Stats::Histogram m_missLatencyHist;
492 std::vector<Stats::Histogram *> m_missTypeLatencyHist;
494 //! Histograms for profiling the latencies for requests that
495 //! required external messages.
496 std::vector<Stats::Histogram *> m_missMachLatencyHist;
497 std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
499 //! Histograms for recording the breakdown of miss latency
500 std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
501 std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
502 std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
503 std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
505 // TODO - Need to update the following stats once the VIPER protocol
507 // Stats::Distribution numHopDelays;
508 // Stats::Distribution tcpToTccDelay;
509 // Stats::Distribution tccToSdDelay;
510 // Stats::Distribution sdToSdDelay;
511 // Stats::Distribution sdToTccDelay;
512 // Stats::Distribution tccToTcpDelay;
514 // Stats::Average avgTcpToTcc;
515 // Stats::Average avgTccToSd;
516 // Stats::Average avgSdToSd;
517 // Stats::Average avgSdToTcc;
518 // Stats::Average avgTccToTcp;
521 // Token port is used to send/receive tokens to/from GPU's global memory
522 // pipeline across the port boundary. There is one per <wave size> data
524 GMTokenPort gmTokenPort;
526 // Private copy constructor and assignment operator
527 GPUCoalescer(const GPUCoalescer& obj);
528 GPUCoalescer& operator=(const GPUCoalescer& obj);
// Stream-insertion helper; forwards to GPUCoalescer::print. NOTE(review):
// the return-type line appears elided in this view of the file.
532 operator<<(std::ostream& out, const GPUCoalescer& obj)
539 #endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__