/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__

#include <iostream>
#include <unordered_map>

#include "base/statistics.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/misc.hh"
#include "mem/request.hh"
#include "mem/ruby/common/Address.hh"
#include "mem/ruby/common/Consumer.hh"
#include "mem/ruby/protocol/PrefetchBit.hh"
#include "mem/ruby/protocol/RubyAccessMode.hh"
#include "mem/ruby/protocol/RubyRequestType.hh"
#include "mem/ruby/protocol/SequencerRequestType.hh"
#include "mem/ruby/system/Sequencer.hh"
#include "mem/token_port.hh"

class DataBlock;
class CacheMsg;
class MachineID;
class CacheMemory;

class RubyGPUCoalescerParams;

// List of packets that belong to a specific instruction.
typedef std::list<PacketPtr> PerInstPackets;

class UncoalescedTable
{
  public:
    UncoalescedTable(GPUCoalescer *gc);
    ~UncoalescedTable() {}

    void insertPacket(PacketPtr pkt);
    bool packetAvailable();
    void printRequestTable(std::stringstream& ss);

    // Modify the packets-remaining map. init sets the value iff the seqNum
    // has not been seen before; get/set act as a regular getter/setter.
    void initPacketsRemaining(InstSeqNum seqNum, int count);
    int getPacketsRemaining(InstSeqNum seqNum);
    void setPacketsRemaining(InstSeqNum seqNum, int count);

    // Returns a pointer to the list of packets corresponding to an
    // instruction in the instruction map or nullptr if there are no
    // instructions at the offset.
    PerInstPackets* getInstPackets(int offset);
    void updateResources();
    bool areRequestsDone(const InstSeqNum instSeqNum);

    // Check if a packet hasn't been removed from instMap for too long.
    // Panics if a deadlock is detected; otherwise returns nothing.
    void checkDeadlock(Tick threshold);

  private:
    GPUCoalescer *coalescer;

    // Maps an instruction's unique sequence number to a queue of packets
    // which need responses. This data structure assumes the sequence number
    // is monotonically increasing (which is true for the CU class) so that
    // packets can be issued in age order (see the iteration sketch after
    // this class).
    std::map<InstSeqNum, PerInstPackets> instMap;

    std::map<InstSeqNum, int> instPktsRemaining;
};
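
// Because instMap is a std::map keyed by InstSeqNum, in-order iteration over
// it visits instructions oldest-first, which is how age order can be honored
// when draining packets. A minimal sketch of such a walk (illustrative only;
// everything except instMap, PerInstPackets, and PacketPtr is hypothetical):
//
//     for (auto &entry : instMap) {
//         PerInstPackets &pktList = entry.second;
//         if (!pktList.empty()) {
//             PacketPtr pkt = pktList.front();
//             // try to coalesce/issue 'pkt' before any younger instruction
//         }
//     }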

class CoalescedRequest
{
  public:
    CoalescedRequest(uint64_t _seqNum)
        : seqNum(_seqNum), issueTime(Cycles(0)),
          rubyType(RubyRequestType_NULL)
    {}
    ~CoalescedRequest() {}

    void insertPacket(PacketPtr pkt) { pkts.push_back(pkt); }
    void setSeqNum(uint64_t _seqNum) { seqNum = _seqNum; }
    void setIssueTime(Cycles _issueTime) { issueTime = _issueTime; }
    void setRubyType(RubyRequestType type) { rubyType = type; }

    uint64_t getSeqNum() const { return seqNum; }
    PacketPtr getFirstPkt() const { return pkts[0]; }
    Cycles getIssueTime() const { return issueTime; }
    RubyRequestType getRubyType() const { return rubyType; }
    std::vector<PacketPtr>& getPackets() { return pkts; }

  private:
    uint64_t seqNum;
    Cycles issueTime;
    RubyRequestType rubyType;
    std::vector<PacketPtr> pkts;
};

// PendingWriteInst tracks the number of outstanding Ruby requests
// per write instruction. Once all requests associated with one instruction
// are completely done in Ruby, we call back the requestor to mark
// that this instruction is complete.
class PendingWriteInst
{
  public:
    PendingWriteInst()
        : numPendingStores(0),
          originalPort(nullptr),
          gpuDynInstPtr(nullptr)
    {}

    ~PendingWriteInst()
    {}

    void
    addPendingReq(RubyPort::MemResponsePort* port, GPUDynInstPtr inst,
                  bool usingRubyTester)
    {
        assert(port);
        originalPort = port;

        if (!usingRubyTester) {
            gpuDynInstPtr = inst;
        }

        numPendingStores++;
    }

    // return true if no more acks are expected
    bool
    receiveWriteCompleteAck()
    {
        assert(numPendingStores > 0);
        numPendingStores--;
        return numPendingStores == 0;
    }

    // ack the original requestor that this write instruction is complete
    void
    ackWriteCompletion(bool usingRubyTester)
    {
        assert(numPendingStores == 0);

        // make a response packet
        PacketPtr pkt = new Packet(std::make_shared<Request>(),
                                   MemCmd::WriteCompleteResp);

        if (!usingRubyTester) {
            assert(gpuDynInstPtr);
            ComputeUnit::DataPort::SenderState* ss =
                new ComputeUnit::DataPort::SenderState
                    (gpuDynInstPtr, 0, nullptr);
            pkt->senderState = ss;
        }

        // send the ack response to the requestor
        originalPort->sendTimingResp(pkt);
    }

    int
    getNumPendingStores() {
        return numPendingStores;
    }

  private:
    // the number of stores waiting for writeCompleteCallback
    int numPendingStores;
    // The original port that sent one of the packets associated with this
    // write instruction. We may have more than one packet per instruction,
    // which implies multiple ports per instruction. However, we need only
    // one of the ports to call back the CU. Therefore, here we keep track
    // of the port that sent the first packet of this instruction.
    RubyPort::MemResponsePort* originalPort;
    // Similar to originalPort, this gpuDynInstPtr is set only for
    // the first packet of this instruction.
    GPUDynInstPtr gpuDynInstPtr;
};
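
// A minimal sketch of the intended PendingWriteInst lifecycle, assuming one
// entry per write instruction keyed by its sequence number (pendingWriteInsts
// is the GPUCoalescer member declared below; the surrounding control flow is
// illustrative, not part of this interface):
//
//     PendingWriteInst &inst = pendingWriteInsts[seqNum];
//     inst.addPendingReq(port, gpuDynInst, usingRubyTester); // per request
//     ...
//     // on each writeCompleteCallback coming back from Ruby:
//     if (inst.receiveWriteCompleteAck()) {
//         // the last outstanding store of this instruction has completed
//         inst.ackWriteCompletion(usingRubyTester);
//         pendingWriteInsts.erase(seqNum);
//     }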

class GPUCoalescer : public RubyPort
{
  public:
    class GMTokenPort : public TokenResponsePort
    {
      public:
        GMTokenPort(const std::string& name, ClockedObject *owner,
                    PortID id = InvalidPortID)
            : TokenResponsePort(name, owner, id)
        { }
        ~GMTokenPort() { }

      protected:
        Tick recvAtomic(PacketPtr) { return Tick(0); }
        void recvFunctional(PacketPtr) { }
        bool recvTimingReq(PacketPtr) { return false; }
        AddrRangeList getAddrRanges() const
        {
            AddrRangeList ranges;
            return ranges;
        }
    };

    typedef RubyGPUCoalescerParams Params;
    GPUCoalescer(const Params &);
    ~GPUCoalescer();

    Port &getPort(const std::string &if_name,
                  PortID idx = InvalidPortID) override;

    // Public Methods
    void wakeup(); // Used only for deadlock detection
    void printRequestTable(std::stringstream& ss);

    void printProgress(std::ostream& out) const;
    void resetStats() override;
    void collateStats();
    void regStats() override;

    // Each store request needs two callbacks:
    // (1) writeCallback is called when the store is received and processed
    // by the TCP. This writeCallback does not guarantee the store is
    // actually completed at its destination cache or memory. writeCallback
    // helps release hardware resources (e.g., its entry in coalescedTable)
    // allocated for the store so that subsequent requests will not be
    // blocked unnecessarily due to hardware resource constraints.
    // (2) writeCompleteCallback is called when the store is fully completed
    // at its destination cache or memory. writeCompleteCallback guarantees
    // that the store is fully completed. This callback will decrement
    // hardware counters in the CU. (A sketch of the expected sequence
    // follows the writeCompleteCallback declaration below.)
    void writeCallback(Addr address, DataBlock& data);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data,
                       Cycles initialRequestTime,
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime,
                       bool isRegion);

    void writeCallback(Addr address,
                       MachineType mach,
                       DataBlock& data,
                       Cycles initialRequestTime,
                       Cycles forwardRequestTime,
                       Cycles firstResponseTime);

    void writeCompleteCallback(Addr address,
                               uint64_t instSeqNum,
                               MachineType mach);
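
    // A minimal sketch of the two-step store completion described above.
    // The call sites and argument names are illustrative; only the two
    // callbacks themselves are part of this interface:
    //
    //     // TCP has accepted the store: free its coalescedTable entry so
    //     // subsequent requests are not blocked on it.
    //     coalescer->writeCallback(lineAddr, dataBlk);
    //     ...
    //     // the store is complete at its destination cache/memory: let the
    //     // CU decrement its counters for instruction instSeqNum.
    //     coalescer->writeCompleteCallback(lineAddr, instSeqNum, machType);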

    void readCallback(Addr address, DataBlock& data);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data,
                      Cycles initialRequestTime,
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime);

    void readCallback(Addr address,
                      MachineType mach,
                      DataBlock& data,
                      Cycles initialRequestTime,
                      Cycles forwardRequestTime,
                      Cycles firstResponseTime,
                      bool isRegion);

    /* atomics need their own callback because the data
       might be const coming from SLICC */
    virtual void atomicCallback(Addr address,
                                MachineType mach,
                                const DataBlock& data);

    RequestStatus makeRequest(PacketPtr pkt) override;
    int outstandingCount() const override { return m_outstanding_count; }

    bool
    isDeadlockEventScheduled() const override
    {
        return deadlockCheckEvent.scheduled();
    }

    void
    descheduleDeadlockEvent() override
    {
        deschedule(deadlockCheckEvent);
    }

    bool empty() const;

    void print(std::ostream& out) const;

    void evictionCallback(Addr address);
    void completeIssue();

    void insertKernel(int wavefront_id, PacketPtr pkt);

    GMTokenPort& getGMTokenPort() { return gmTokenPort; }

    Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }

    Stats::Histogram& getLatencyHist() { return m_latencyHist; }
    Stats::Histogram& getTypeLatencyHist(uint32_t t)
    { return *m_typeLatencyHist[t]; }

    Stats::Histogram& getMissLatencyHist()
    { return m_missLatencyHist; }
    Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
    { return *m_missTypeLatencyHist[t]; }

    Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
    { return *m_missMachLatencyHist[t]; }

    Stats::Histogram&
    getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
    { return *m_missTypeMachLatencyHist[r][t]; }

    Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
    { return *m_IssueToInitialDelayHist[t]; }

    Stats::Histogram&
    getInitialToForwardDelayHist(const MachineType t) const
    { return *m_InitialToForwardDelayHist[t]; }

    Stats::Histogram&
    getForwardRequestToFirstResponseHist(const MachineType t) const
    { return *m_ForwardToFirstResponseDelayHist[t]; }

    Stats::Histogram&
    getFirstResponseToCompletionDelayHist(const MachineType t) const
    { return *m_FirstResponseToCompletionDelayHist[t]; }

  protected:
    bool tryCacheAccess(Addr addr, RubyRequestType type,
                        Addr pc, RubyAccessMode access_mode,
                        int size, DataBlock*& data_ptr);

    // Since the following two issue functions are protocol-specific,
    // they must be implemented in a derived coalescer (a sketch follows
    // the declarations below).
    virtual void issueRequest(CoalescedRequest* crequest) = 0;
    virtual void issueMemSyncRequest(PacketPtr pkt) {}
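
    // A minimal sketch of what a protocol-specific coalescer is expected to
    // provide; the class name and the body comments are hypothetical, only
    // the two overridden signatures come from this interface:
    //
    //     class MyProtocolCoalescer : public GPUCoalescer
    //     {
    //       public:
    //         void issueRequest(CoalescedRequest* crequest) override
    //         {
    //             // translate crequest into a protocol request and enqueue
    //             // it to the controller's mandatory queue
    //         }
    //         void issueMemSyncRequest(PacketPtr pkt) override
    //         {
    //             // issue a protocol-level memory fence/sync, if supported
    //         }
    //     };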

    void kernelCallback(int wavefront_id);

    void hitCallback(CoalescedRequest* crequest,
                     MachineType mach,
                     DataBlock& data,
                     bool success,
                     Cycles initialRequestTime,
                     Cycles forwardRequestTime,
                     Cycles firstResponseTime,
                     bool isRegion);
    void recordMissLatency(CoalescedRequest* crequest,
                           MachineType mach,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool success, bool isRegion);
    void completeHitCallback(std::vector<PacketPtr> & mylist);

    virtual RubyRequestType getRequestType(PacketPtr pkt);

    GPUDynInstPtr getDynInst(PacketPtr pkt) const;

    // Attempt to remove a packet from the uncoalescedTable and coalesce it
    // with a previous request from the same instruction. If there is no
    // previous request and the maximum number of outstanding requests has
    // not been reached, a new coalesced request is created and added to
    // the "target" list of the coalescedTable.
    bool coalescePacket(PacketPtr pkt);
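
    // A minimal sketch of the coalescing decision described above, assuming
    // requests are grouped at cache-line granularity. makeLineAddress comes
    // from Ruby's Address.hh; curSeqNum and the exact control flow are
    // illustrative:
    //
    //     Addr line = makeLineAddress(pkt->getAddr());
    //     auto it = coalescedTable.find(line);
    //     if (it != coalescedTable.end() &&
    //         it->second.back()->getSeqNum() == curSeqNum) {
    //         it->second.back()->insertPacket(pkt);   // coalesce
    //     } else if (m_outstanding_count < m_max_outstanding_requests) {
    //         auto *creq = new CoalescedRequest(curSeqNum);
    //         creq->insertPacket(pkt);
    //         coalescedTable[line].push_back(creq);   // new target
    //     } else {
    //         return false;                           // retry later
    //     }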

    EventFunctionWrapper issueEvent;

  protected:
    int m_max_outstanding_requests;
    Cycles m_deadlock_threshold;

    CacheMemory* m_dataCache_ptr;
    CacheMemory* m_instCache_ptr;

    // coalescingWindow is the maximum number of instructions that are
    // allowed to be coalesced in a single cycle.
    int coalescingWindow;

    // The uncoalescedTable contains several "columns" which hold memory
    // request packets for an instruction. The maximum size is the number
    // of columns * the wavefront size.
    UncoalescedTable uncoalescedTable;

    // An MSHR-like structure for holding coalesced requests. The requests
    // in this table may or may not be outstanding in the memory hierarchy.
    // The maximum size is equal to the maximum number of outstanding
    // requests for a CU (typically the number of blocks in the TCP). If
    // there are duplicates of an address, they are serviced in age order.
    std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
    // Map of instruction sequence number to coalesced requests that get
    // created in coalescePacket, used in completeIssue to send the fully
    // coalesced request
    std::unordered_map<uint64_t, std::deque<CoalescedRequest*>> coalescedReqs;

    // A map between an instruction sequence number and PendingWriteInst.
    // This is used to do a final callback for each write when it is
    // completely done in the memory system.
    std::unordered_map<uint64_t, PendingWriteInst> pendingWriteInsts;

    // Global outstanding request count, across all request tables
    int m_outstanding_count;
    bool m_deadlock_check_scheduled;
    std::unordered_map<int, PacketPtr> kernelEndList;
    std::vector<int> newKernelEnds;

    int m_store_waiting_on_load_cycles;
    int m_store_waiting_on_store_cycles;
    int m_load_waiting_on_store_cycles;
    int m_load_waiting_on_load_cycles;

    bool m_runningGarnetStandalone;

    EventFunctionWrapper deadlockCheckEvent;
    bool assumingRfOCoherence;

    // TODO - Need to update the following stats once the VIPER protocol
    // is re-integrated.
    // // m5 style stats for TCP hit/miss counts
    // Stats::Scalar GPU_TCPLdHits;
    // Stats::Scalar GPU_TCPLdTransfers;
    // Stats::Scalar GPU_TCCLdHits;
    // Stats::Scalar GPU_LdMiss;
    //
    // Stats::Scalar GPU_TCPStHits;
    // Stats::Scalar GPU_TCPStTransfers;
    // Stats::Scalar GPU_TCCStHits;
    // Stats::Scalar GPU_StMiss;
    //
    // Stats::Scalar CP_TCPLdHits;
    // Stats::Scalar CP_TCPLdTransfers;
    // Stats::Scalar CP_TCCLdHits;
    // Stats::Scalar CP_LdMiss;
    //
    // Stats::Scalar CP_TCPStHits;
    // Stats::Scalar CP_TCPStTransfers;
    // Stats::Scalar CP_TCCStHits;
    // Stats::Scalar CP_StMiss;

    //! Histogram for number of outstanding requests per cycle.
    Stats::Histogram m_outstandReqHist;

    //! Histogram for holding latency profile of all requests.
    Stats::Histogram m_latencyHist;
    std::vector<Stats::Histogram *> m_typeLatencyHist;

    //! Histogram for holding latency profile of all requests that
    //! miss in the controller connected to this sequencer.
    Stats::Histogram m_missLatencyHist;
    std::vector<Stats::Histogram *> m_missTypeLatencyHist;

    //! Histograms for profiling the latencies for requests that
    //! required external messages.
    std::vector<Stats::Histogram *> m_missMachLatencyHist;
    std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;

    //! Histograms for recording the breakdown of miss latency
    std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
    std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
    std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
    std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;

    // TODO - Need to update the following stats once the VIPER protocol
    // is re-integrated.
    // Stats::Distribution numHopDelays;
    // Stats::Distribution tcpToTccDelay;
    // Stats::Distribution tccToSdDelay;
    // Stats::Distribution sdToSdDelay;
    // Stats::Distribution sdToTccDelay;
    // Stats::Distribution tccToTcpDelay;
    //
    // Stats::Average avgTcpToTcc;
    // Stats::Average avgTccToSd;
    // Stats::Average avgSdToSd;
    // Stats::Average avgSdToTcc;
    // Stats::Average avgTccToTcp;

  private:
    // The token port is used to send/receive tokens to/from the GPU's global
    // memory pipeline across the port boundary. There is one per <wave size>
    // data ports in the CU.
    GMTokenPort gmTokenPort;

    // Private copy constructor and assignment operator
    GPUCoalescer(const GPUCoalescer& obj);
    GPUCoalescer& operator=(const GPUCoalescer& obj);
};

inline std::ostream&
operator<<(std::ostream& out, const GPUCoalescer& obj)
{
    obj.print(out);
    out << std::flush;
    return out;
}

#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__