gpu-compute: Dropping fetches when no entry is reserved in the buffer
[gem5.git] / src / gpu-compute / tlb_coalescer.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/tlb_coalescer.hh"

#include <cstring>

#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

TLBCoalescer::TLBCoalescer(const Params *p)
    : ClockedObject(p),
      TLBProbesPerCycle(p->probesPerCycle),
      coalescingWindow(p->coalescingWindow),
      disableCoalescing(p->disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
          "Probe the TLB below",
          false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
          "Cleanup issuedTranslationsTable hashmap",
          false, Event::Maximum_Pri)
{
    // create the slave ports based on the number of connected ports
    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                              this, i));
    }

    // create the master ports based on the number of connected ports
    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                              this, i));
    }
}

Port &
TLBCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "slave") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else if (if_name == "master") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt>
 * can be coalesced with <coalesced_pkt> and false otherwise.
 * A given set of rules is checked.
 * The rules can potentially be modified based on the TLB level.
 */
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    if (disableCoalescing)
        return false;

    TheISA::GpuTLB::TranslationState *incoming_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);

    TheISA::GpuTLB::TranslationState *coalesced_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             TheISA::PageBytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              TheISA::PageBytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e. they are both read
    // or write requests.
    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // when we can coalesce a packet update the reqCnt
    // that is the number of packets represented by
    // this coalesced packet
    if (!incoming_state->prefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Get the physical page address of the translated request
    // Using the page_size specified in the TLBEntry allows us
    // to support different page sizes.
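    // (page_size comes from the TLB entry; the mask below assumes it is a
    // power of two, so clearing the low bits yields the page-aligned base)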
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~(page_size - 1);

    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(
                    local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!sender_state->prefetch)
            sender_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TLBEntry in the TLBs above.
            auto p = sender_state->tc->getProcessPtr();
            sender_state->tlbEntry =
                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
                                     first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            sender_state->hitLevel = first_hit_level;
        }

        SlavePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // schedule clean up for end of this cycle
    // This is a maximum priority event and must be on
    // the same cycle as the GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}

// Receive translation requests, create a coalesced request,
// and send them to the TLB (TLBProbesPerCycle)
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->prefetch;

    if (update_stats) {
        // if reqCnt is empty then this packet does not represent
        // multiple uncoalesced reqs (pkts) but just a single pkt.
        // If it does, though, then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
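        // Subtract the arrival tick now; the matching addition of the send
        // tick happens in processProbeTLBEvent(), so what accumulates in
        // these stats is the time each request spends queued here.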
        coalescer->queuingCycles -= (curTick() * req_cnt);
        coalescer->localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
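    // e.g., with coalescingWindow == 1 every issue tick maps to its own
    // FIFO entry, while a window of N ticks groups all requests issued
    // within the same N-tick interval under a single tick_index.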

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // if this is the first request for this tick_index
    // or we did not manage to coalesce, update stats
    // and make the necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    // schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->clockPeriod());
    }

    return true;
}

void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->prefetch;

    if (update_stats)
        coalescer->uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address,
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the master
    AddrRangeList ranges;

    return ranges;
}

bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}

void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    // we've received a retry. Schedule a probeTLBEvent
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->clockPeriod());
}

void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to
 * issuedTranslationsTable (the table which keeps
 * track of the outstanding reqs)
 */
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event
    bool rejected = false;

    // It is set to true either when the recvTiming of the TLB below
    // returns false or when there is another outstanding request for the
    // same virt. page.

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                                            TheISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

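                // bypass this coalesced request but keep probing the rest
                // of this tick_index; rejected stops the outer loop from
                // moving on to the next tick_index.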
                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
                rejected = true;
                ++vector_index;
            } else {
                TheISA::GpuTLB::TranslationState *tmp_sender_state =
                    safe_cast<TheISA::GpuTLB::TranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->prefetch;

                if (update_stats) {
                    // req_cnt is the total number of packets represented
                    // by the one we just sent counting all the way from
                    // the top of the TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
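                    // this addition, together with the subtraction done in
                    // recvTimingReq(), leaves the total time this request
                    // spent queued in the coalescer.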
                    queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is the number of packets we coalesced into the
                    // one we just sent, but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x\n",
                        virt_page_addr);

                // copy the coalesced request to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                // erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

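                // if this tick_index has been fully drained, we must have
                // visited every coalesced request that it held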
                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        // if there are no more coalesced reqs for this tick_index
        // erase the hash_map entry with the first iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}

void
TLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

void
TLBCoalescer::regStats()
{
    ClockedObject::regStats();

    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

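    // localLatency is a derived statistic; the ratio below is evaluated
    // when the stats are dumped, not at registration time.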
    localLatency = localqueuingCycles / uncoalescedAccesses;
}


TLBCoalescer*
TLBCoalescerParams::create()
{
    return new TLBCoalescer(this);
}