/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/tlb_coalescer.hh"

#include <cstring>

#include "arch/x86/isa_traits.hh"
#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

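/*
 * The TLBCoalescer sits between a set of TLB clients (e.g., compute units)
 * and the TLB below it. Translation requests that target the same virtual
 * page and share the same access mode are merged into a single probe of the
 * TLB; when the response returns, the translation is fanned back out to all
 * of the coalesced requests.
 */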
TLBCoalescer::TLBCoalescer(const Params *p)
    : ClockedObject(p),
      TLBProbesPerCycle(p->probesPerCycle),
      coalescingWindow(p->coalescingWindow),
      disableCoalescing(p->disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri)
{
    // create the slave ports based on the number of connected ports
    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    // create the master ports based on the number of connected ports
    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }
}

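// Map a Python-visible port name ("slave" or "master") and index to the
// corresponding CpuSidePort/MemSidePort object created in the constructor.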
Port &
TLBCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "slave") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else if (if_name == "master") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("TLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt> can be coalesced with
 * <coalesced_pkt> and false otherwise. A fixed set of rules is checked;
 * the rules could be extended depending on the TLB level.
 */
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    if (disableCoalescing)
        return false;

    TheISA::GpuTLB::TranslationState *incoming_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);

    TheISA::GpuTLB::TranslationState *coalesced_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             TheISA::PageBytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              TheISA::PageBytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e. they are both read
    // or write requests.
    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // When we can coalesce a packet, update reqCnt, i.e., the number of
    // uncoalesced packets this coalesced packet represents.
    if (!incoming_state->prefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Get the physical page address of the translated request.
    // Using the page_size specified in the TLBEntry allows us
    // to support different page sizes.
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~(page_size - 1);

    for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
        TheISA::GpuTLB::TranslationState *sender_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(
                    local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!sender_state->prefetch)
            sender_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TLB entry in the TLBs above.
            auto p = sender_state->tc->getProcessPtr();
            sender_state->tlbEntry =
                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
                                     first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            sender_state->hitLevel = first_hit_level;
        }

        ResponsePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // schedule clean up for end of this cycle
    // This is a maximum priority event and must be on
    // the same cycle as the GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}

// Receive translation requests, add them to a coalesced request for their
// coalescing window, and schedule the probe event that sends up to
// TLBProbesPerCycle coalesced requests to the TLB below.
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->prefetch;

    if (update_stats) {
        // if reqCnt is empty then this packet does not represent
        // multiple uncoalesced reqs (pkts) but just a single pkt.
        // If it does, then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
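        // Charge arrival time now by subtracting the current tick; the
        // matching addition happens when the coalesced request is sent in
        // processProbeTLBEvent, so these stats accumulate queuing delay.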
        coalescer->queuingCycles -= (curTick() * req_cnt);
        coalescer->localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // if this is the first request for this tick_index
    // or we did not manage to coalesce, update stats
    // and make necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    // schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->clockPeriod());
    }

    return true;
}

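// A request retry is never expected on the CPU-side port; flag it as a
// modeling error if one arrives.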
void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->prefetch;

    if (update_stats)
        coalescer->uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the master
    AddrRangeList ranges;

    return ranges;
}

bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}

void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    // we've received a retry. Schedule a probeTLBEvent
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->clockPeriod());
}

void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to the
 * issuedTranslationsTable (the table which keeps
 * track of the outstanding reqs).
 */
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event. It is set to true either when the
    // recvTiming of the TLB below returns false or when there is another
    // outstanding request for the same virt. page.
    bool rejected = false;

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                                            TheISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
                rejected = true;
                ++vector_index;
            } else {
                TheISA::GpuTLB::TranslationState *tmp_sender_state =
                    safe_cast<TheISA::GpuTLB::TranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->prefetch;

                if (update_stats) {
                    // req_cnt is the total number of packets represented
                    // by the one we just sent, counting all the way from
                    // the top of the TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
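                    // Completes the arrival-time subtraction performed in
                    // recvTimingReq: the sum accumulates (send - arrival)
                    // ticks, weighted by the number of original requests.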
                    queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is the number of packets we coalesced into
                    // the one we just sent, but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page "
                        "%#x\n", virt_page_addr);

                // copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                // erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

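                // If this tick_index has been fully drained, every entry
                // must have been sent, so i has reached coalescedReq_cnt.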
                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        // if there are no more coalesced reqs for this tick_index
        // erase the hash_map entry using the first iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}

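// Drain the cleanup queue and remove the corresponding entries from the
// issuedTranslationsTable, so new requests for those pages can be issued.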
void
TLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

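// Register the coalescer statistics; localLatency is computed as
// localqueuingCycles per uncoalesced access.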
void
TLBCoalescer::regStats()
{
    ClockedObject::regStats();

    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

    localLatency = localqueuingCycles / uncoalescedAccesses;
}

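// Factory method used by the generated TLBCoalescerParams to instantiate
// the coalescer from the Python configuration.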
TLBCoalescer*
TLBCoalescerParams::create()
{
    return new TLBCoalescer(this);
}