src/gpu-compute/tlb_coalescer.cc

   1 /*
   2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #include "gpu-compute/tlb_coalescer.hh"
  35
  36 #include <cstring>
  37
  38 #include "base/logging.hh"
  39 #include "debug/GPUTLB.hh"
  40 #include "sim/process.hh"
  41
  42 TLBCoalescer::TLBCoalescer(const Params *p)
  43     : ClockedObject(p),
  44       TLBProbesPerCycle(p->probesPerCycle),
  45       coalescingWindow(p->coalescingWindow),
  46       disableCoalescing(p->disableCoalescing),
  47       probeTLBEvent([this]{ processProbeTLBEvent(); },
  48                     "Probe the TLB below",
  49                     false, Event::CPU_Tick_Pri),
  50       cleanupEvent([this]{ processCleanupEvent(); },
  51                    "Cleanup issuedTranslationsTable hashmap",
  52                    false, Event::Maximum_Pri)
  53 {
  54     // create the slave ports based on the number of connected ports
  55     for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
  56         cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
  57                                               this, i));
  58     }
  59
  60     // create the master ports based on the number of connected ports
  61     for (size_t i = 0; i < p->port_master_connection_count; ++i) {
  62         memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
  63                                               this, i));
  64     }
  65 }
  66
  67 Port &
  68 TLBCoalescer::getPort(const std::string &if_name, PortID idx)
  69 {
  70     if (if_name == "slave") {
  71         if (idx >= static_cast<PortID>(cpuSidePort.size())) {
  72             panic("TLBCoalescer::getPort: unknown index %d\n", idx);
  73         }
  74
  75         return *cpuSidePort[idx];
  76     } else  if (if_name == "master") {
  77         if (idx >= static_cast<PortID>(memSidePort.size())) {
  78             panic("TLBCoalescer::getPort: unknown index %d\n", idx);
  79         }
  80
  81         return *memSidePort[idx];
  82     } else {
  83         panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
  84     }
  85 }
  86
  87 /*
  88  * This method returns true if the <incoming_pkt>
  89  * can be coalesced with <coalesced_pkt> and false otherwise.
  90  * A given set of rules is checked.
  91  * The rules can potentially be modified based on the TLB level.
  92  */
  93 bool
  94 TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
  95 {
  96     if (disableCoalescing)
  97         return false;
  98
  99     TheISA::GpuTLB::TranslationState *incoming_state =
 100       safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
 101
 102     TheISA::GpuTLB::TranslationState *coalesced_state =
 103      safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
 104
 105     // Rule 1: Coalesce requests only if they
 106     // fall within the same virtual page
 107     Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
 108                                              TheISA::PageBytes);
 109
 110     Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
 111                                               TheISA::PageBytes);
 112
 113     if (incoming_virt_page_addr != coalesced_virt_page_addr)
 114         return false;
 115
 116     //* Rule 2: Coalesce requests only if they
 117     // share a TLB Mode, i.e. they are both read
 118     // or write requests.
 119     BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
 120     BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;
 121
 122     if (incoming_mode != coalesced_mode)
 123         return false;
 124
 125     // when we can coalesce a packet update the reqCnt
 126     // that is the number of packets represented by
 127     // this coalesced packet
 128     if (!incoming_state->prefetch)
 129         coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
 130
 131     return true;
 132 }
 133
 134 /*
 135  * We need to update the physical addresses of all the translation requests
 136  * that were coalesced into the one that just returned.
 137  */
 138 void
 139 TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
 140 {
 141     Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
 142
 143     DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
 144             issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
 145
 146     TheISA::GpuTLB::TranslationState *sender_state =
 147         safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
 148
 149     TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
 150     assert(tlb_entry);
 151     Addr first_entry_vaddr = tlb_entry->vaddr;
 152     Addr first_entry_paddr = tlb_entry->paddr;
 153     int page_size = tlb_entry->size();
 154     bool uncacheable = tlb_entry->uncacheable;
 155     int first_hit_level = sender_state->hitLevel;
 156
 157     // Get the physical page address of the translated request
 158     // Using the page_size specified in the TLBEntry allows us
 159     // to support different page sizes.
 160     Addr phys_page_paddr = pkt->req->getPaddr();
 161     phys_page_paddr &= ~(page_size - 1);
 162
 163     for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
 164         PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
 165         TheISA::GpuTLB::TranslationState *sender_state =
 166             safe_cast<TheISA::GpuTLB::TranslationState*>(
 167                     local_pkt->senderState);
 168
 169         // we are sending the packet back, so pop the reqCnt associated
 170         // with this level in the TLB hiearchy
 171         if (!sender_state->prefetch)
 172             sender_state->reqCnt.pop_back();
 173
 174         /*
 175          * Only the first packet from this coalesced request has been
 176          * translated. Grab the translated phys. page addr and update the
 177          * physical addresses of the remaining packets with the appropriate
 178          * page offsets.
 179          */
 180         if (i) {
 181             Addr paddr = phys_page_paddr;
 182             paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
 183             local_pkt->req->setPaddr(paddr);
 184
 185             if (uncacheable)
 186                 local_pkt->req->setFlags(Request::UNCACHEABLE);
 187
 188             // update senderState->tlbEntry, so we can insert
 189             // the correct TLBEentry in the TLBs above.
 190             auto p = sender_state->tc->getProcessPtr();
 191             sender_state->tlbEntry =
 192                 new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
 193                     first_entry_paddr, false, false);
 194
 195             // update the hitLevel for all uncoalesced reqs
 196             // so that each packet knows where it hit
 197             // (used for statistics in the CUs)
 198             sender_state->hitLevel = first_hit_level;
 199         }
 200
 201         SlavePort *return_port = sender_state->ports.back();
 202         sender_state->ports.pop_back();
 203
 204         // Translation is done - Convert to a response pkt if necessary and
 205         // send the translation back
 206         if (local_pkt->isRequest()) {
 207             local_pkt->makeTimingResponse();
 208         }
 209
 210         return_port->sendTimingResp(local_pkt);
 211     }
 212
 213     // schedule clean up for end of this cycle
 214     // This is a maximum priority event and must be on
 215     // the same cycle as GPUTLB cleanup event to prevent
 216     // race conditions with an IssueProbeEvent caused by
 217     // MemSidePort::recvReqRetry
 218     cleanupQueue.push(virt_page_addr);
 219
 220     if (!cleanupEvent.scheduled())
 221         schedule(cleanupEvent, curTick());
 222 }
 223
 224 // Receive translation requests, create a coalesced request,
 225 // and send them to the TLB (TLBProbesPerCycle)
 226 bool
 227 TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
 228 {
 229     // first packet of a coalesced request
 230     PacketPtr first_packet = nullptr;
 231     // true if we are able to do coalescing
 232     bool didCoalesce = false;
 233     // number of coalesced reqs for a given window
 234     int coalescedReq_cnt = 0;
 235
 236     TheISA::GpuTLB::TranslationState *sender_state =
 237         safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
 238
 239     // push back the port to remember the path back
 240     sender_state->ports.push_back(this);
 241
 242     bool update_stats = !sender_state->prefetch;
 243
 244     if (update_stats) {
 245         // if reqCnt is empty then this packet does not represent
 246         // multiple uncoalesced reqs(pkts) but just a single pkt.
 247         // If it does though then the reqCnt for each level in the
 248         // hierarchy accumulates the total number of reqs this packet
 249         // represents
 250         int req_cnt = 1;
 251
 252         if (!sender_state->reqCnt.empty())
 253             req_cnt = sender_state->reqCnt.back();
 254
 255         sender_state->reqCnt.push_back(req_cnt);
 256
 257         // update statistics
 258         coalescer->uncoalescedAccesses++;
 259         req_cnt = sender_state->reqCnt.back();
 260         DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
 261         coalescer->queuingCycles -= (curTick() * req_cnt);
 262         coalescer->localqueuingCycles -= curTick();
 263     }
 264
 265     // FIXME if you want to coalesce not based on the issueTime
 266     // of the packets (i.e., from the compute unit's perspective)
 267     // but based on when they reached this coalescer then
 268     // remove the following if statement and use curTick() or
 269     // coalescingWindow for the tick_index.
 270     if (!sender_state->issueTime)
 271        sender_state->issueTime = curTick();
 272
 273     // The tick index is used as a key to the coalescerFIFO hashmap.
 274     // It is shared by all candidates that fall within the
 275     // given coalescingWindow.
 276     int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
 277
 278     if (coalescer->coalescerFIFO.count(tick_index)) {
 279         coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
 280     }
 281
 282     // see if we can coalesce the incoming pkt with another
 283     // coalesced request with the same tick_index
 284     for (int i = 0; i < coalescedReq_cnt; ++i) {
 285         first_packet = coalescer->coalescerFIFO[tick_index][i][0];
 286
 287         if (coalescer->canCoalesce(pkt, first_packet)) {
 288             coalescer->coalescerFIFO[tick_index][i].push_back(pkt);
 289
 290             DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
 291                     i, tick_index,
 292                     coalescer->coalescerFIFO[tick_index][i].size());
 293
 294             didCoalesce = true;
 295             break;
 296         }
 297     }
 298
 299     // if this is the first request for this tick_index
 300     // or we did not manage to coalesce, update stats
 301     // and make necessary allocations.
 302     if (!coalescedReq_cnt || !didCoalesce) {
 303         if (update_stats)
 304             coalescer->coalescedAccesses++;
 305
 306         std::vector<PacketPtr> new_array;
 307         new_array.push_back(pkt);
 308         coalescer->coalescerFIFO[tick_index].push_back(new_array);
 309
 310         DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
 311                 "push\n", tick_index,
 312                 coalescer->coalescerFIFO[tick_index].size());
 313     }
 314
 315     //schedule probeTLBEvent next cycle to send the
 316     //coalesced requests to the TLB
 317     if (!coalescer->probeTLBEvent.scheduled()) {
 318         coalescer->schedule(coalescer->probeTLBEvent,
 319                 curTick() + coalescer->clockPeriod());
 320     }
 321
 322     return true;
 323 }
 324
 325 void
 326 TLBCoalescer::CpuSidePort::recvReqRetry()
 327 {
 328     panic("recvReqRetry called");
 329 }
 330
 331 void
 332 TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
 333 {
 334
 335     TheISA::GpuTLB::TranslationState *sender_state =
 336         safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
 337
 338     bool update_stats = !sender_state->prefetch;
 339
 340     if (update_stats)
 341         coalescer->uncoalescedAccesses++;
 342
 343     // If there is a pending timing request for this virtual address
 344     // print a warning message. This is a temporary caveat of
 345     // the current simulator where atomic and timing requests can
 346     // coexist. FIXME remove this check/warning in the future.
 347     Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
 348     int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);
 349
 350     if (map_count) {
 351         DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
 352                 "req. pending\n", virt_page_addr);
 353     }
 354
 355     coalescer->memSidePort[0]->sendFunctional(pkt);
 356 }
 357
 358 AddrRangeList
 359 TLBCoalescer::CpuSidePort::getAddrRanges() const
 360 {
 361     // currently not checked by the master
 362     AddrRangeList ranges;
 363
 364     return ranges;
 365 }
 366
 367 bool
 368 TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
 369 {
 370     // a translation completed and returned
 371     coalescer->updatePhysAddresses(pkt);
 372
 373     return true;
 374 }
 375
 376 void
 377 TLBCoalescer::MemSidePort::recvReqRetry()
 378 {
 379     //we've receeived a retry. Schedule a probeTLBEvent
 380     if (!coalescer->probeTLBEvent.scheduled())
 381         coalescer->schedule(coalescer->probeTLBEvent,
 382                 curTick() + coalescer->clockPeriod());
 383 }
 384
 385 void
 386 TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
 387 {
 388     fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
 389 }
 390
 391 /*
 392  * Here we scan the coalescer FIFO and issue the max
 393  * number of permitted probes to the TLB below. We
 394  * permit bypassing of coalesced requests for the same
 395  * tick_index.
 396  *
 397  * We do not access the next tick_index unless we've
 398  * drained the previous one. The coalesced requests
 399  * that are successfully sent are moved to the
 400  * issuedTranslationsTable table (the table which keeps
 401  * track of the outstanding reqs)
 402  */
 403 void
 404 TLBCoalescer::processProbeTLBEvent()
 405 {
 406     // number of TLB probes sent so far
 407     int sent_probes = 0;
 408     // rejected denotes a blocking event
 409     bool rejected = false;
 410
 411     // It is set to true either when the recvTiming of the TLB below
 412     // returns false or when there is another outstanding request for the
 413     // same virt. page.
 414
 415     DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);
 416
 417     for (auto iter = coalescerFIFO.begin();
 418          iter != coalescerFIFO.end() && !rejected; ) {
 419         int coalescedReq_cnt = iter->second.size();
 420         int i = 0;
 421         int vector_index = 0;
 422
 423         DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
 424                coalescedReq_cnt, iter->first);
 425
 426         while (i < coalescedReq_cnt) {
 427             ++i;
 428             PacketPtr first_packet = iter->second[vector_index][0];
 429
 430             // compute virtual page address for this request
 431             Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
 432                     TheISA::PageBytes);
 433
 434             // is there another outstanding request for the same page addr?
 435             int pending_reqs =
 436                 issuedTranslationsTable.count(virt_page_addr);
 437
 438             if (pending_reqs) {
 439                 DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
 440                         "page %#x\n", virt_page_addr);
 441
 442                 ++vector_index;
 443                 rejected = true;
 444
 445                 continue;
 446             }
 447
 448             // send the coalesced request for virt_page_addr
 449             if (!memSidePort[0]->sendTimingReq(first_packet)) {
 450                 DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
 451                        virt_page_addr);
 452
 453                 // No need for a retries queue since we are already buffering
 454                 // the coalesced request in coalescerFIFO.
 455                 rejected = true;
 456                 ++vector_index;
 457             } else {
 458                 TheISA::GpuTLB::TranslationState *tmp_sender_state =
 459                     safe_cast<TheISA::GpuTLB::TranslationState*>
 460                     (first_packet->senderState);
 461
 462                 bool update_stats = !tmp_sender_state->prefetch;
 463
 464                 if (update_stats) {
 465                     // req_cnt is total number of packets represented
 466                     // by the one we just sent counting all the way from
 467                     // the top of TLB hiearchy (i.e., from the CU)
 468                     int req_cnt = tmp_sender_state->reqCnt.back();
 469                     queuingCycles += (curTick() * req_cnt);
 470
 471                     DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
 472                             name(), req_cnt);
 473
 474                     // pkt_cnt is number of packets we coalesced into the one
 475                     // we just sent but only at this coalescer level
 476                     int pkt_cnt = iter->second[vector_index].size();
 477                     localqueuingCycles += (curTick() * pkt_cnt);
 478                 }
 479
 480                 DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
 481                        virt_page_addr);
 482
 483                 //copy coalescedReq to issuedTranslationsTable
 484                 issuedTranslationsTable[virt_page_addr]
 485                     = iter->second[vector_index];
 486
 487                 //erase the entry of this coalesced req
 488                 iter->second.erase(iter->second.begin() + vector_index);
 489
 490                 if (iter->second.empty())
 491                     assert(i == coalescedReq_cnt);
 492
 493                 sent_probes++;
 494                 if (sent_probes == TLBProbesPerCycle)
 495                    return;
 496             }
 497         }
 498
 499         //if there are no more coalesced reqs for this tick_index
 500         //erase the hash_map with the first iterator
 501         if (iter->second.empty()) {
 502             coalescerFIFO.erase(iter++);
 503         } else {
 504             ++iter;
 505         }
 506     }
 507 }
 508
 509 void
 510 TLBCoalescer::processCleanupEvent()
 511 {
 512     while (!cleanupQueue.empty()) {
 513         Addr cleanup_addr = cleanupQueue.front();
 514         cleanupQueue.pop();
 515         issuedTranslationsTable.erase(cleanup_addr);
 516
 517         DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
 518                 cleanup_addr);
 519     }
 520 }
 521
 522 void
 523 TLBCoalescer::regStats()
 524 {
 525     ClockedObject::regStats();
 526
 527     uncoalescedAccesses
 528         .name(name() + ".uncoalesced_accesses")
 529         .desc("Number of uncoalesced TLB accesses")
 530         ;
 531
 532     coalescedAccesses
 533         .name(name() + ".coalesced_accesses")
 534         .desc("Number of coalesced TLB accesses")
 535         ;
 536
 537     queuingCycles
 538         .name(name() + ".queuing_cycles")
 539         .desc("Number of cycles spent in queue")
 540         ;
 541
 542     localqueuingCycles
 543         .name(name() + ".local_queuing_cycles")
 544         .desc("Number of cycles spent in queue for all incoming reqs")
 545         ;
 546
 547     localLatency
 548         .name(name() + ".local_latency")
 549         .desc("Avg. latency over all incoming pkts")
 550         ;
 551
 552     localLatency = localqueuingCycles / uncoalescedAccesses;
 553 }
 554
 555
 556 TLBCoalescer*
 557 TLBCoalescerParams::create()
 558 {
 559     return new TLBCoalescer(this);
 560 }
 561