src/gpu-compute/tlb_coalescer.cc

   1 /*
   2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its
  18  * contributors may be used to endorse or promote products derived from this
  19  * software without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  */
  33
  34 #include "gpu-compute/tlb_coalescer.hh"
  35
  36 #include <cstring>
  37
  38 #include "arch/x86/isa_traits.hh"
  39 #include "base/logging.hh"
  40 #include "debug/GPUTLB.hh"
  41 #include "sim/process.hh"
  42
  43 TLBCoalescer::TLBCoalescer(const Params *p)
  44     : ClockedObject(p),
  45       TLBProbesPerCycle(p->probesPerCycle),
  46       coalescingWindow(p->coalescingWindow),
  47       disableCoalescing(p->disableCoalescing),
  48       probeTLBEvent([this]{ processProbeTLBEvent(); },
  49                     "Probe the TLB below",
  50                     false, Event::CPU_Tick_Pri),
  51       cleanupEvent([this]{ processCleanupEvent(); },
  52                    "Cleanup issuedTranslationsTable hashmap",
  53                    false, Event::Maximum_Pri)
  54 {
  55     // create the slave ports based on the number of connected ports
  56     for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
  57         cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
  58                                               this, i));
  59     }
  60
  61     // create the master ports based on the number of connected ports
  62     for (size_t i = 0; i < p->port_master_connection_count; ++i) {
  63         memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
  64                                               this, i));
  65     }
  66 }
  67
  68 Port &
  69 TLBCoalescer::getPort(const std::string &if_name, PortID idx)
  70 {
  71     if (if_name == "slave") {
  72         if (idx >= static_cast<PortID>(cpuSidePort.size())) {
  73             panic("TLBCoalescer::getPort: unknown index %d\n", idx);
  74         }
  75
  76         return *cpuSidePort[idx];
  77     } else  if (if_name == "master") {
  78         if (idx >= static_cast<PortID>(memSidePort.size())) {
  79             panic("TLBCoalescer::getPort: unknown index %d\n", idx);
  80         }
  81
  82         return *memSidePort[idx];
  83     } else {
  84         panic("TLBCoalescer::getPort: unknown port %s\n", if_name);
  85     }
  86 }
  87
  88 /*
  89  * This method returns true if the <incoming_pkt>
  90  * can be coalesced with <coalesced_pkt> and false otherwise.
  91  * A given set of rules is checked.
  92  * The rules can potentially be modified based on the TLB level.
  93  */
  94 bool
  95 TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
  96 {
  97     if (disableCoalescing)
  98         return false;
  99
 100     TheISA::GpuTLB::TranslationState *incoming_state =
 101       safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
 102
 103     TheISA::GpuTLB::TranslationState *coalesced_state =
 104      safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
 105
 106     // Rule 1: Coalesce requests only if they
 107     // fall within the same virtual page
 108     Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
 109                                              TheISA::PageBytes);
 110
 111     Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
 112                                               TheISA::PageBytes);
 113
 114     if (incoming_virt_page_addr != coalesced_virt_page_addr)
 115         return false;
 116
 117     //* Rule 2: Coalesce requests only if they
 118     // share a TLB Mode, i.e. they are both read
 119     // or write requests.
 120     BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
 121     BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;
 122
 123     if (incoming_mode != coalesced_mode)
 124         return false;
 125
 126     // when we can coalesce a packet update the reqCnt
 127     // that is the number of packets represented by
 128     // this coalesced packet
 129     if (!incoming_state->prefetch)
 130         coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
 131
 132     return true;
 133 }
 134
 135 /*
 136  * We need to update the physical addresses of all the translation requests
 137  * that were coalesced into the one that just returned.
 138  */
 139 void
 140 TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
 141 {
 142     Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
 143
 144     DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
 145             issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
 146
 147     TheISA::GpuTLB::TranslationState *sender_state =
 148         safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
 149
 150     TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
 151     assert(tlb_entry);
 152     Addr first_entry_vaddr = tlb_entry->vaddr;
 153     Addr first_entry_paddr = tlb_entry->paddr;
 154     int page_size = tlb_entry->size();
 155     bool uncacheable = tlb_entry->uncacheable;
 156     int first_hit_level = sender_state->hitLevel;
 157
 158     // Get the physical page address of the translated request
 159     // Using the page_size specified in the TLBEntry allows us
 160     // to support different page sizes.
 161     Addr phys_page_paddr = pkt->req->getPaddr();
 162     phys_page_paddr &= ~(page_size - 1);
 163
 164     for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
 165         PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
 166         TheISA::GpuTLB::TranslationState *sender_state =
 167             safe_cast<TheISA::GpuTLB::TranslationState*>(
 168                     local_pkt->senderState);
 169
 170         // we are sending the packet back, so pop the reqCnt associated
 171         // with this level in the TLB hiearchy
 172         if (!sender_state->prefetch)
 173             sender_state->reqCnt.pop_back();
 174
 175         /*
 176          * Only the first packet from this coalesced request has been
 177          * translated. Grab the translated phys. page addr and update the
 178          * physical addresses of the remaining packets with the appropriate
 179          * page offsets.
 180          */
 181         if (i) {
 182             Addr paddr = phys_page_paddr;
 183             paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
 184             local_pkt->req->setPaddr(paddr);
 185
 186             if (uncacheable)
 187                 local_pkt->req->setFlags(Request::UNCACHEABLE);
 188
 189             // update senderState->tlbEntry, so we can insert
 190             // the correct TLBEentry in the TLBs above.
 191             auto p = sender_state->tc->getProcessPtr();
 192             sender_state->tlbEntry =
 193                 new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
 194                     first_entry_paddr, false, false);
 195
 196             // update the hitLevel for all uncoalesced reqs
 197             // so that each packet knows where it hit
 198             // (used for statistics in the CUs)
 199             sender_state->hitLevel = first_hit_level;
 200         }
 201
 202         ResponsePort *return_port = sender_state->ports.back();
 203         sender_state->ports.pop_back();
 204
 205         // Translation is done - Convert to a response pkt if necessary and
 206         // send the translation back
 207         if (local_pkt->isRequest()) {
 208             local_pkt->makeTimingResponse();
 209         }
 210
 211         return_port->sendTimingResp(local_pkt);
 212     }
 213
 214     // schedule clean up for end of this cycle
 215     // This is a maximum priority event and must be on
 216     // the same cycle as GPUTLB cleanup event to prevent
 217     // race conditions with an IssueProbeEvent caused by
 218     // MemSidePort::recvReqRetry
 219     cleanupQueue.push(virt_page_addr);
 220
 221     if (!cleanupEvent.scheduled())
 222         schedule(cleanupEvent, curTick());
 223 }
 224
 225 // Receive translation requests, create a coalesced request,
 226 // and send them to the TLB (TLBProbesPerCycle)
 227 bool
 228 TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
 229 {
 230     // first packet of a coalesced request
 231     PacketPtr first_packet = nullptr;
 232     // true if we are able to do coalescing
 233     bool didCoalesce = false;
 234     // number of coalesced reqs for a given window
 235     int coalescedReq_cnt = 0;
 236
 237     TheISA::GpuTLB::TranslationState *sender_state =
 238         safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
 239
 240     // push back the port to remember the path back
 241     sender_state->ports.push_back(this);
 242
 243     bool update_stats = !sender_state->prefetch;
 244
 245     if (update_stats) {
 246         // if reqCnt is empty then this packet does not represent
 247         // multiple uncoalesced reqs(pkts) but just a single pkt.
 248         // If it does though then the reqCnt for each level in the
 249         // hierarchy accumulates the total number of reqs this packet
 250         // represents
 251         int req_cnt = 1;
 252
 253         if (!sender_state->reqCnt.empty())
 254             req_cnt = sender_state->reqCnt.back();
 255
 256         sender_state->reqCnt.push_back(req_cnt);
 257
 258         // update statistics
 259         coalescer->uncoalescedAccesses++;
 260         req_cnt = sender_state->reqCnt.back();
 261         DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
 262         coalescer->queuingCycles -= (curTick() * req_cnt);
 263         coalescer->localqueuingCycles -= curTick();
 264     }
 265
 266     // FIXME if you want to coalesce not based on the issueTime
 267     // of the packets (i.e., from the compute unit's perspective)
 268     // but based on when they reached this coalescer then
 269     // remove the following if statement and use curTick() or
 270     // coalescingWindow for the tick_index.
 271     if (!sender_state->issueTime)
 272        sender_state->issueTime = curTick();
 273
 274     // The tick index is used as a key to the coalescerFIFO hashmap.
 275     // It is shared by all candidates that fall within the
 276     // given coalescingWindow.
 277     int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
 278
 279     if (coalescer->coalescerFIFO.count(tick_index)) {
 280         coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
 281     }
 282
 283     // see if we can coalesce the incoming pkt with another
 284     // coalesced request with the same tick_index
 285     for (int i = 0; i < coalescedReq_cnt; ++i) {
 286         first_packet = coalescer->coalescerFIFO[tick_index][i][0];
 287
 288         if (coalescer->canCoalesce(pkt, first_packet)) {
 289             coalescer->coalescerFIFO[tick_index][i].push_back(pkt);
 290
 291             DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
 292                     i, tick_index,
 293                     coalescer->coalescerFIFO[tick_index][i].size());
 294
 295             didCoalesce = true;
 296             break;
 297         }
 298     }
 299
 300     // if this is the first request for this tick_index
 301     // or we did not manage to coalesce, update stats
 302     // and make necessary allocations.
 303     if (!coalescedReq_cnt || !didCoalesce) {
 304         if (update_stats)
 305             coalescer->coalescedAccesses++;
 306
 307         std::vector<PacketPtr> new_array;
 308         new_array.push_back(pkt);
 309         coalescer->coalescerFIFO[tick_index].push_back(new_array);
 310
 311         DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
 312                 "push\n", tick_index,
 313                 coalescer->coalescerFIFO[tick_index].size());
 314     }
 315
 316     //schedule probeTLBEvent next cycle to send the
 317     //coalesced requests to the TLB
 318     if (!coalescer->probeTLBEvent.scheduled()) {
 319         coalescer->schedule(coalescer->probeTLBEvent,
 320                 curTick() + coalescer->clockPeriod());
 321     }
 322
 323     return true;
 324 }
 325
 326 void
 327 TLBCoalescer::CpuSidePort::recvReqRetry()
 328 {
 329     panic("recvReqRetry called");
 330 }
 331
 332 void
 333 TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
 334 {
 335
 336     TheISA::GpuTLB::TranslationState *sender_state =
 337         safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
 338
 339     bool update_stats = !sender_state->prefetch;
 340
 341     if (update_stats)
 342         coalescer->uncoalescedAccesses++;
 343
 344     // If there is a pending timing request for this virtual address
 345     // print a warning message. This is a temporary caveat of
 346     // the current simulator where atomic and timing requests can
 347     // coexist. FIXME remove this check/warning in the future.
 348     Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
 349     int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);
 350
 351     if (map_count) {
 352         DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
 353                 "req. pending\n", virt_page_addr);
 354     }
 355
 356     coalescer->memSidePort[0]->sendFunctional(pkt);
 357 }
 358
 359 AddrRangeList
 360 TLBCoalescer::CpuSidePort::getAddrRanges() const
 361 {
 362     // currently not checked by the master
 363     AddrRangeList ranges;
 364
 365     return ranges;
 366 }
 367
 368 bool
 369 TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
 370 {
 371     // a translation completed and returned
 372     coalescer->updatePhysAddresses(pkt);
 373
 374     return true;
 375 }
 376
 377 void
 378 TLBCoalescer::MemSidePort::recvReqRetry()
 379 {
 380     //we've receeived a retry. Schedule a probeTLBEvent
 381     if (!coalescer->probeTLBEvent.scheduled())
 382         coalescer->schedule(coalescer->probeTLBEvent,
 383                 curTick() + coalescer->clockPeriod());
 384 }
 385
 386 void
 387 TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
 388 {
 389     fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
 390 }
 391
 392 /*
 393  * Here we scan the coalescer FIFO and issue the max
 394  * number of permitted probes to the TLB below. We
 395  * permit bypassing of coalesced requests for the same
 396  * tick_index.
 397  *
 398  * We do not access the next tick_index unless we've
 399  * drained the previous one. The coalesced requests
 400  * that are successfully sent are moved to the
 401  * issuedTranslationsTable table (the table which keeps
 402  * track of the outstanding reqs)
 403  */
 404 void
 405 TLBCoalescer::processProbeTLBEvent()
 406 {
 407     // number of TLB probes sent so far
 408     int sent_probes = 0;
 409     // rejected denotes a blocking event
 410     bool rejected = false;
 411
 412     // It is set to true either when the recvTiming of the TLB below
 413     // returns false or when there is another outstanding request for the
 414     // same virt. page.
 415
 416     DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);
 417
 418     for (auto iter = coalescerFIFO.begin();
 419          iter != coalescerFIFO.end() && !rejected; ) {
 420         int coalescedReq_cnt = iter->second.size();
 421         int i = 0;
 422         int vector_index = 0;
 423
 424         DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
 425                coalescedReq_cnt, iter->first);
 426
 427         while (i < coalescedReq_cnt) {
 428             ++i;
 429             PacketPtr first_packet = iter->second[vector_index][0];
 430
 431             // compute virtual page address for this request
 432             Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
 433                     TheISA::PageBytes);
 434
 435             // is there another outstanding request for the same page addr?
 436             int pending_reqs =
 437                 issuedTranslationsTable.count(virt_page_addr);
 438
 439             if (pending_reqs) {
 440                 DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
 441                         "page %#x\n", virt_page_addr);
 442
 443                 ++vector_index;
 444                 rejected = true;
 445
 446                 continue;
 447             }
 448
 449             // send the coalesced request for virt_page_addr
 450             if (!memSidePort[0]->sendTimingReq(first_packet)) {
 451                 DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
 452                        virt_page_addr);
 453
 454                 // No need for a retries queue since we are already buffering
 455                 // the coalesced request in coalescerFIFO.
 456                 rejected = true;
 457                 ++vector_index;
 458             } else {
 459                 TheISA::GpuTLB::TranslationState *tmp_sender_state =
 460                     safe_cast<TheISA::GpuTLB::TranslationState*>
 461                     (first_packet->senderState);
 462
 463                 bool update_stats = !tmp_sender_state->prefetch;
 464
 465                 if (update_stats) {
 466                     // req_cnt is total number of packets represented
 467                     // by the one we just sent counting all the way from
 468                     // the top of TLB hiearchy (i.e., from the CU)
 469                     int req_cnt = tmp_sender_state->reqCnt.back();
 470                     queuingCycles += (curTick() * req_cnt);
 471
 472                     DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
 473                             name(), req_cnt);
 474
 475                     // pkt_cnt is number of packets we coalesced into the one
 476                     // we just sent but only at this coalescer level
 477                     int pkt_cnt = iter->second[vector_index].size();
 478                     localqueuingCycles += (curTick() * pkt_cnt);
 479                 }
 480
 481                 DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
 482                        virt_page_addr);
 483
 484                 //copy coalescedReq to issuedTranslationsTable
 485                 issuedTranslationsTable[virt_page_addr]
 486                     = iter->second[vector_index];
 487
 488                 //erase the entry of this coalesced req
 489                 iter->second.erase(iter->second.begin() + vector_index);
 490
 491                 if (iter->second.empty())
 492                     assert(i == coalescedReq_cnt);
 493
 494                 sent_probes++;
 495                 if (sent_probes == TLBProbesPerCycle)
 496                    return;
 497             }
 498         }
 499
 500         //if there are no more coalesced reqs for this tick_index
 501         //erase the hash_map with the first iterator
 502         if (iter->second.empty()) {
 503             coalescerFIFO.erase(iter++);
 504         } else {
 505             ++iter;
 506         }
 507     }
 508 }
 509
 510 void
 511 TLBCoalescer::processCleanupEvent()
 512 {
 513     while (!cleanupQueue.empty()) {
 514         Addr cleanup_addr = cleanupQueue.front();
 515         cleanupQueue.pop();
 516         issuedTranslationsTable.erase(cleanup_addr);
 517
 518         DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
 519                 cleanup_addr);
 520     }
 521 }
 522
 523 void
 524 TLBCoalescer::regStats()
 525 {
 526     ClockedObject::regStats();
 527
 528     uncoalescedAccesses
 529         .name(name() + ".uncoalesced_accesses")
 530         .desc("Number of uncoalesced TLB accesses")
 531         ;
 532
 533     coalescedAccesses
 534         .name(name() + ".coalesced_accesses")
 535         .desc("Number of coalesced TLB accesses")
 536         ;
 537
 538     queuingCycles
 539         .name(name() + ".queuing_cycles")
 540         .desc("Number of cycles spent in queue")
 541         ;
 542
 543     localqueuingCycles
 544         .name(name() + ".local_queuing_cycles")
 545         .desc("Number of cycles spent in queue for all incoming reqs")
 546         ;
 547
 548     localLatency
 549         .name(name() + ".local_latency")
 550         .desc("Avg. latency over all incoming pkts")
 551         ;
 552
 553     localLatency = localqueuingCycles / uncoalescedAccesses;
 554 }
 555
 556
 557 TLBCoalescer*
 558 TLBCoalescerParams::create()
 559 {
 560     return new TLBCoalescer(this);
 561 }
 562