src/mem/ruby/system/GPUCoalescer.cc
1 /*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "base/logging.hh"
35 #include "base/str.hh"
36 #include "config/the_isa.hh"
37
38 #if THE_ISA == X86_ISA
39 #include "arch/x86/insts/microldstop.hh"
40
41 #endif // X86_ISA
42 #include "mem/ruby/system/GPUCoalescer.hh"
43
44 #include "cpu/testers/rubytest/RubyTester.hh"
45 #include "debug/GPUCoalescer.hh"
46 #include "debug/MemoryAccess.hh"
47 #include "debug/ProtocolTrace.hh"
48 #include "debug/RubyPort.hh"
49 #include "debug/RubyStats.hh"
50 #include "gpu-compute/shader.hh"
51 #include "mem/packet.hh"
52 #include "mem/ruby/common/DataBlock.hh"
53 #include "mem/ruby/common/SubBlock.hh"
54 #include "mem/ruby/network/MessageBuffer.hh"
55 #include "mem/ruby/profiler/Profiler.hh"
56 #include "mem/ruby/slicc_interface/AbstractController.hh"
57 #include "mem/ruby/slicc_interface/RubyRequest.hh"
58 #include "mem/ruby/structures/CacheMemory.hh"
59 #include "mem/ruby/system/RubySystem.hh"
60 #include "params/RubyGPUCoalescer.hh"
61
62 using namespace std;
63
64 GPUCoalescer *
65 RubyGPUCoalescerParams::create()
66 {
67 return new GPUCoalescer(this);
68 }
69
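// Helpers that translate the scope and segment flags carried on a gem5
// Request into the HSAScope/HSASegment enums used by the Ruby GPU
// protocols.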
70 HSAScope
71 reqScopeToHSAScope(const RequestPtr &req)
72 {
73 HSAScope accessScope = HSAScope_UNSPECIFIED;
74 if (req->isScoped()) {
75 if (req->isWavefrontScope()) {
76 accessScope = HSAScope_WAVEFRONT;
77 } else if (req->isWorkgroupScope()) {
78 accessScope = HSAScope_WORKGROUP;
79 } else if (req->isDeviceScope()) {
80 accessScope = HSAScope_DEVICE;
81 } else if (req->isSystemScope()) {
82 accessScope = HSAScope_SYSTEM;
83 } else {
84 fatal("Bad scope type");
85 }
86 }
87 return accessScope;
88 }
89
90 HSASegment
91 reqSegmentToHSASegment(const RequestPtr &req)
92 {
93 HSASegment accessSegment = HSASegment_GLOBAL;
94
95 if (req->isGlobalSegment()) {
96 accessSegment = HSASegment_GLOBAL;
97 } else if (req->isGroupSegment()) {
98 accessSegment = HSASegment_GROUP;
99 } else if (req->isPrivateSegment()) {
100 accessSegment = HSASegment_PRIVATE;
101 } else if (req->isKernargSegment()) {
102 accessSegment = HSASegment_KERNARG;
103 } else if (req->isReadonlySegment()) {
104 accessSegment = HSASegment_READONLY;
105 } else if (req->isSpillSegment()) {
106 accessSegment = HSASegment_SPILL;
107 } else if (req->isArgSegment()) {
108 accessSegment = HSASegment_ARG;
109 } else {
110 fatal("Bad segment type");
111 }
112
113 return accessSegment;
114 }
115
116 UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
117 : coalescer(gc)
118 {
119 }
120
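// Packets are grouped by the dynamic sequence number of the memory
// instruction that generated them, so packets issued by the same
// instruction share one table entry.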
121 void
122 UncoalescedTable::insertPacket(PacketPtr pkt)
123 {
124 uint64_t seqNum = pkt->req->getReqInstSeqNum();
125
126 instMap[seqNum].push_back(pkt);
127 DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
128 pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
129 }
130
131 bool
132 UncoalescedTable::packetAvailable()
133 {
134 return !instMap.empty();
135 }
136
137 PerInstPackets*
138 UncoalescedTable::getInstPackets(int offset)
139 {
140 if (offset >= instMap.size()) {
141 return nullptr;
142 }
143
144 auto instMapIter = instMap.begin();
145 std::advance(instMapIter, offset);
146
147 return &(instMapIter->second);
148 }
149
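// Retire table entries whose packets have all been coalesced and return
// one token through the GM token port for each retired instruction.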
150 void
151 UncoalescedTable::updateResources()
152 {
153 for (auto iter = instMap.begin(); iter != instMap.end(); ) {
154 if (iter->second.empty()) {
155 instMap.erase(iter++);
156 coalescer->getGMTokenPort().sendTokens(1);
157 } else {
158 ++iter;
159 }
160 }
161 }
162
163 void
164 UncoalescedTable::printRequestTable(std::stringstream& ss)
165 {
166 ss << "UncoalescedTable contains " << instMap.size()
167 << " instruction entries." << std::endl;
168 for (auto& inst : instMap) {
169 ss << "Instruction seqNum " << inst.first
170 << " with " << inst.second.size() << " packets"
171 << std::endl;
172 }
173 }
174
175 void
176 UncoalescedTable::checkDeadlock(Tick threshold)
177 {
178 Tick current_time = curTick();
179
180 for (auto &it : instMap) {
181 for (auto &pkt : it.second) {
182 if (current_time - pkt->req->time() > threshold) {
183 std::stringstream ss;
184 printRequestTable(ss);
185
186 panic("Possible Deadlock detected. Aborting!\n"
187 "version: %d request.paddr: 0x%x uncoalescedTable: %d "
188 "current time: %u issue_time: %d difference: %d\n"
189 "Request Tables:\n\n%s", coalescer->getId(),
190 pkt->getAddr(), instMap.size(), current_time,
191 pkt->req->time(), current_time - pkt->req->time(),
192 ss.str());
193 }
194 }
195 }
196 }
197
198 GPUCoalescer::GPUCoalescer(const Params *p)
199 : RubyPort(p),
200 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
201 false, Event::Progress_Event_Pri),
202 uncoalescedTable(this),
203 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
204 gmTokenPort(name() + ".gmTokenPort", this)
205 {
206 m_store_waiting_on_load_cycles = 0;
207 m_store_waiting_on_store_cycles = 0;
208 m_load_waiting_on_store_cycles = 0;
209 m_load_waiting_on_load_cycles = 0;
210
211 m_outstanding_count = 0;
212
213 coalescingWindow = p->max_coalesces_per_cycle;
214
215 m_max_outstanding_requests = 0;
216 m_instCache_ptr = nullptr;
217 m_dataCache_ptr = nullptr;
218
219 m_instCache_ptr = p->icache;
220 m_dataCache_ptr = p->dcache;
221 m_max_outstanding_requests = p->max_outstanding_requests;
222 m_deadlock_threshold = p->deadlock_threshold;
223
224 assert(m_max_outstanding_requests > 0);
225 assert(m_deadlock_threshold > 0);
226 assert(m_instCache_ptr);
227 assert(m_dataCache_ptr);
228
229 m_runningGarnetStandalone = p->garnet_standalone;
230 assumingRfOCoherence = p->assume_rfo;
231 }
232
233 GPUCoalescer::~GPUCoalescer()
234 {
235 }
236
237 Port &
238 GPUCoalescer::getPort(const std::string &if_name, PortID idx)
239 {
240 if (if_name == "gmTokenPort") {
241 return gmTokenPort;
242 }
243
244 // delegate to RubyPort otherwise
245 return RubyPort::getPort(if_name, idx);
246 }
247
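// Deadlock check: runs periodically while requests are outstanding and
// panics if any coalesced or uncoalesced request has been waiting longer
// than the configured deadlock threshold.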
248 void
249 GPUCoalescer::wakeup()
250 {
251 Cycles current_time = curCycle();
252 for (auto& requestList : coalescedTable) {
253 for (auto& req : requestList.second) {
254 if (current_time - req->getIssueTime() > m_deadlock_threshold) {
255 std::stringstream ss;
256 printRequestTable(ss);
257 ss << "Outstanding requests: " << m_outstanding_count
258 << std::endl;
259
260 panic("Possible Deadlock detected. Aborting!\n"
261 "version: %d request.paddr: 0x%x coalescedTable: %d "
262 "current time: %u issue_time: %d difference: %d\n"
263 "Request Tables:\n %s", m_version,
264 req->getFirstPkt()->getAddr(),
265 coalescedTable.size(), cyclesToTicks(current_time),
266 cyclesToTicks(req->getIssueTime()),
267 cyclesToTicks(current_time - req->getIssueTime()),
268 ss.str());
269 }
270 }
271 }
272
273 Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
274 uncoalescedTable.checkDeadlock(tick_threshold);
275
276 if (m_outstanding_count > 0) {
277 schedule(deadlockCheckEvent,
278 m_deadlock_threshold * clockPeriod() +
279 curTick());
280 }
281 }
282
283 void
284 GPUCoalescer::printRequestTable(std::stringstream& ss)
285 {
286 uncoalescedTable.printRequestTable(ss);
287
288 ss << "CoalescedTable contains " << coalescedTable.size()
289 << " address entries." << std::endl;
290 for (auto& requestList : coalescedTable) {
291 ss << "Addr 0x" << std::hex << requestList.first << std::dec
292 << ": type-";
293 for (auto& request : requestList.second) {
294 ss << RubyRequestType_to_string(request->getRubyType())
295 << " pkts-" << request->getPackets().size()
296 << " issued-" << request->getIssueTime() << " seqNum-"
297 << request->getSeqNum() << "; ";
298 }
299 ss << std::endl;
300 }
301 }
302
303 void
304 GPUCoalescer::resetStats()
305 {
306 m_latencyHist.reset();
307 m_missLatencyHist.reset();
308 for (int i = 0; i < RubyRequestType_NUM; i++) {
309 m_typeLatencyHist[i]->reset();
310 m_missTypeLatencyHist[i]->reset();
311 for (int j = 0; j < MachineType_NUM; j++) {
312 m_missTypeMachLatencyHist[i][j]->reset();
313 }
314 }
315
316 for (int i = 0; i < MachineType_NUM; i++) {
317 m_missMachLatencyHist[i]->reset();
318
319 m_IssueToInitialDelayHist[i]->reset();
320 m_InitialToForwardDelayHist[i]->reset();
321 m_ForwardToFirstResponseDelayHist[i]->reset();
322 m_FirstResponseToCompletionDelayHist[i]->reset();
323 }
324 }
325
326 void
327 GPUCoalescer::printProgress(ostream& out) const
328 {
329 }
330
331 // sets the kernelEndList
332 void
333 GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
334 {
335 // It is not clear whether this can actually happen, but be careful
336 // here so that a duplicate wavefront id does not silently turn into
337 // a simulator hang in the future.
338 DPRINTF(GPUCoalescer, "inserting wf: %d into kernelEndList\n", wavefront_id);
339 assert(kernelEndList.count(wavefront_id) == 0);
340
341 kernelEndList[wavefront_id] = pkt;
342 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
343 kernelEndList.size());
344 }
345
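// The shorter writeCallback (and readCallback) overloads below simply
// forward to the full version, filling in a NULL machine type, zero
// timing values, and isRegion = false.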
346 void
347 GPUCoalescer::writeCallback(Addr address, DataBlock& data)
348 {
349 writeCallback(address, MachineType_NULL, data);
350 }
351
352 void
353 GPUCoalescer::writeCallback(Addr address,
354 MachineType mach,
355 DataBlock& data)
356 {
357 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
358 }
359
360 void
361 GPUCoalescer::writeCallback(Addr address,
362 MachineType mach,
363 DataBlock& data,
364 Cycles initialRequestTime,
365 Cycles forwardRequestTime,
366 Cycles firstResponseTime)
367 {
368 writeCallback(address, mach, data,
369 initialRequestTime, forwardRequestTime, firstResponseTime,
370 false);
371 }
372
373 void
374 GPUCoalescer::writeCallback(Addr address,
375 MachineType mach,
376 DataBlock& data,
377 Cycles initialRequestTime,
378 Cycles forwardRequestTime,
379 Cycles firstResponseTime,
380 bool isRegion)
381 {
382 assert(address == makeLineAddress(address));
383 assert(coalescedTable.count(address));
384
385 auto crequest = coalescedTable.at(address).front();
386
387 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
388 forwardRequestTime, firstResponseTime, isRegion);
389
390 delete crequest;
391 coalescedTable.at(address).pop_front();
392
393 if (coalescedTable.at(address).empty()) {
394 coalescedTable.erase(address);
395 } else {
396 auto nextRequest = coalescedTable.at(address).front();
397 issueRequest(nextRequest);
398 }
399 }
400
401 void
402 GPUCoalescer::readCallback(Addr address, DataBlock& data)
403 {
404 readCallback(address, MachineType_NULL, data);
405 }
406
407 void
408 GPUCoalescer::readCallback(Addr address,
409 MachineType mach,
410 DataBlock& data)
411 {
412 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
413 }
414
415 void
416 GPUCoalescer::readCallback(Addr address,
417 MachineType mach,
418 DataBlock& data,
419 Cycles initialRequestTime,
420 Cycles forwardRequestTime,
421 Cycles firstResponseTime)
422 {
423
424 readCallback(address, mach, data,
425 initialRequestTime, forwardRequestTime, firstResponseTime,
426 false);
427 }
428
429 void
430 GPUCoalescer::readCallback(Addr address,
431 MachineType mach,
432 DataBlock& data,
433 Cycles initialRequestTime,
434 Cycles forwardRequestTime,
435 Cycles firstResponseTime,
436 bool isRegion)
437 {
438 assert(address == makeLineAddress(address));
439 assert(coalescedTable.count(address));
440
441 auto crequest = coalescedTable.at(address).front();
442 fatal_if(crequest->getRubyType() != RubyRequestType_LD,
443 "readCallback received non-read type response\n");
444
445 // Iterate over the coalesced requests to respond to as many loads as
446 // possible until another request type is seen. Models MSHR for TCP.
447 while (crequest->getRubyType() == RubyRequestType_LD) {
448 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
449 forwardRequestTime, firstResponseTime, isRegion);
450
451 delete crequest;
452 coalescedTable.at(address).pop_front();
453 if (coalescedTable.at(address).empty()) {
454 break;
455 }
456
457 crequest = coalescedTable.at(address).front();
458 }
459
460 if (coalescedTable.at(address).empty()) {
461 coalescedTable.erase(address);
462 } else {
463 auto nextRequest = coalescedTable.at(address).front();
464 issueRequest(nextRequest);
465 }
466 }
467
468 void
469 GPUCoalescer::hitCallback(CoalescedRequest* crequest,
470 MachineType mach,
471 DataBlock& data,
472 bool success,
473 Cycles initialRequestTime,
474 Cycles forwardRequestTime,
475 Cycles firstResponseTime,
476 bool isRegion)
477 {
478 PacketPtr pkt = crequest->getFirstPkt();
479 Addr request_address = pkt->getAddr();
480 Addr request_line_address = makeLineAddress(request_address);
481
482 RubyRequestType type = crequest->getRubyType();
483
484 DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
485
486 recordMissLatency(crequest, mach,
487 initialRequestTime,
488 forwardRequestTime,
489 firstResponseTime,
490 success, isRegion);
491 // Update the data for every packet that was coalesced into this
492 // request; each packet may read or write a different portion of
493 // the cache line.
494 std::vector<PacketPtr> pktList = crequest->getPackets();
495 DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
496 pktList.size(), request_line_address);
497 for (auto& pkt : pktList) {
498 request_address = pkt->getAddr();
499 if (pkt->getPtr<uint8_t>()) {
500 if ((type == RubyRequestType_LD) ||
501 (type == RubyRequestType_ATOMIC) ||
502 (type == RubyRequestType_ATOMIC_RETURN) ||
503 (type == RubyRequestType_IFETCH) ||
504 (type == RubyRequestType_RMW_Read) ||
505 (type == RubyRequestType_Locked_RMW_Read) ||
506 (type == RubyRequestType_Load_Linked)) {
507 pkt->setData(
508 data.getData(getOffset(request_address), pkt->getSize()));
509 } else {
510 data.setData(pkt->getPtr<uint8_t>(),
511 getOffset(request_address), pkt->getSize());
512 }
513 } else {
514 DPRINTF(MemoryAccess,
515 "WARNING. Data not transferred from Ruby to M5 for type " \
516 "%s\n",
517 RubyRequestType_to_string(type));
518 }
519
520 // If using the RubyTester, update the RubyTester sender state's
521 // subBlock with the received data. The tester will later access
522 // this state.
523 // Note: RubyPort will access its sender state before the
524 // RubyTester.
525 if (m_usingRubyTester) {
526 RubyPort::SenderState *requestSenderState =
527 safe_cast<RubyPort::SenderState*>(pkt->senderState);
528 RubyTester::SenderState* testerSenderState =
529 safe_cast<RubyTester::SenderState*>
530 (requestSenderState->predecessor);
531 testerSenderState->subBlock.mergeFrom(data);
532 }
533 }
534
535
536
537 m_outstanding_count--;
538 assert(m_outstanding_count >= 0);
539
540 completeHitCallback(pktList);
541 }
542
543 bool
544 GPUCoalescer::empty() const
545 {
546 return coalescedTable.empty();
547 }
548
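// Map a gem5 packet onto the RubyRequestType understood by the GPU
// protocol (atomics, loads, and stores only).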
549 RubyRequestType
550 GPUCoalescer::getRequestType(PacketPtr pkt)
551 {
552 RubyRequestType req_type = RubyRequestType_NULL;
553
554 // These types are not supported or are not used in GPU caches.
555 assert(!pkt->req->isLLSC());
556 assert(!pkt->req->isLockedRMW());
557 assert(!pkt->req->isInstFetch());
558 assert(!pkt->isFlush());
559
560 if (pkt->req->isAtomicReturn()) {
561 req_type = RubyRequestType_ATOMIC_RETURN;
562 } else if (pkt->req->isAtomicNoReturn()) {
563 req_type = RubyRequestType_ATOMIC_NO_RETURN;
564 } else if (pkt->isRead()) {
565 req_type = RubyRequestType_LD;
566 } else if (pkt->isWrite()) {
567 req_type = RubyRequestType_ST;
568 } else {
569 // Acquire and release packets will have been issued by
570 // makeRequest, so we do not need to check for them here.
571 panic("Unsupported ruby packet type\n");
572 }
573
574 return req_type;
575 }
576
577 // Places an uncoalesced packet in uncoalescedTable. If the packet is a
578 // special type (MemFence, scoping, etc), it is issued immediately.
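// All other packets are buffered in the uncoalescedTable and coalesced
// by completeIssue, which is scheduled in the same tick.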
579 RequestStatus
580 GPUCoalescer::makeRequest(PacketPtr pkt)
581 {
582 // Check for GPU Barrier Kernel End or Kernel Begin
583 // Leave these to be handled by the child class
584 // Kernel End/Barrier = isFlush + isRelease
585 // Kernel Begin = isFlush + isAcquire
586 if (pkt->req->isKernel()) {
587 if (pkt->req->isAcquire()) {
588 // This is a kernel begin; leave handling to the virtual
589 // xCoalescer::makeRequest.
590 return RequestStatus_Issued;
591 } else if (pkt->req->isRelease()) {
592 // This is a kernel end; leave handling to the virtual
593 // xCoalescer::makeRequest.
594 // If we are here then no derived class overrode this
595 // function, so we will also schedule the callback
596 // from here.
597 int wf_id = 0;
598 if (pkt->req->hasContextId()) {
599 wf_id = pkt->req->contextId();
600 }
601 insertKernel(wf_id, pkt);
602 newKernelEnds.push_back(wf_id);
603 if (!issueEvent.scheduled()) {
604 schedule(issueEvent, curTick());
605 }
606 return RequestStatus_Issued;
607 }
608 }
609
610 if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
611 !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
612 (pkt->req->isRelease() || pkt->req->isAcquire())) {
613 if (assumingRfOCoherence) {
614 // If we reached here, this request must be a memFence.
615 // Because the protocol implements RfO, the coalescer can
616 // assume sequential consistency and schedule the callback
617 // immediately.
618 // Currently the code implements fence callbacks
619 // by reusing the mechanism for kernel completions.
620 // This should be fixed.
621 int wf_id = 0;
622 if (pkt->req->hasContextId()) {
623 wf_id = pkt->req->contextId();
624 }
625 insertKernel(wf_id, pkt);
626 newKernelEnds.push_back(wf_id);
627 if (!issueEvent.scheduled()) {
628 schedule(issueEvent, curTick());
629 }
630 return RequestStatus_Issued;
631 } else {
632 // If not RfO, return issued here and let the child coalescer
633 // take care of it.
634 return RequestStatus_Issued;
635 }
636 }
637
638 uncoalescedTable.insertPacket(pkt);
639 DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());
640
641 if (!issueEvent.scheduled())
642 schedule(issueEvent, curTick());
643 // TODO: issue hardware prefetches here
644 return RequestStatus_Issued;
645 }
646
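// Build a single RubyRequest that covers every packet coalesced into
// crequest: a per-byte access mask marks the touched bytes, write data
// and atomic operations are gathered into a DataBlock, and the message
// is enqueued on the mandatory queue.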
647 void
648 GPUCoalescer::issueRequest(CoalescedRequest* crequest)
649 {
650 PacketPtr pkt = crequest->getFirstPkt();
651
652 int proc_id = -1;
653 if (pkt != NULL && pkt->req->hasContextId()) {
654 proc_id = pkt->req->contextId();
655 }
656
657 // If valid, copy the pc to the ruby request
658 Addr pc = 0;
659 if (pkt->req->hasPC()) {
660 pc = pkt->req->getPC();
661 }
662
663 // At the moment, setting scopes only matters for GPU spill space
664 // accesses, which is pkt->req->isStack().
665 // The scope for spill accesses is REPLACE since spill data does not
666 // need to be flushed at the end of a kernel.
667 // Private and local data, on the other hand, may need to be visible
668 // at the end of the kernel.
670 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
671 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
672
673 Addr line_addr = makeLineAddress(pkt->getAddr());
674
675 // Create a WriteMask that records written bytes and atomic
676 // operations. This enables partial writes and partial reads of
677 // those writes.
678 DataBlock dataBlock;
679 dataBlock.clear();
680 uint32_t blockSize = RubySystem::getBlockSizeBytes();
681 std::vector<bool> accessMask(blockSize,false);
682 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
683 uint32_t tableSize = crequest->getPackets().size();
684 for (int i = 0; i < tableSize; i++) {
685 PacketPtr tmpPkt = crequest->getPackets()[i];
686 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
687 uint32_t tmpSize = tmpPkt->getSize();
688 if (tmpPkt->isAtomicOp()) {
689 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
690 tmpPkt->getAtomicOp());
691 atomicOps.push_back(tmpAtomicOp);
692 } else if (tmpPkt->isWrite()) {
693 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
694 tmpOffset, tmpSize);
695 }
696 for (int j = 0; j < tmpSize; j++) {
697 accessMask[tmpOffset + j] = true;
698 }
699 }
700 std::shared_ptr<RubyRequest> msg;
701 if (pkt->isAtomicOp()) {
702 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
703 pkt->getPtr<uint8_t>(),
704 pkt->getSize(), pc, crequest->getRubyType(),
705 RubyAccessMode_Supervisor, pkt,
706 PrefetchBit_No, proc_id, 100,
707 blockSize, accessMask,
708 dataBlock, atomicOps,
709 accessScope, accessSegment);
710 } else {
711 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
712 pkt->getPtr<uint8_t>(),
713 pkt->getSize(), pc, crequest->getRubyType(),
714 RubyAccessMode_Supervisor, pkt,
715 PrefetchBit_No, proc_id, 100,
716 blockSize, accessMask,
717 dataBlock,
718 accessScope, accessSegment);
719 }
720 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
721 curTick(), m_version, "Coal", "Begin", "", "",
722 printAddress(msg->getPhysicalAddress()),
723 RubyRequestType_to_string(crequest->getRubyType()));
724
725 fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
726 "there should not be any I-Fetch requests in the GPU Coalescer");
727
728 Tick latency = cyclesToTicks(
729 m_controller->mandatoryQueueLatency(crequest->getRubyType()));
730 assert(latency > 0);
731
732 if (!deadlockCheckEvent.scheduled()) {
733 schedule(deadlockCheckEvent,
734 m_deadlock_threshold * clockPeriod() +
735 curTick());
736 }
737
738 assert(m_mandatory_q_ptr);
739 m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
740 }
741
742 template <class KEY, class VALUE>
743 std::ostream &
744 operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
745 {
746 out << "[";
747 for (auto i = map.begin(); i != map.end(); ++i)
748 out << " " << i->first << "=" << i->second;
749 out << " ]";
750
751 return out;
752 }
753
754 void
755 GPUCoalescer::print(ostream& out) const
756 {
757 out << "[GPUCoalescer: " << m_version
758 << ", outstanding requests: " << m_outstanding_count
759 << "]";
760 }
761
762
763 void
764 GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
765 DPRINTF(RubyStats, "Recorded statistic: %s\n",
766 SequencerRequestType_to_string(requestType));
767 }
768
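// Try to coalesce one uncoalesced packet. The packet joins an existing
// CoalescedRequest only if it targets the same cache line and carries
// the same instruction seqNum; otherwise a new CoalescedRequest is
// created, provided the outstanding-request limit has not been reached.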
769 bool
770 GPUCoalescer::coalescePacket(PacketPtr pkt)
771 {
772 uint64_t seqNum = pkt->req->getReqInstSeqNum();
773 Addr line_addr = makeLineAddress(pkt->getAddr());
774
775 // If the packet has the same line address as a request already in the
776 // coalescedTable and has the same sequence number, it can be coalesced.
777 if (coalescedTable.count(line_addr)) {
778 // Search for a previous coalesced request with the same seqNum.
779 auto& creqQueue = coalescedTable.at(line_addr);
780 auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
781 [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
782 );
783 if (citer != creqQueue.end()) {
784 (*citer)->insertPacket(pkt);
785 return true;
786 }
787 }
788
789 if (m_outstanding_count < m_max_outstanding_requests) {
790 // This is an "aliased" or new request. Create a CoalescedRequest
791 // and append it to the list of "targets" in the coalescing table.
792 DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
793 line_addr);
794
795 CoalescedRequest *creq = new CoalescedRequest(seqNum);
796 creq->insertPacket(pkt);
797 creq->setRubyType(getRequestType(pkt));
798 creq->setIssueTime(curCycle());
799
800 if (!coalescedTable.count(line_addr)) {
801 // If there is no outstanding request for this line address,
802 // create a new coalesced request and issue it immediately.
803 auto reqList = std::deque<CoalescedRequest*> { creq };
804 coalescedTable.insert(std::make_pair(line_addr, reqList));
805
806 DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
807 RubyRequestType_to_string(creq->getRubyType()), seqNum);
808 issueRequest(creq);
809 } else {
810 // The request is for a line address that is already outstanding
811 // but for a different instruction. Add it as a new request to be
812 // issued when the current outstanding request is completed.
813 coalescedTable.at(line_addr).push_back(creq);
814 DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
815 line_addr, seqNum);
816 }
817
818 // In both cases, requests are added to the coalescing table and will
819 // be counted as outstanding requests.
820 m_outstanding_count++;
821
822 return true;
823 }
824
825 // The maximum number of outstanding requests have been issued.
826 return false;
827 }
828
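// Runs from issueEvent: coalesces packets for up to coalescingWindow
// instructions this cycle, retires fully coalesced instructions from
// the uncoalesced table, and fires any pending kernel-end callbacks.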
829 void
830 GPUCoalescer::completeIssue()
831 {
832 // Iterate over the maximum number of instructions we can coalesce
833 // per cycle (coalescingWindow).
834 for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
835 PerInstPackets *pktList =
836 uncoalescedTable.getInstPackets(instIdx);
837
838 // getInstPackets will return nullptr if no instruction
839 // exists at the current offset.
840 if (!pktList) {
841 break;
842 } else {
843 // Since we have a pointer to the list of packets in the inst,
844 // erase them from the list if coalescing is successful and
845 // leave them in the list otherwise. This aggressively attempts
846 // to coalesce as many packets as possible from the current inst.
847 pktList->remove_if(
848 [&](PacketPtr pkt) { return coalescePacket(pkt); }
849 );
850 }
851 }
852
853 // Clean up any instructions in the uncoalesced table that have had
854 // all of their packets coalesced, and return a GM token for each one.
855 uncoalescedTable.updateResources();
856
857 // Have any kernel-end releases been issued this cycle?
858 int len = newKernelEnds.size();
859 for (int i = 0; i < len; i++) {
860 kernelCallback(newKernelEnds[i]);
861 }
862 newKernelEnds.clear();
863 }
864
865 void
866 GPUCoalescer::evictionCallback(Addr address)
867 {
868 ruby_eviction_callback(address);
869 }
870
871 void
872 GPUCoalescer::kernelCallback(int wavefront_id)
873 {
874 assert(kernelEndList.count(wavefront_id));
875
876 ruby_hit_callback(kernelEndList[wavefront_id]);
877
878 kernelEndList.erase(wavefront_id);
879 }
880
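// Completion path for atomic requests; analogous to writeCallback but
// verifies that the request at the head of the queue is an atomic.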
881 void
882 GPUCoalescer::atomicCallback(Addr address,
883 MachineType mach,
884 const DataBlock& data)
885 {
886 assert(address == makeLineAddress(address));
887 assert(coalescedTable.count(address));
888
889 auto crequest = coalescedTable.at(address).front();
890
891 fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
892 crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
893 crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
894 "atomicCallback saw non-atomic type response\n");
895
896 hitCallback(crequest, mach, (DataBlock&)data, true,
897 crequest->getIssueTime(), Cycles(0), Cycles(0), false);
898
899 delete crequest;
900 coalescedTable.at(address).pop_front();
901
902 if (coalescedTable.at(address).empty()) {
903 coalescedTable.erase(address);
904 } else {
905 auto nextRequest = coalescedTable.at(address).front();
906 issueRequest(nextRequest);
907 }
908 }
909
910 void
911 GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
912 {
913 if (myMachID == senderMachID) {
914 CP_TCPLdHits++;
915 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
916 CP_TCPLdTransfers++;
917 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
918 CP_TCCLdHits++;
919 } else {
920 CP_LdMiss++;
921 }
922 }
923
924 void
925 GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
926 {
927 if (myMachID == senderMachID) {
928 CP_TCPStHits++;
929 } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
930 CP_TCPStTransfers++;
931 } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
932 CP_TCCStHits++;
933 } else {
934 CP_StMiss++;
935 }
936 }
937
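// Return each packet to the slave port it arrived on, restoring the
// original sender state, and reschedule the issue event if packets are
// still waiting in the uncoalesced table.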
938 void
939 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
940 {
941 for (auto& pkt : mylist) {
942 RubyPort::SenderState *ss =
943 safe_cast<RubyPort::SenderState *>(pkt->senderState);
944 MemSlavePort *port = ss->port;
945 assert(port != NULL);
946
947 pkt->senderState = ss->predecessor;
948 delete ss;
949 port->hitCallback(pkt);
950 trySendRetries();
951 }
952
953 // We schedule an event in the same tick as hitCallback (similar to
954 // makeRequest) rather than calling completeIssue directly in order
955 // to reduce the number of calls to completeIssue. This can only
956 // happen if the max outstanding requests is less than the number of
957 // slots in the uncoalesced table and makeRequest is not called again.
958 if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
959 schedule(issueEvent, curTick());
960 }
961
962 testDrainComplete();
963 }
964
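// Record which machine serviced the request and profile the total
// latency, splitting it into the standard Ruby issue/forward/response
// segments when those timestamps are available.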
965 void
966 GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
967 MachineType mach,
968 Cycles initialRequestTime,
969 Cycles forwardRequestTime,
970 Cycles firstResponseTime,
971 bool success, bool isRegion)
972 {
973 RubyRequestType type = crequest->getRubyType();
974 Cycles issued_time = crequest->getIssueTime();
975 Cycles completion_time = curCycle();
976 assert(completion_time >= issued_time);
977 Cycles total_lat = completion_time - issued_time;
978
979 // cache stats (valid for RfO protocol only)
980 if (mach == MachineType_TCP) {
981 if (type == RubyRequestType_LD) {
982 GPU_TCPLdHits++;
983 } else {
984 GPU_TCPStHits++;
985 }
986 } else if (mach == MachineType_L1Cache_wCC) {
987 if (type == RubyRequestType_LD) {
988 GPU_TCPLdTransfers++;
989 } else {
990 GPU_TCPStTransfers++;
991 }
992 } else if (mach == MachineType_TCC) {
993 if (type == RubyRequestType_LD) {
994 GPU_TCCLdHits++;
995 } else {
996 GPU_TCCStHits++;
997 }
998 } else {
999 if (type == RubyRequestType_LD) {
1000 GPU_LdMiss++;
1001 } else {
1002 GPU_StMiss++;
1003 }
1004 }
1005
1006 // Profile all access latency, even zero latency accesses
1007 m_latencyHist.sample(total_lat);
1008 m_typeLatencyHist[type]->sample(total_lat);
1009
1010 // Profile the miss latency for all non-zero demand misses
1011 if (total_lat != Cycles(0)) {
1012 m_missLatencyHist.sample(total_lat);
1013 m_missTypeLatencyHist[type]->sample(total_lat);
1014
1015 if (mach != MachineType_NUM) {
1016 m_missMachLatencyHist[mach]->sample(total_lat);
1017 m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
1018
1019 if ((issued_time <= initialRequestTime) &&
1020 (initialRequestTime <= forwardRequestTime) &&
1021 (forwardRequestTime <= firstResponseTime) &&
1022 (firstResponseTime <= completion_time)) {
1023
1024 m_IssueToInitialDelayHist[mach]->sample(
1025 initialRequestTime - issued_time);
1026 m_InitialToForwardDelayHist[mach]->sample(
1027 forwardRequestTime - initialRequestTime);
1028 m_ForwardToFirstResponseDelayHist[mach]->sample(
1029 firstResponseTime - forwardRequestTime);
1030 m_FirstResponseToCompletionDelayHist[mach]->sample(
1031 completion_time - firstResponseTime);
1032 }
1033 }
1034
1035 }
1036
1037 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
1038 curTick(), m_version, "Coal",
1039 success ? "Done" : "SC_Failed", "", "",
1040 printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
1041 }
1042
1043 void
1044 GPUCoalescer::regStats()
1045 {
1046 RubyPort::regStats();
1047
1048 // These statistical variables are not for display.
1049 // The profiler will collate these across different
1050 // coalescers and display those collated statistics.
1051 m_outstandReqHist.init(10);
1052 m_latencyHist.init(10);
1053 m_missLatencyHist.init(10);
1054
1055 for (int i = 0; i < RubyRequestType_NUM; i++) {
1056 m_typeLatencyHist.push_back(new Stats::Histogram());
1057 m_typeLatencyHist[i]->init(10);
1058
1059 m_missTypeLatencyHist.push_back(new Stats::Histogram());
1060 m_missTypeLatencyHist[i]->init(10);
1061 }
1062
1063 for (int i = 0; i < MachineType_NUM; i++) {
1064 m_missMachLatencyHist.push_back(new Stats::Histogram());
1065 m_missMachLatencyHist[i]->init(10);
1066
1067 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
1068 m_IssueToInitialDelayHist[i]->init(10);
1069
1070 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
1071 m_InitialToForwardDelayHist[i]->init(10);
1072
1073 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
1074 m_ForwardToFirstResponseDelayHist[i]->init(10);
1075
1076 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
1077 m_FirstResponseToCompletionDelayHist[i]->init(10);
1078 }
1079
1080 for (int i = 0; i < RubyRequestType_NUM; i++) {
1081 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
1082
1083 for (int j = 0; j < MachineType_NUM; j++) {
1084 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
1085 m_missTypeMachLatencyHist[i][j]->init(10);
1086 }
1087 }
1088
1089 // GPU cache stats
1090 GPU_TCPLdHits
1091 .name(name() + ".gpu_tcp_ld_hits")
1092 .desc("loads that hit in the TCP")
1093 ;
1094 GPU_TCPLdTransfers
1095 .name(name() + ".gpu_tcp_ld_transfers")
1096 .desc("TCP to TCP load transfers")
1097 ;
1098 GPU_TCCLdHits
1099 .name(name() + ".gpu_tcc_ld_hits")
1100 .desc("loads that hit in the TCC")
1101 ;
1102 GPU_LdMiss
1103 .name(name() + ".gpu_ld_misses")
1104 .desc("loads that miss in the GPU")
1105 ;
1106
1107 GPU_TCPStHits
1108 .name(name() + ".gpu_tcp_st_hits")
1109 .desc("stores that hit in the TCP")
1110 ;
1111 GPU_TCPStTransfers
1112 .name(name() + ".gpu_tcp_st_transfers")
1113 .desc("TCP to TCP store transfers")
1114 ;
1115 GPU_TCCStHits
1116 .name(name() + ".gpu_tcc_st_hits")
1117 .desc("stores that hit in the TCC")
1118 ;
1119 GPU_StMiss
1120 .name(name() + ".gpu_st_misses")
1121 .desc("stores that miss in the GPU")
1122 ;
1123
1124 // CP cache stats
1125 CP_TCPLdHits
1126 .name(name() + ".cp_tcp_ld_hits")
1127 .desc("loads that hit in the TCP")
1128 ;
1129 CP_TCPLdTransfers
1130 .name(name() + ".cp_tcp_ld_transfers")
1131 .desc("TCP to TCP load transfers")
1132 ;
1133 CP_TCCLdHits
1134 .name(name() + ".cp_tcc_ld_hits")
1135 .desc("loads that hit in the TCC")
1136 ;
1137 CP_LdMiss
1138 .name(name() + ".cp_ld_misses")
1139 .desc("loads that miss in the GPU")
1140 ;
1141
1142 CP_TCPStHits
1143 .name(name() + ".cp_tcp_st_hits")
1144 .desc("stores that hit in the TCP")
1145 ;
1146 CP_TCPStTransfers
1147 .name(name() + ".cp_tcp_st_transfers")
1148 .desc("TCP to TCP store transfers")
1149 ;
1150 CP_TCCStHits
1151 .name(name() + ".cp_tcc_st_hits")
1152 .desc("stores that hit in the TCC")
1153 ;
1154 CP_StMiss
1155 .name(name() + ".cp_st_misses")
1156 .desc("stores that miss in the GPU")
1157 ;
1158 }