gpu-compute, mem-ruby, configs: Add GCN3 ISA support to GPU model
[gem5.git] / src / mem / ruby / system / GPUCoalescer.cc
1 /*
2 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * For use for simulation and test purposes only
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "base/logging.hh"
35 #include "base/str.hh"
36 #include "config/the_isa.hh"
37
38 #if THE_ISA == X86_ISA
39 #include "arch/x86/insts/microldstop.hh"
40
41 #endif // X86_ISA
42 #include "mem/ruby/system/GPUCoalescer.hh"
43
44 #include "cpu/testers/rubytest/RubyTester.hh"
45 #include "debug/GPUCoalescer.hh"
46 #include "debug/MemoryAccess.hh"
47 #include "debug/ProtocolTrace.hh"
48 #include "debug/RubyPort.hh"
49 #include "debug/RubyStats.hh"
50 #include "gpu-compute/shader.hh"
51 #include "mem/packet.hh"
52 #include "mem/ruby/common/DataBlock.hh"
53 #include "mem/ruby/common/SubBlock.hh"
54 #include "mem/ruby/network/MessageBuffer.hh"
55 #include "mem/ruby/profiler/Profiler.hh"
56 #include "mem/ruby/slicc_interface/AbstractController.hh"
57 #include "mem/ruby/slicc_interface/RubyRequest.hh"
58 #include "mem/ruby/structures/CacheMemory.hh"
59 #include "mem/ruby/system/RubySystem.hh"
60 #include "params/RubyGPUCoalescer.hh"
61
62 using namespace std;
63
64 UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
65 : coalescer(gc)
66 {
67 }
68
69 void
70 UncoalescedTable::insertPacket(PacketPtr pkt)
71 {
72 uint64_t seqNum = pkt->req->getReqInstSeqNum();
73
74 instMap[seqNum].push_back(pkt);
75 DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
76 pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
77 }
78
79 bool
80 UncoalescedTable::packetAvailable()
81 {
82 return !instMap.empty();
83 }
84
85 PerInstPackets*
86 UncoalescedTable::getInstPackets(int offset)
87 {
88 if (offset >= instMap.size()) {
89 return nullptr;
90 }
91
92 auto instMapIter = instMap.begin();
93 std::advance(instMapIter, offset);
94
95 return &(instMapIter->second);
96 }
97
98 void
99 UncoalescedTable::updateResources()
100 {
101 for (auto iter = instMap.begin(); iter != instMap.end(); ) {
102 if (iter->second.empty()) {
103 DPRINTF(GPUCoalescer, "Returning token seqNum %d\n", iter->first);
104 instMap.erase(iter++);
105 coalescer->getGMTokenPort().sendTokens(1);
106 } else {
107 ++iter;
108 }
109 }
110 }
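
// Editor's note (illustrative sketch, not part of the original file): the
// uncoalesced table maps an instruction's sequence number to the list of
// per-lane packets that still have to be coalesced, roughly:
//
//     std::map<uint64_t, PerInstPackets> instMap;   // seqNum -> packets
//
//     // insertPacket():     instMap[seqNum].push_back(pkt);
//     // updateResources():  once instMap[seqNum] is empty, erase the
//     //                     entry and return one token on the GM token
//     //                     port so the CU may send another instruction.
//
// The exact container type lives in GPUCoalescer.hh; the assumption here is
// only that one token is held per in-flight instruction.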
111
112 bool
113 UncoalescedTable::areRequestsDone(const uint64_t instSeqNum) {
114 // iterate the instructions held in UncoalescedTable to see whether there
115 // are more requests to issue; if yes, not yet done; otherwise, done
116 for (auto& inst : instMap) {
117 DPRINTF(GPUCoalescer, "instSeqNum= %d, pending packets=%d\n",
118 inst.first, inst.second.size());
119 if (inst.first == instSeqNum) { return false; }
120 }
121
122 return true;
123 }
124
125 void
126 UncoalescedTable::printRequestTable(std::stringstream& ss)
127 {
128 ss << "Listing pending packets from " << instMap.size() << " instructions";
129
130 for (auto& inst : instMap) {
131 ss << "\tAddr: " << printAddress(inst.first) << " with "
132 << inst.second.size() << " pending packets" << std::endl;
133 }
134 }
135
136 void
137 UncoalescedTable::checkDeadlock(Tick threshold)
138 {
139 Tick current_time = curTick();
140
141 for (auto &it : instMap) {
142 for (auto &pkt : it.second) {
143 if (current_time - pkt->req->time() > threshold) {
144 std::stringstream ss;
145 printRequestTable(ss);
146
147 panic("Possible Deadlock detected. Aborting!\n"
148 "version: %d request.paddr: 0x%x uncoalescedTable: %d "
149 "current time: %u issue_time: %d difference: %d\n"
150 "Request Tables:\n\n%s", coalescer->getId(),
151 pkt->getAddr(), instMap.size(), current_time,
152 pkt->req->time(), current_time - pkt->req->time(),
153 ss.str());
154 }
155 }
156 }
157 }
158
159 GPUCoalescer::GPUCoalescer(const Params *p)
160 : RubyPort(p),
161 issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
162 false, Event::Progress_Event_Pri),
163 uncoalescedTable(this),
164 deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
165 gmTokenPort(name() + ".gmTokenPort", this)
166 {
167 m_store_waiting_on_load_cycles = 0;
168 m_store_waiting_on_store_cycles = 0;
169 m_load_waiting_on_store_cycles = 0;
170 m_load_waiting_on_load_cycles = 0;
171
172 m_outstanding_count = 0;
173
174 coalescingWindow = p->max_coalesces_per_cycle;
175
176 m_max_outstanding_requests = 0;
177 m_instCache_ptr = nullptr;
178 m_dataCache_ptr = nullptr;
179
180 m_instCache_ptr = p->icache;
181 m_dataCache_ptr = p->dcache;
182 m_max_outstanding_requests = p->max_outstanding_requests;
183 m_deadlock_threshold = p->deadlock_threshold;
184
185 assert(m_max_outstanding_requests > 0);
186 assert(m_deadlock_threshold > 0);
187 assert(m_instCache_ptr);
188 assert(m_dataCache_ptr);
189
190 m_runningGarnetStandalone = p->garnet_standalone;
191 }
192
193 GPUCoalescer::~GPUCoalescer()
194 {
195 }
196
197 Port &
198 GPUCoalescer::getPort(const std::string &if_name, PortID idx)
199 {
200 if (if_name == "gmTokenPort") {
201 return gmTokenPort;
202 }
203
204 // delegate to RubyPort otherwise
205 return RubyPort::getPort(if_name, idx);
206 }
207
208 void
209 GPUCoalescer::wakeup()
210 {
211 Cycles current_time = curCycle();
212 for (auto& requestList : coalescedTable) {
213 for (auto& req : requestList.second) {
214 if (current_time - req->getIssueTime() > m_deadlock_threshold) {
215 std::stringstream ss;
216 printRequestTable(ss);
217 warn("GPUCoalescer %d Possible deadlock detected!\n%s\n",
218 m_version, ss.str());
219 panic("Aborting due to deadlock!\n");
220 }
221 }
222 }
223
224 Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
225 uncoalescedTable.checkDeadlock(tick_threshold);
226
227 if (m_outstanding_count > 0) {
228 schedule(deadlockCheckEvent,
229 m_deadlock_threshold * clockPeriod() +
230 curTick());
231 }
232 }
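
// Editor's note (sketch, not in the original source): the deadlock check
// mixes two time units. Coalesced requests record their issue time in
// Cycles and are compared directly against m_deadlock_threshold, while
// uncoalesced packets carry a Tick timestamp, so the threshold is first
// converted:
//
//     Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
//     // equivalent to m_deadlock_threshold * clockPeriod()
//
// As long as any request is outstanding, the event re-schedules itself one
// threshold into the future.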
233
234 void
235 GPUCoalescer::printRequestTable(std::stringstream& ss)
236 {
237 ss << "Printing out " << coalescedTable.size()
238 << " outstanding requests in the coalesced table\n";
239
240 for (auto& requestList : coalescedTable) {
241 for (auto& request : requestList.second) {
242 ss << "\tAddr: " << printAddress(requestList.first) << "\n"
243 << "\tInstruction sequence number: "
244 << request->getSeqNum() << "\n"
245 << "\t\tType: "
246 << RubyRequestType_to_string(request->getRubyType()) << "\n"
247 << "\t\tNumber of associated packets: "
248 << request->getPackets().size() << "\n"
249 << "\t\tIssue time: "
250 << request->getIssueTime() * clockPeriod() << "\n"
251 << "\t\tDifference from current tick: "
252 << (curCycle() - request->getIssueTime()) * clockPeriod();
253 }
254 }
255
256 // print out packets waiting to be issued in uncoalesced table
257 uncoalescedTable.printRequestTable(ss);
258 }
259
260 void
261 GPUCoalescer::resetStats()
262 {
263 m_latencyHist.reset();
264 m_missLatencyHist.reset();
265 for (int i = 0; i < RubyRequestType_NUM; i++) {
266 m_typeLatencyHist[i]->reset();
267 m_missTypeLatencyHist[i]->reset();
268 for (int j = 0; j < MachineType_NUM; j++) {
269 m_missTypeMachLatencyHist[i][j]->reset();
270 }
271 }
272
273 for (int i = 0; i < MachineType_NUM; i++) {
274 m_missMachLatencyHist[i]->reset();
275
276 m_IssueToInitialDelayHist[i]->reset();
277 m_InitialToForwardDelayHist[i]->reset();
278 m_ForwardToFirstResponseDelayHist[i]->reset();
279 m_FirstResponseToCompletionDelayHist[i]->reset();
280 }
281 }
282
283 void
284 GPUCoalescer::printProgress(ostream& out) const
285 {
286 }
287
288 // sets the kernelEndList
289 void
290 GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
291 {
292 // This situation should not arise in practice, but be careful here so
293 // that a duplicate wavefront id cannot silently turn into a simulator
294 // hang later on.
295 DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
296 assert(kernelEndList.count(wavefront_id) == 0);
297
298 kernelEndList[wavefront_id] = pkt;
299 DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
300 kernelEndList.size());
301 }
302
303 void
304 GPUCoalescer::writeCallback(Addr address, DataBlock& data)
305 {
306 writeCallback(address, MachineType_NULL, data);
307 }
308
309 void
310 GPUCoalescer::writeCallback(Addr address,
311 MachineType mach,
312 DataBlock& data)
313 {
314 writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
315 }
316
317 void
318 GPUCoalescer::writeCallback(Addr address,
319 MachineType mach,
320 DataBlock& data,
321 Cycles initialRequestTime,
322 Cycles forwardRequestTime,
323 Cycles firstResponseTime)
324 {
325 writeCallback(address, mach, data,
326 initialRequestTime, forwardRequestTime, firstResponseTime,
327 false);
328 }
329
330 void
331 GPUCoalescer::writeCallback(Addr address,
332 MachineType mach,
333 DataBlock& data,
334 Cycles initialRequestTime,
335 Cycles forwardRequestTime,
336 Cycles firstResponseTime,
337 bool isRegion)
338 {
339 assert(address == makeLineAddress(address));
340 assert(coalescedTable.count(address));
341
342 auto crequest = coalescedTable.at(address).front();
343
344 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
345 forwardRequestTime, firstResponseTime, isRegion);
346
347 // remove this crequest in coalescedTable
348 delete crequest;
349 coalescedTable.at(address).pop_front();
350
351 if (coalescedTable.at(address).empty()) {
352 coalescedTable.erase(address);
353 } else {
354 auto nextRequest = coalescedTable.at(address).front();
355 issueRequest(nextRequest);
356 }
357 }
358
359 void
360 GPUCoalescer::writeCompleteCallback(Addr address,
361 uint64_t instSeqNum,
362 MachineType mach)
363 {
364 DPRINTF(GPUCoalescer, "writeCompleteCallback for address 0x%x"
365 " instSeqNum = %d\n", address, instSeqNum);
366
367 assert(pendingWriteInsts.count(instSeqNum) == 1);
368 PendingWriteInst& inst = pendingWriteInsts[instSeqNum];
369
370 // check the uncoalescedTable to see whether all requests for the inst
371 // have been issued or not
372 bool reqsAllIssued = uncoalescedTable.areRequestsDone(instSeqNum);
373 DPRINTF(GPUCoalescer, "instSeqNum = %d, pendingStores=%d, "
374 "reqsAllIssued=%d\n", reqsAllIssued,
375 inst.getNumPendingStores()-1, reqsAllIssued);
376
377 if (inst.receiveWriteCompleteAck() && reqsAllIssued) {
378 // once the pending write instruction has received write completion
379 // callbacks for all of its issued Ruby requests, we can respond to
380 // the requesting CU with a single response packet.
381 inst.ackWriteCompletion(m_usingRubyTester);
382
383 DPRINTF(GPUCoalescer, "write inst %d completed at coalescer\n",
384 instSeqNum);
385 pendingWriteInsts.erase(instSeqNum);
386 }
387 }
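
// Editor's note (descriptive comment added for clarity): a single store
// instruction may be split into several Ruby requests. pendingWriteInsts
// tracks how many write-complete acks are still expected per seqNum, and
// ackWriteCompletion() is only sent back to the CU once every ack has
// arrived *and* the uncoalesced table holds no more packets for that
// instruction (areRequestsDone() above).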
388
389 void
390 GPUCoalescer::readCallback(Addr address, DataBlock& data)
391 {
392 readCallback(address, MachineType_NULL, data);
393 }
394
395 void
396 GPUCoalescer::readCallback(Addr address,
397 MachineType mach,
398 DataBlock& data)
399 {
400 readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
401 }
402
403 void
404 GPUCoalescer::readCallback(Addr address,
405 MachineType mach,
406 DataBlock& data,
407 Cycles initialRequestTime,
408 Cycles forwardRequestTime,
409 Cycles firstResponseTime)
410 {
411
412 readCallback(address, mach, data,
413 initialRequestTime, forwardRequestTime, firstResponseTime,
414 false);
415 }
416
417 void
418 GPUCoalescer::readCallback(Addr address,
419 MachineType mach,
420 DataBlock& data,
421 Cycles initialRequestTime,
422 Cycles forwardRequestTime,
423 Cycles firstResponseTime,
424 bool isRegion)
425 {
426 assert(address == makeLineAddress(address));
427 assert(coalescedTable.count(address));
428
429 auto crequest = coalescedTable.at(address).front();
430 fatal_if(crequest->getRubyType() != RubyRequestType_LD,
431 "readCallback received non-read type response\n");
432
433 // Iterate over the coalesced requests to respond to as many loads as
434 // possible until another request type is seen. Models MSHR for TCP.
435 while (crequest->getRubyType() == RubyRequestType_LD) {
436 hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
437 forwardRequestTime, firstResponseTime, isRegion);
438
439 delete crequest;
440 coalescedTable.at(address).pop_front();
441 if (coalescedTable.at(address).empty()) {
442 break;
443 }
444
445 crequest = coalescedTable.at(address).front();
446 }
447
448 if (coalescedTable.at(address).empty()) {
449 coalescedTable.erase(address);
450 } else {
451 auto nextRequest = coalescedTable.at(address).front();
452 issueRequest(nextRequest);
453 }
454 }
455
456 void
457 GPUCoalescer::hitCallback(CoalescedRequest* crequest,
458 MachineType mach,
459 DataBlock& data,
460 bool success,
461 Cycles initialRequestTime,
462 Cycles forwardRequestTime,
463 Cycles firstResponseTime,
464 bool isRegion)
465 {
466 PacketPtr pkt = crequest->getFirstPkt();
467 Addr request_address = pkt->getAddr();
468 Addr request_line_address M5_VAR_USED = makeLineAddress(request_address);
469
470 RubyRequestType type = crequest->getRubyType();
471
472 DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);
473
474 recordMissLatency(crequest, mach,
475 initialRequestTime,
476 forwardRequestTime,
477 firstResponseTime,
478 success, isRegion);
479 // update the data
480 //
481 // this must be done for every packet coalesced into this request
482 std::vector<PacketPtr> pktList = crequest->getPackets();
483 DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
484 pktList.size(), request_line_address);
485 for (auto& pkt : pktList) {
486 request_address = pkt->getAddr();
487 if (pkt->getPtr<uint8_t>()) {
488 if ((type == RubyRequestType_LD) ||
489 (type == RubyRequestType_ATOMIC) ||
490 (type == RubyRequestType_ATOMIC_RETURN) ||
491 (type == RubyRequestType_IFETCH) ||
492 (type == RubyRequestType_RMW_Read) ||
493 (type == RubyRequestType_Locked_RMW_Read) ||
494 (type == RubyRequestType_Load_Linked)) {
495 pkt->setData(
496 data.getData(getOffset(request_address), pkt->getSize()));
497 } else {
498 data.setData(pkt->getPtr<uint8_t>(),
499 getOffset(request_address), pkt->getSize());
500 }
501 } else {
502 DPRINTF(MemoryAccess,
503 "WARNING. Data not transfered from Ruby to M5 for type " \
504 "%s\n",
505 RubyRequestType_to_string(type));
506 }
507 }
508
509
510
511 m_outstanding_count--;
512 assert(m_outstanding_count >= 0);
513
514 completeHitCallback(pktList);
515 }
516
517 bool
518 GPUCoalescer::empty() const
519 {
520 return coalescedTable.empty();
521 }
522
523 RubyRequestType
524 GPUCoalescer::getRequestType(PacketPtr pkt)
525 {
526 RubyRequestType req_type = RubyRequestType_NULL;
527
528 // These types are either not supported or not used in GPU caches.
529 assert(!pkt->req->isLLSC());
530 assert(!pkt->req->isLockedRMW());
531 assert(!pkt->req->isInstFetch());
532 assert(!pkt->isFlush());
533
534 if (pkt->req->isAtomicReturn()) {
535 req_type = RubyRequestType_ATOMIC_RETURN;
536 } else if (pkt->req->isAtomicNoReturn()) {
537 req_type = RubyRequestType_ATOMIC_NO_RETURN;
538 } else if (pkt->isRead()) {
539 req_type = RubyRequestType_LD;
540 } else if (pkt->isWrite()) {
541 req_type = RubyRequestType_ST;
542 } else {
543 panic("Unsupported ruby packet type\n");
544 }
545
546 return req_type;
547 }
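
// Editor's summary (informational comment only): the mapping applied above
// is:
//
//     pkt->req->isAtomicReturn()    -> RubyRequestType_ATOMIC_RETURN
//     pkt->req->isAtomicNoReturn()  -> RubyRequestType_ATOMIC_NO_RETURN
//     pkt->isRead()                 -> RubyRequestType_LD
//     pkt->isWrite()                -> RubyRequestType_ST
//
// LL/SC, locked-RMW, instruction-fetch, and flush packets are rejected by
// the asserts because the GPU caches do not send them down this path.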
548
549 // Places an uncoalesced packet in uncoalescedTable. If the packet is a
550 // special type (MemFence, scoping, etc), it is issued immediately.
551 RequestStatus
552 GPUCoalescer::makeRequest(PacketPtr pkt)
553 {
554 // all packets must have valid instruction sequence numbers
555 assert(pkt->req->hasInstSeqNum());
556
557 if (pkt->cmd == MemCmd::MemSyncReq) {
558 // issue mem_sync requests immediately to the cache system without
559 // going through uncoalescedTable like normal LD/ST/Atomic requests
560 issueMemSyncRequest(pkt);
561 } else {
562 // otherwise, this must be either read or write command
563 assert(pkt->isRead() || pkt->isWrite());
564
565 // the pkt is temporarily stored in the uncoalesced table until
566 // it's picked for coalescing process later in this cycle or in a
567 // future cycle
568 uncoalescedTable.insertPacket(pkt);
569 DPRINTF(GPUCoalescer, "Put pkt with addr 0x%X to uncoalescedTable\n",
570 pkt->getAddr());
571
572 // we schedule an issue event here to process the uncoalesced table
573 // and try to issue Ruby request to cache system
574 if (!issueEvent.scheduled()) {
575 schedule(issueEvent, curTick());
576 }
577 }
578
579 // we always return RequestStatus_Issued in this coalescer because the
580 // coalescer's resources were checked earlier and the coalescer queues
581 // up aliased requests in its coalesced table
582 return RequestStatus_Issued;
583 }
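
// Editor's sketch (hypothetical caller, not part of this file): makeRequest
// relies on every packet carrying the issuing instruction's sequence
// number. A compute-unit port would do something along these lines before
// handing the packet to the coalescer:
//
//     // names such as gpuDynInst and seqNum() are illustrative only
//     RequestPtr req = std::make_shared<Request>(/* addr, size, flags,
//                                                   requestor id ... */);
//     req->setReqInstSeqNum(gpuDynInst->seqNum());
//     PacketPtr pkt = Packet::createWrite(req);
//     coalescer->makeRequest(pkt);    // always RequestStatus_Issued here
//
// The real packet construction lives in gpu-compute; this is only meant to
// show why the assert(pkt->req->hasInstSeqNum()) above holds.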
584
585 /**
586 * TODO: Figure out what to do with this code. This code may go away
587 * and/or be merged into the VIPER coalescer once the VIPER
588 * protocol is re-integrated with the GCN3 code.
589 */
590 /*
591 void
592 GPUCoalescer::issueRequest(CoalescedRequest* crequest)
593 {
594 PacketPtr pkt = crequest->getFirstPkt();
595
596 int proc_id = -1;
597 if (pkt != NULL && pkt->req->hasContextId()) {
598 proc_id = pkt->req->contextId();
599 }
600
601 // If valid, copy the pc to the ruby request
602 Addr pc = 0;
603 if (pkt->req->hasPC()) {
604 pc = pkt->req->getPC();
605 }
606
607 // At the moment setting scopes only counts
608 // for GPU spill space accesses,
609 // i.e., pkt->req->isStack().
610 // This scope is REPLACE since it
611 // does not need to be flushed at the end
612 // of a kernel. Private and local may need
613 // to be visible at the end of the kernel.
614 HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
615 HSAScope accessScope = reqScopeToHSAScope(pkt->req);
616
617 Addr line_addr = makeLineAddress(pkt->getAddr());
618
619 // Creating WriteMask that records written bytes
620 // and atomic operations. This enables partial writes
621 // and partial reads of those writes
622 DataBlock dataBlock;
623 dataBlock.clear();
624 uint32_t blockSize = RubySystem::getBlockSizeBytes();
625 std::vector<bool> accessMask(blockSize,false);
626 std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
627 uint32_t tableSize = crequest->getPackets().size();
628 for (int i = 0; i < tableSize; i++) {
629 PacketPtr tmpPkt = crequest->getPackets()[i];
630 uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
631 uint32_t tmpSize = tmpPkt->getSize();
632 if (tmpPkt->isAtomicOp()) {
633 std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
634 tmpPkt->getAtomicOp());
635 atomicOps.push_back(tmpAtomicOp);
636 } else if (tmpPkt->isWrite()) {
637 dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
638 tmpOffset, tmpSize);
639 }
640 for (int j = 0; j < tmpSize; j++) {
641 accessMask[tmpOffset + j] = true;
642 }
643 }
644 std::shared_ptr<RubyRequest> msg;
645 if (pkt->isAtomicOp()) {
646 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
647 pkt->getPtr<uint8_t>(),
648 pkt->getSize(), pc, crequest->getRubyType(),
649 RubyAccessMode_Supervisor, pkt,
650 PrefetchBit_No, proc_id, 100,
651 blockSize, accessMask,
652 dataBlock, atomicOps,
653 accessScope, accessSegment);
654 } else {
655 msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
656 pkt->getPtr<uint8_t>(),
657 pkt->getSize(), pc, crequest->getRubyType(),
658 RubyAccessMode_Supervisor, pkt,
659 PrefetchBit_No, proc_id, 100,
660 blockSize, accessMask,
661 dataBlock,
662 accessScope, accessSegment);
663 }
664 DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
665 curTick(), m_version, "Coal", "Begin", "", "",
666 printAddress(msg->getPhysicalAddress()),
667 RubyRequestType_to_string(crequest->getRubyType()));
668
669 fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
670 "there should not be any I-Fetch requests in the GPU Coalescer");
671
672 Tick latency = cyclesToTicks(
673 m_controller->mandatoryQueueLatency(crequest->getRubyType()));
674 assert(latency > 0);
675
676 if (!deadlockCheckEvent.scheduled()) {
677 schedule(deadlockCheckEvent,
678 m_deadlock_threshold * clockPeriod() +
679 curTick());
680 }
681
682 assert(m_mandatory_q_ptr);
683 m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
684 }*/
685
686 template <class KEY, class VALUE>
687 std::ostream &
688 operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
689 {
690 out << "[";
691 for (auto i = map.begin(); i != map.end(); ++i)
692 out << " " << i->first << "=" << i->second;
693 out << " ]";
694
695 return out;
696 }
697
698 void
699 GPUCoalescer::print(ostream& out) const
700 {
701 out << "[GPUCoalescer: " << m_version
702 << ", outstanding requests: " << m_outstanding_count
703 << "]";
704 }
705
706
707 bool
708 GPUCoalescer::coalescePacket(PacketPtr pkt)
709 {
710 uint64_t seqNum = pkt->req->getReqInstSeqNum();
711 Addr line_addr = makeLineAddress(pkt->getAddr());
712
713 // If the packet has the same line address as a request already in the
714 // coalescedTable and has the same sequence number, it can be coalesced.
715 if (coalescedTable.count(line_addr)) {
716 // Search for a previous coalesced request with the same seqNum.
717 auto& creqQueue = coalescedTable.at(line_addr);
718 auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
719 [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
720 );
721 if (citer != creqQueue.end()) {
722 (*citer)->insertPacket(pkt);
723 return true;
724 }
725 }
726
727 if (m_outstanding_count < m_max_outstanding_requests) {
728 // This is an "aliased" or new request. Create a RubyRequest and
729 // append it to the list of "targets" in the coalescing table.
730 DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
731 line_addr);
732
733 CoalescedRequest *creq = new CoalescedRequest(seqNum);
734 creq->insertPacket(pkt);
735 creq->setRubyType(getRequestType(pkt));
736 creq->setIssueTime(curCycle());
737
738 if (!coalescedTable.count(line_addr)) {
739 // If there is no outstanding request for this line address,
740 // create a new coalesced request and issue it immediately.
741 auto reqList = std::deque<CoalescedRequest*> { creq };
742 coalescedTable.insert(std::make_pair(line_addr, reqList));
743
744 DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
745 RubyRequestType_to_string(creq->getRubyType()), seqNum);
746 issueRequest(creq);
747 } else {
748 // The request is for a line address that is already outstanding
749 // but for a different instruction. Add it as a new request to be
750 // issued when the current outstanding request is completed.
751 coalescedTable.at(line_addr).push_back(creq);
752 DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
753 line_addr, seqNum);
754 }
755
756 // In both cases, requests are added to the coalescing table and will
757 // be counted as outstanding requests.
758 m_outstanding_count++;
759
760 // We track all issued or to-be-issued Ruby requests associated with
761 // write instructions. An instruction may have multiple Ruby
762 // requests.
763 if (pkt->cmd == MemCmd::WriteReq) {
764 DPRINTF(GPUCoalescer, "adding write inst %d at line 0x%x to"
765 " the pending write instruction list\n", seqNum,
766 line_addr);
767
768 RubyPort::SenderState* ss =
769 safe_cast<RubyPort::SenderState*>(pkt->senderState);
770
771 // we need to save this port because it will be used to call
772 // back the requesting CU when we receive write
773 // complete callbacks for all issued Ruby requests of this
774 // instruction.
775 RubyPort::MemSlavePort* mem_slave_port = ss->port;
776
777 GPUDynInstPtr gpuDynInst = nullptr;
778
779 if (!m_usingRubyTester) {
780 // If this coalescer is connected to a real CU, we need
781 // to save the corresponding gpu dynamic instruction.
782 // CU will use that instruction to decrement wait counters
783 // in the issuing wavefront.
784 // For Ruby tester, gpuDynInst == nullptr
785 ComputeUnit::DataPort::SenderState* cu_state =
786 safe_cast<ComputeUnit::DataPort::SenderState*>
787 (ss->predecessor);
788 gpuDynInst = cu_state->_gpuDynInst;
789 }
790
791 PendingWriteInst& inst = pendingWriteInsts[seqNum];
792 inst.addPendingReq(mem_slave_port, gpuDynInst, m_usingRubyTester);
793 }
794
795 return true;
796 }
797
798 // The maximum number of outstanding requests have been issued.
799 return false;
800 }
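
// Editor's note (illustrative, not from the original file): the coalesced
// table is keyed by cache-line address and holds, per line, a FIFO of
// CoalescedRequests, one per instruction touching that line, roughly:
//
//     std::map<Addr, std::deque<CoalescedRequest*>> coalescedTable;
//
//     // same line, same seqNum -> packet joins the existing request
//     // same line, new  seqNum -> new request queued behind the head
//     // new line               -> new request created and issued now
//
// Only the request at the head of each per-line deque is in flight; the
// next one is issued from the read/write/atomic callbacks once the head
// completes and is popped.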
801
802 void
803 GPUCoalescer::completeIssue()
804 {
805 // Iterate over the maximum number of instructions we can coalesce
806 // per cycle (coalescingWindow).
807 for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
808 PerInstPackets *pktList =
809 uncoalescedTable.getInstPackets(instIdx);
810
811 // getInstPackets will return nullptr if no instruction
812 // exists at the current offset.
813 if (!pktList) {
814 break;
815 } else {
816 // Since we have a pointer to the list of packets in the inst,
817 // erase them from the list if coalescing is successful and
818 // leave them in the list otherwise. This aggressively attempts
819 // to coalesce as many packets as possible from the current inst.
820 pktList->remove_if(
821 [&](PacketPtr pkt) { return coalescePacket(pkt); }
822 );
823 }
824 }
825
826 // Clean up any instructions in the uncoalesced table that have had
827 // all of their packets coalesced and return a token for that column.
828 uncoalescedTable.updateResources();
829
830 // check whether any kernel-end releases were issued this cycle
831 int len = newKernelEnds.size();
832 for (int i = 0; i < len; i++) {
833 kernelCallback(newKernelEnds[i]);
834 }
835 newKernelEnds.clear();
836 }
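
// Editor's note (descriptive comment added for clarity): completeIssue
// examines at most coalescingWindow (max_coalesces_per_cycle) instructions
// from the uncoalesced table per cycle. Using remove_if() with
// coalescePacket() as the predicate both attempts the coalesce and erases
// the packet from the per-instruction list on success; packets that fail
// (e.g. when m_outstanding_count has reached m_max_outstanding_requests)
// simply stay behind and are retried on a later issue event.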
837
838 void
839 GPUCoalescer::evictionCallback(Addr address)
840 {
841 ruby_eviction_callback(address);
842 }
843
844 void
845 GPUCoalescer::kernelCallback(int wavefront_id)
846 {
847 assert(kernelEndList.count(wavefront_id));
848
849 ruby_hit_callback(kernelEndList[wavefront_id]);
850
851 kernelEndList.erase(wavefront_id);
852 }
853
854 void
855 GPUCoalescer::atomicCallback(Addr address,
856 MachineType mach,
857 const DataBlock& data)
858 {
859 assert(address == makeLineAddress(address));
860 assert(coalescedTable.count(address));
861
862 auto crequest = coalescedTable.at(address).front();
863
864 fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
865 crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
866 crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
867 "atomicCallback saw non-atomic type response\n");
868
869 hitCallback(crequest, mach, (DataBlock&)data, true,
870 crequest->getIssueTime(), Cycles(0), Cycles(0), false);
871
872 delete crequest;
873 coalescedTable.at(address).pop_front();
874
875 if (coalescedTable.at(address).empty()) {
876 coalescedTable.erase(address);
877 } else {
878 auto nextRequest = coalescedTable.at(address).front();
879 issueRequest(nextRequest);
880 }
881 }
882
883 void
884 GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist)
885 {
886 for (auto& pkt : mylist) {
887 RubyPort::SenderState *ss =
888 safe_cast<RubyPort::SenderState *>(pkt->senderState);
889 MemSlavePort *port = ss->port;
890 assert(port != NULL);
891
892 pkt->senderState = ss->predecessor;
893 delete ss;
894 port->hitCallback(pkt);
895 trySendRetries();
896 }
897
898 // We schedule an event in the same tick as hitCallback (similar to
899 // makeRequest) rather than calling completeIssue directly to reduce
900 // function calls to complete issue. This can only happen if the max
901 // outstanding requests is less than the number of slots in the
902 // uncoalesced table and makeRequest is not called again.
903 if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
904 schedule(issueEvent, curTick());
905 }
906
907 testDrainComplete();
908 }
909
910 void
911 GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
912 MachineType mach,
913 Cycles initialRequestTime,
914 Cycles forwardRequestTime,
915 Cycles firstResponseTime,
916 bool success, bool isRegion)
917 {
918 }
919
920 void
921 GPUCoalescer::regStats()
922 {
923 RubyPort::regStats();
924
925 // These statistical variables are not for display.
926 // The profiler will collate these across different
927 // coalescers and display those collated statistics.
928 m_outstandReqHist.init(10);
929 m_latencyHist.init(10);
930 m_missLatencyHist.init(10);
931
932 for (int i = 0; i < RubyRequestType_NUM; i++) {
933 m_typeLatencyHist.push_back(new Stats::Histogram());
934 m_typeLatencyHist[i]->init(10);
935
936 m_missTypeLatencyHist.push_back(new Stats::Histogram());
937 m_missTypeLatencyHist[i]->init(10);
938 }
939
940 for (int i = 0; i < MachineType_NUM; i++) {
941 m_missMachLatencyHist.push_back(new Stats::Histogram());
942 m_missMachLatencyHist[i]->init(10);
943
944 m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
945 m_IssueToInitialDelayHist[i]->init(10);
946
947 m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
948 m_InitialToForwardDelayHist[i]->init(10);
949
950 m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
951 m_ForwardToFirstResponseDelayHist[i]->init(10);
952
953 m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
954 m_FirstResponseToCompletionDelayHist[i]->init(10);
955 }
956
957 for (int i = 0; i < RubyRequestType_NUM; i++) {
958 m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
959
960 for (int j = 0; j < MachineType_NUM; j++) {
961 m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
962 m_missTypeMachLatencyHist[i][j]->init(10);
963 }
964 }
965 }