/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Sooraj Puthoor
 */

#include "base/misc.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"

#endif // X86_ISA

#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(Request* req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(Request* req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

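// Usage sketch (illustrative, with a hypothetical scoped request): a store
// whose Request answers isWorkgroupScope() and isGroupSegment() maps to
// (HSAScope_WORKGROUP, HSASegment_GROUP) through the two helpers above:
//
//     HSAScope scope = reqScopeToHSAScope(pkt->req);
//     HSASegment segment = reqSegmentToHSASegment(pkt->req);
//
// An unscoped request falls through to HSAScope_UNSPECIFIED, while the
// segment helper fatals on an unrecognized segment.
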
GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    m_max_outstanding_requests = 0;
    m_deadlock_threshold = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_data_cache_hit_latency = p->dcache_hit_latency;

    m_usingNetworkTester = p->using_network_tester;
    assumingRfOCoherence = p->assume_rfo;
}

GPUCoalescer::~GPUCoalescer()
{
}

void
GPUCoalescer::wakeup()
{
    // Check for deadlock of any of the requests
    Cycles current_time = curCycle();

    // Check across all outstanding requests
    int total_outstanding = 0;

    RequestTable::iterator read = m_readRequestTable.begin();
    RequestTable::iterator read_end = m_readRequestTable.end();
    for (; read != read_end; ++read) {
        GPUCoalescerRequest* request = read->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_readRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_readRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    RequestTable::iterator write = m_writeRequestTable.begin();
    RequestTable::iterator write_end = m_writeRequestTable.end();
    for (; write != write_end; ++write) {
        GPUCoalescerRequest* request = write->second;
        if (current_time - request->issue_time < m_deadlock_threshold)
            continue;

        panic("Possible Deadlock detected. Aborting!\n"
              "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
              "current time: %u issue_time: %d difference: %d\n", m_version,
              request->pkt->getAddr(), m_writeRequestTable.size(),
              current_time * clockPeriod(), request->issue_time * clockPeriod(),
              (current_time - request->issue_time) * clockPeriod());
    }

    total_outstanding += m_writeRequestTable.size();
    total_outstanding += m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    if (m_outstanding_count > 0) {
        // If there are still outstanding requests, keep checking
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}

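// Timing sketch (hypothetical numbers): with m_deadlock_threshold = 50000
// cycles, a request issued at cycle 1000 trips the panic above only if
// wakeup() finds it still tabled at cycle 51000 or later; otherwise the
// deadlock check is simply rescheduled another m_deadlock_threshold cycles
// (scaled by clockPeriod()) into the future.
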
void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

RequestStatus
GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
{
    Addr line_addr = makeLineAddress(pkt->getAddr());

    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
        return RequestStatus_BufferFull;
    }

    if (m_controller->isBlocked(line_addr) &&
        request_type != RubyRequestType_Locked_RMW_Write) {
        return RequestStatus_Aliased;
    }

    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        // Check if there is any outstanding read request for the same
        // cache line.
        if (m_readRequestTable.count(line_addr) > 0) {
            m_store_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_writeRequestTable.count(line_addr) > 0) {
            // There is an outstanding write request for the cache line
            m_store_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }
    } else {
        // Check if there is any outstanding write request for the same
        // cache line.
        if (m_writeRequestTable.count(line_addr) > 0) {
            m_load_waiting_on_store_cycles++;
            return RequestStatus_Aliased;
        }

        if (m_readRequestTable.count(line_addr) > 0) {
            // There is an outstanding read request for the cache line
            m_load_waiting_on_load_cycles++;
            return RequestStatus_Aliased;
        }
    }

    return RequestStatus_Ready;
}

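// Decision sketch (hypothetical addresses): with a read to line 0x1000
// outstanding in m_readRequestTable, an incoming ST to the same line returns
// RequestStatus_Aliased and bumps m_store_waiting_on_load_cycles, whereas a
// LD to untouched line 0x1040 with mandatory-queue slots free returns
// RequestStatus_Ready.
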
// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear whether this can actually happen, but be careful
    // so that it does not become a simulator hang in the future.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

// Insert the request on the correct request table.  Return true if
// the entry was already present.
bool
GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
{
    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
           pkt->req->isLockedRMW() ||
           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));

    int total_outstanding M5_VAR_USED =
        m_writeRequestTable.size() + m_readRequestTable.size();

    assert(m_outstanding_count == total_outstanding);

    // See if we should schedule a deadlock check
    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
    }

    Addr line_addr = makeLineAddress(pkt->getAddr());
    if ((request_type == RubyRequestType_ST) ||
        (request_type == RubyRequestType_ATOMIC) ||
        (request_type == RubyRequestType_ATOMIC_RETURN) ||
        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
        (request_type == RubyRequestType_RMW_Read) ||
        (request_type == RubyRequestType_RMW_Write) ||
        (request_type == RubyRequestType_Load_Linked) ||
        (request_type == RubyRequestType_Store_Conditional) ||
        (request_type == RubyRequestType_Locked_RMW_Read) ||
        (request_type == RubyRequestType_Locked_RMW_Write) ||
        (request_type == RubyRequestType_FLUSH)) {

        pair<RequestTable::iterator, bool> r =
            m_writeRequestTable.insert(RequestTable::value_type(line_addr,
                                       (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting write request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    } else {
        pair<RequestTable::iterator, bool> r =
            m_readRequestTable.insert(RequestTable::value_type(line_addr,
                                      (GPUCoalescerRequest*) NULL));
        if (r.second) {
            RequestTable::iterator i = r.first;
            i->second = new GPUCoalescerRequest(pkt, request_type,
                                                curCycle());
            DPRINTF(GPUCoalescer,
                    "Inserting read request for paddr %#x for type %d\n",
                    pkt->req->getPaddr(), i->second->m_type);
            m_outstanding_count++;
        } else {
            return true;
        }
    }

    m_outstandReqHist.sample(m_outstanding_count);

    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
    assert(m_outstanding_count == total_outstanding);

    return false;
}

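// Note on the insert-with-NULL idiom above: the table insert only succeeds
// when the line address is absent, so r.second distinguishes a fresh entry
// (allocate a GPUCoalescerRequest and bump m_outstanding_count) from an
// aliased line that is already outstanding (return true to the caller).
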
void
GPUCoalescer::markRemoved()
{
    m_outstanding_count--;
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());
}

void
GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
{
    assert(m_outstanding_count ==
           m_writeRequestTable.size() + m_readRequestTable.size());

    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
    if ((srequest->m_type == RubyRequestType_ST) ||
        (srequest->m_type == RubyRequestType_RMW_Read) ||
        (srequest->m_type == RubyRequestType_RMW_Write) ||
        (srequest->m_type == RubyRequestType_Load_Linked) ||
        (srequest->m_type == RubyRequestType_Store_Conditional) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
        m_writeRequestTable.erase(line_addr);
    } else {
        m_readRequestTable.erase(line_addr);
    }

    markRemoved();
}

bool
GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
{
    // The success flag indicates whether the LLSC operation was successful.
    // LL ops will always succeed, but SC may fail if the cache line is no
    // longer locked.
    bool success = true;
    if (request->m_type == RubyRequestType_Store_Conditional) {
        if (!m_dataCache_ptr->isLocked(address, m_version)) {
            // For failed SC requests, indicate the failure to the cpu by
            // setting the extra data to zero.
            request->pkt->req->setExtraData(0);
            success = false;
        } else {
            // For successful SC requests, indicate the success to the cpu by
            // setting the extra data to one.
            request->pkt->req->setExtraData(1);
        }
        // Independent of success, all SC operations must clear the lock
        m_dataCache_ptr->clearLocked(address);
    } else if (request->m_type == RubyRequestType_Load_Linked) {
        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
        // previously locked cache lines?
        m_dataCache_ptr->setLocked(address, m_version);
    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
               (m_dataCache_ptr->isLocked(address, m_version))) {
        // Normal writes should clear the locked address
        m_dataCache_ptr->clearLocked(address);
    }
    return success;
}

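// LL/SC sketch (hypothetical sequence): a Load_Linked to address A locks
// (A, m_version); a later Store_Conditional to A succeeds (extra data = 1)
// only while that lock is still held. Any intervening normal write to A
// clears the lock, so the Store_Conditional would then fail (extra data = 0).
// Either way, the Store_Conditional clears the lock bit.
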
void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_ST) ||
           (request->m_type == RubyRequestType_ATOMIC) ||
           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
           (request->m_type == RubyRequestType_RMW_Read) ||
           (request->m_type == RubyRequestType_RMW_Write) ||
           (request->m_type == RubyRequestType_Load_Linked) ||
           (request->m_type == RubyRequestType_Store_Conditional) ||
           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
           (request->m_type == RubyRequestType_FLUSH));

    // For Alpha, properly handle LL, SC, and write requests with respect to
    // locked cache blocks.
    //
    // Not valid for Network_test protocol
    bool success = true;
    if (!m_usingNetworkTester)
        success = handleLlsc(address, request);

    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
        m_controller->unblock(address);
    }

    hitCallback(request, mach, data, success,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{
    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(m_readRequestTable.count(makeLineAddress(address)));

    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;

    m_readRequestTable.erase(i);
    markRemoved();

    assert((request->m_type == RubyRequestType_LD) ||
           (request->m_type == RubyRequestType_IFETCH));

    hitCallback(request, mach, data, true,
                request->issue_time, forwardRequestTime, firstResponseTime,
                isRegion);
}

void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = srequest->m_type;

    // Set this cache entry to the most recently used
    if (type == RubyRequestType_IFETCH) {
        if (m_instCache_ptr->isTagPresent(request_line_address))
            m_instCache_ptr->setMRU(request_line_address);
    } else {
        if (m_dataCache_ptr->isTagPresent(request_line_address))
            m_dataCache_ptr->setMRU(request_line_address);
    }

    recordMissLatency(srequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // MUST DO THIS FOR EACH REQUEST IN THE COALESCER
    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
        assert(type ==
               reqCoalescer[request_line_address][i].second[PrimaryType]);
        request_address = pkt->getAddr();
        request_line_address = makeLineAddress(pkt->getAddr());
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                memcpy(pkt->getPtr<uint8_t>(),
                       data.getData(getOffset(request_address),
                                    pkt->getSize()),
                       pkt->getSize());
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING.  Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState* requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>
                    (requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

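// Data-movement sketch: for load-like types (LD, IFETCH, ATOMIC*, RMW_Read,
// Locked_RMW_Read, Load_Linked) the loop above copies bytes out of the Ruby
// DataBlock into each coalesced packet; for store-like types it copies
// packet bytes into the block. Offsets are line-relative, e.g. a 4-byte
// packet at hypothetical address 0x1008 on line 0x1000 touches block bytes
// [8, 12).
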
bool
GPUCoalescer::empty() const
{
    return m_writeRequestTable.empty() && m_readRequestTable.empty();
}

// Analyzes the packet to see if this request can be coalesced.
// If request can be coalesced, this request is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued;
// If this is the first request to a cacheline, request is added to both
// newRequests queue and to the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a Kernel Begin; leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a Kernel End; leave handling to
            // virtual xCoalescer::makeRequest.
            // If we are here then we didn't call
            // a virtual version of this function,
            // so we will also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    // If number of outstanding requests greater than the max allowed,
    // return RequestStatus_BufferFull. This logic can be extended to
    // support proper backpressure.
    if (m_outstanding_count >= m_max_outstanding_requests) {
        return RequestStatus_BufferFull;
    }

    RubyRequestType primary_type = RubyRequestType_NULL;
    RubyRequestType secondary_type = RubyRequestType_NULL;

    if (pkt->isLLSC()) {
        // Alpha LL/SC instructions need to be handled carefully by the cache
        // coherence protocol to ensure they follow the proper semantics. In
        // particular, by identifying the operations as atomic, the protocol
        // should understand that migratory sharing optimizations should not
        // be performed (i.e. a load between the LL and SC should not steal
        // away exclusive permission).
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Store_Conditional;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Load_Linked;
        }
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->req->isLockedRMW()) {
        // x86 locked instructions are translated to store cache coherence
        // requests because these requests should always be treated as read
        // exclusive operations and should leverage any migratory sharing
        // optimization built into the protocol.
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Locked_RMW_Write;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Locked_RMW_Read;
        }
        secondary_type = RubyRequestType_ST;
    } else if (pkt->isAtomicOp()) {
        // GPU Atomic Operation
        primary_type = RubyRequestType_ATOMIC;
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->isRead()) {
        if (pkt->req->isInstFetch()) {
            primary_type = secondary_type = RubyRequestType_IFETCH;
        } else {
#if THE_ISA == X86_ISA
            uint32_t flags = pkt->req->getFlags();
            bool storeCheck = flags &
                              (TheISA::StoreCheck << TheISA::FlagShift);
#else
            bool storeCheck = false;
#endif // X86_ISA
            if (storeCheck) {
                primary_type = RubyRequestType_RMW_Read;
                secondary_type = RubyRequestType_ST;
            } else {
                primary_type = secondary_type = RubyRequestType_LD;
            }
        }
    } else if (pkt->isWrite()) {
        // Note: M5 packets do not differentiate ST from RMW_Write
        primary_type = secondary_type = RubyRequestType_ST;
    } else if (pkt->isFlush()) {
        primary_type = secondary_type = RubyRequestType_FLUSH;
    } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
        if (assumingRfOCoherence) {
            // If we reached here, this request must be a memFence
            // and the protocol implements RfO, so the coalescer can
            // assume sequential consistency and schedule the callback.
            //
            // Currently the code implements fence callbacks
            // by reusing the mechanism for kernel completions.
            // This should be fixed.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        } else {
            // If not RfO, return issued here and let the child coalescer
            // take care of it.
            return RequestStatus_Issued;
        }
    } else {
        panic("Unsupported ruby packet type\n");
    }

    // Check if there is any pending request to this cache line from
    // previous cycles.
    // If there is a pending request, return aliased. Since coalescing
    // across time is not permitted, aliased requests are not coalesced.
    // If a request for this address has already been issued, we must block.
    RequestStatus status = getRequestStatus(pkt, primary_type);
    if (status != RequestStatus_Ready)
        return status;

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Check if this request can be coalesced with previous
    // requests from this cycle.
    if (!reqCoalescer.count(line_addr)) {
        // This is the first access to this cache line.
        // A new request to the memory subsystem has to be
        // made in the next cycle for this cache line, so
        // add this line addr to the "newRequests" queue
        newRequests.push_back(line_addr);

    // There was a request to this cache line in this cycle,
    // let us see if we can coalesce this request with the previous
    // requests from this cycle
    } else if (primary_type !=
               reqCoalescer[line_addr][0].second[PrimaryType]) {
        // can't coalesce loads, stores and atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->isLockedRMW() ||
               reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
        // can't coalesce locked accesses, but can coalesce atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
               pkt->req->contextId() !=
               reqCoalescer[line_addr][0].first->req->contextId()) {
        // can't coalesce releases from different wavefronts
        return RequestStatus_Aliased;
    }

    // in addition to the packet, we need to save both request types
    reqCoalescer[line_addr].push_back(
            RequestDesc(pkt, std::vector<RubyRequestType>()) );
    reqCoalescer[line_addr].back().second.push_back(primary_type);
    reqCoalescer[line_addr].back().second.push_back(secondary_type);
    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}

void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{
    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment setting scopes only counts
    // for GPU spill space accesses,
    // which is pkt->req->isStack().
    // This scope is REPLACE since it
    // does not need to be flushed at the end
    // of a kernel; private and local may need
    // to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = reqCoalescer[line_addr].size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor*> tmpAtomicOp(tmpOffset,
                                                         tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }

    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock, atomicOps,
                              accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock,
                              accessScope, accessSegment);
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));

    fatal_if(secondary_type == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    // Send the message to the cache controller
    fatal_if(m_data_cache_hit_latency == 0,
             "should not have a latency of zero");

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}

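// WriteMask sketch (hypothetical coalesced packets): a 4-byte store at line
// offset 0 and an 8-byte store at offset 16 set accessMask[0..3] and
// accessMask[16..23] and copy their payloads into dataBlock, while an atomic
// at offset 32 contributes (32, AtomicOpFunctor*) to atomicOps; the protocol
// then only interprets the masked bytes of the line.
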
template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << ", read request table: " << m_readRequestTable
        << ", write request table: " << m_writeRequestTable
        << "]";
}

// This can be called from setState whenever coherence permissions are
// upgraded; when invoked, coherence violations will be checked for the
// given block.
void
GPUCoalescer::checkCoherence(Addr addr)
{
#ifdef CHECK_COHERENCE
    m_ruby_system->checkGlobalCoherenceInvariant(addr);
#endif
}

void
GPUCoalescer::recordRequestType(SequencerRequestType requestType)
{
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
    : Event(Progress_Event_Pri), seq(_seq)
{
}

void
GPUCoalescer::completeIssue()
{
    // newRequests has the cacheline addresses of all the
    // requests which need to be issued to the memory subsystem
    // in this cycle
    int len = newRequests.size();
    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
    for (int i = 0; i < len; ++i) {
        // Get the requests from reqCoalescer table. Get only the
        // first request for each cacheline, the remaining requests
        // can be coalesced with the first request. So, only
        // one request is issued per cacheline.
        RequestDesc info = reqCoalescer[newRequests[i]][0];
        PacketPtr pkt = info.first;
        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
                i, pkt->req->getPaddr());
        // Insert this request to the read/writeRequestTables. These tables
        // are used to track aliased requests in makeRequest subroutine
        bool found = insertRequest(pkt, info.second[PrimaryType]);

        if (found) {
            panic("GPUCoalescer::makeRequest should never be called if the "
                  "request is already outstanding\n");
        }

        // Issue request to ruby subsystem
        issueRequest(pkt, info.second[SecondaryType]);
    }
    newRequests.clear();

    // have Kernel End releases been issued this cycle
    len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

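// Issue sketch: if a cycle coalesced accesses to hypothetical lines 0x1000
// and 0x1040, completeIssue() walks newRequests, moves the first packet of
// each line into the read/write request tables, issues exactly one Ruby
// request per line, and finally drains any kernel-end callbacks queued in
// newKernelEnds.
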
void
GPUCoalescer::IssueEvent::process()
{
    seq->completeIssue();
}

const char *
GPUCoalescer::IssueEvent::description() const
{
    return "Issue coalesced request";
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));

    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
    assert(m_writeRequestTable.count(makeLineAddress(address)));

    RequestTable::iterator i = m_writeRequestTable.find(address);
    assert(i != m_writeRequestTable.end());
    GPUCoalescerRequest* srequest = i->second;

    m_writeRequestTable.erase(i);
    markRemoved();

    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));

    // Atomics don't write to cache, so there is no MRU update...

    recordMissLatency(srequest, mach,
                      srequest->issue_time, Cycles(0), Cycles(0), true, false);

    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(pkt->getAddr());

    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
        assert(srequest->m_type ==
               reqCoalescer[request_line_address][i].second[PrimaryType]);
        request_address = (pkt->getAddr());
        request_line_address = makeLineAddress(request_address);
        if (pkt->getPtr<uint8_t>() &&
            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
            /* atomics are done in memory, and return the data *before* the
               atomic op... */
            memcpy(pkt->getPtr<uint8_t>(),
                   data.getData(getOffset(request_address),
                                pkt->getSize()),
                   pkt->getSize());
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING.  Data not transferred from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(srequest->m_type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data.  The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState* requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>
                    (requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));

    completeHitCallback(mylist, len);
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
{
    for (int i = 0; i < len; ++i) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        mylist[i]->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(mylist[i]);
        trySendRetries();
    }

    testDrainComplete();
}

PacketPtr
GPUCoalescer::mapAddrToPkt(Addr address)
{
    RequestTable::iterator i = m_readRequestTable.find(address);
    assert(i != m_readRequestTable.end());
    GPUCoalescerRequest* request = i->second;
    return request->pkt;
}

void
GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = srequest->m_type;
    Cycles issued_time = srequest->issue_time;
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }
    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(srequest->pkt->getAddr()), total_lat);
}

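// Latency sketch (hypothetical cycles): issued_time = 100, initialRequestTime
// = 110, forwardRequestTime = 130, firstResponseTime = 160 and completion at
// 170 give total_lat = 70, decomposed into the four per-machine delay
// histograms as 10 + 20 + 30 + 10. Out-of-order timestamps skip the
// decomposition but still sample total_lat.
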
void
GPUCoalescer::regStats()
{
    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}