src/gpu-compute/gpu_tlb.cc

   1 /*
   2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
   3  * All rights reserved.
   4  *
   5  * For use for simulation and test purposes only
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions are met:
   9  *
  10  * 1. Redistributions of source code must retain the above copyright notice,
  11  * this list of conditions and the following disclaimer.
  12  *
  13  * 2. Redistributions in binary form must reproduce the above copyright notice,
  14  * this list of conditions and the following disclaimer in the documentation
  15  * and/or other materials provided with the distribution.
  16  *
  17  * 3. Neither the name of the copyright holder nor the names of its contributors
  18  * may be used to endorse or promote products derived from this software
  19  * without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Author: Lisa Hsu
  34  */
  35
  36 #include "gpu-compute/gpu_tlb.hh"
  37
  38 #include <cmath>
  39 #include <cstring>
  40
  41 #include "arch/x86/faults.hh"
  42 #include "arch/x86/insts/microldstop.hh"
  43 #include "arch/x86/pagetable.hh"
  44 #include "arch/x86/pagetable_walker.hh"
  45 #include "arch/x86/regs/misc.hh"
  46 #include "arch/x86/x86_traits.hh"
  47 #include "base/bitfield.hh"
  48 #include "base/output.hh"
  49 #include "base/trace.hh"
  50 #include "cpu/base.hh"
  51 #include "cpu/thread_context.hh"
  52 #include "debug/GPUPrefetch.hh"
  53 #include "debug/GPUTLB.hh"
  54 #include "mem/packet_access.hh"
  55 #include "mem/page_table.hh"
  56 #include "mem/request.hh"
  57 #include "sim/process.hh"
  58
  59 namespace X86ISA
  60 {
  61
  62     GpuTLB::GpuTLB(const Params *p)
  63         : MemObject(p), configAddress(0), size(p->size),
  64           cleanupEvent([this]{ cleanup(); }, name(), false,
  65                        Event::Maximum_Pri),
  66           exitEvent([this]{ exitCallback(); }, name())
  67     {
  68         assoc = p->assoc;
  69         assert(assoc <= size);
  70         numSets = size/assoc;
  71         allocationPolicy = p->allocationPolicy;
  72         hasMemSidePort = false;
  73         accessDistance = p->accessDistance;
  74         clock = p->clk_domain->clockPeriod();
  75
  76         tlb.assign(size, GpuTlbEntry());
  77
  78         freeList.resize(numSets);
  79         entryList.resize(numSets);
  80
  81         for (int set = 0; set < numSets; ++set) {
  82             for (int way = 0; way < assoc; ++way) {
  83                 int x = set * assoc + way;
  84                 freeList[set].push_back(&tlb.at(x));
  85             }
  86         }
  87
  88         FA = (size == assoc);
  89
  90         /**
  91          * @warning: the set-associative version assumes you have a
  92          * fixed page size of 4KB.
  93          * If the page size is greather than 4KB (as defined in the
  94          * TheISA::PageBytes), then there are various issues w/ the current
  95          * implementation (you'd have the same 8KB page being replicated in
  96          * different sets etc)
  97          */
  98         setMask = numSets - 1;
  99
 100     #if 0
 101         // GpuTLB doesn't yet support full system
 102         walker = p->walker;
 103         walker->setTLB(this);
 104     #endif
 105
 106         maxCoalescedReqs = p->maxOutstandingReqs;
 107
 108         // Do not allow maxCoalescedReqs to be more than the TLB associativity
 109         if (maxCoalescedReqs > assoc) {
 110             maxCoalescedReqs = assoc;
 111             cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
 112         }
 113
 114         outstandingReqs = 0;
 115         hitLatency = p->hitLatency;
 116         missLatency1 = p->missLatency1;
 117         missLatency2 = p->missLatency2;
 118
 119         // create the slave ports based on the number of connected ports
 120         for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
 121             cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
 122                                   name(), i), this, i));
 123         }
 124
 125         // create the master ports based on the number of connected ports
 126         for (size_t i = 0; i < p->port_master_connection_count; ++i) {
 127             memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
 128                                   name(), i), this, i));
 129         }
 130     }
 131
 132     // fixme: this is never called?
 133     GpuTLB::~GpuTLB()
 134     {
 135         // make sure all the hash-maps are empty
 136         assert(translationReturnEvent.empty());
 137     }
 138
 139     BaseSlavePort&
 140     GpuTLB::getSlavePort(const std::string &if_name, PortID idx)
 141     {
 142         if (if_name == "slave") {
 143             if (idx >= static_cast<PortID>(cpuSidePort.size())) {
 144                 panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
 145             }
 146
 147             return *cpuSidePort[idx];
 148         } else {
 149             panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
 150         }
 151     }
 152
 153     BaseMasterPort&
 154     GpuTLB::getMasterPort(const std::string &if_name, PortID idx)
 155     {
 156         if (if_name == "master") {
 157             if (idx >= static_cast<PortID>(memSidePort.size())) {
 158                 panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
 159             }
 160
 161             hasMemSidePort = true;
 162
 163             return *memSidePort[idx];
 164         } else {
 165             panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
 166         }
 167     }
 168
 169     GpuTlbEntry*
 170     GpuTLB::insert(Addr vpn, GpuTlbEntry &entry)
 171     {
 172         GpuTlbEntry *newEntry = nullptr;
 173
 174         /**
 175          * vpn holds the virtual page address
 176          * The least significant bits are simply masked
 177          */
 178         int set = (vpn >> TheISA::PageShift) & setMask;
 179
 180         if (!freeList[set].empty()) {
 181             newEntry = freeList[set].front();
 182             freeList[set].pop_front();
 183         } else {
 184             newEntry = entryList[set].back();
 185             entryList[set].pop_back();
 186         }
 187
 188         *newEntry = entry;
 189         newEntry->vaddr = vpn;
 190         entryList[set].push_front(newEntry);
 191
 192         return newEntry;
 193     }
 194
 195     GpuTLB::EntryList::iterator
 196     GpuTLB::lookupIt(Addr va, bool update_lru)
 197     {
 198         int set = (va >> TheISA::PageShift) & setMask;
 199
 200         if (FA) {
 201             assert(!set);
 202         }
 203
 204         auto entry = entryList[set].begin();
 205         for (; entry != entryList[set].end(); ++entry) {
 206             int page_size = (*entry)->size();
 207
 208             if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
 209                 DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
 210                         "with size %#x.\n", va, (*entry)->vaddr, page_size);
 211
 212                 if (update_lru) {
 213                     entryList[set].push_front(*entry);
 214                     entryList[set].erase(entry);
 215                     entry = entryList[set].begin();
 216                 }
 217
 218                 break;
 219             }
 220         }
 221
 222         return entry;
 223     }
 224
 225     GpuTlbEntry*
 226     GpuTLB::lookup(Addr va, bool update_lru)
 227     {
 228         int set = (va >> TheISA::PageShift) & setMask;
 229
 230         auto entry = lookupIt(va, update_lru);
 231
 232         if (entry == entryList[set].end())
 233             return nullptr;
 234         else
 235             return *entry;
 236     }
 237
 238     void
 239     GpuTLB::invalidateAll()
 240     {
 241         DPRINTF(GPUTLB, "Invalidating all entries.\n");
 242
 243         for (int i = 0; i < numSets; ++i) {
 244             while (!entryList[i].empty()) {
 245                 GpuTlbEntry *entry = entryList[i].front();
 246                 entryList[i].pop_front();
 247                 freeList[i].push_back(entry);
 248             }
 249         }
 250     }
 251
 252     void
 253     GpuTLB::setConfigAddress(uint32_t addr)
 254     {
 255         configAddress = addr;
 256     }
 257
 258     void
 259     GpuTLB::invalidateNonGlobal()
 260     {
 261         DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
 262
 263         for (int i = 0; i < numSets; ++i) {
 264             for (auto entryIt = entryList[i].begin();
 265                  entryIt != entryList[i].end();) {
 266                 if (!(*entryIt)->global) {
 267                     freeList[i].push_back(*entryIt);
 268                     entryList[i].erase(entryIt++);
 269                 } else {
 270                     ++entryIt;
 271                 }
 272             }
 273         }
 274     }
 275
 276     void
 277     GpuTLB::demapPage(Addr va, uint64_t asn)
 278     {
 279
 280         int set = (va >> TheISA::PageShift) & setMask;
 281         auto entry = lookupIt(va, false);
 282
 283         if (entry != entryList[set].end()) {
 284             freeList[set].push_back(*entry);
 285             entryList[set].erase(entry);
 286         }
 287     }
 288
 289     Fault
 290     GpuTLB::translateInt(RequestPtr req, ThreadContext *tc)
 291     {
 292         DPRINTF(GPUTLB, "Addresses references internal memory.\n");
 293         Addr vaddr = req->getVaddr();
 294         Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
 295
 296         if (prefix == IntAddrPrefixCPUID) {
 297             panic("CPUID memory space not yet implemented!\n");
 298         } else if (prefix == IntAddrPrefixMSR) {
 299             vaddr = vaddr >> 3;
 300             req->setFlags(Request::MMAPPED_IPR);
 301             Addr regNum = 0;
 302
 303             switch (vaddr & ~IntAddrPrefixMask) {
 304               case 0x10:
 305                 regNum = MISCREG_TSC;
 306                 break;
 307               case 0x1B:
 308                 regNum = MISCREG_APIC_BASE;
 309                 break;
 310               case 0xFE:
 311                 regNum = MISCREG_MTRRCAP;
 312                 break;
 313               case 0x174:
 314                 regNum = MISCREG_SYSENTER_CS;
 315                 break;
 316               case 0x175:
 317                 regNum = MISCREG_SYSENTER_ESP;
 318                 break;
 319               case 0x176:
 320                 regNum = MISCREG_SYSENTER_EIP;
 321                 break;
 322               case 0x179:
 323                 regNum = MISCREG_MCG_CAP;
 324                 break;
 325               case 0x17A:
 326                 regNum = MISCREG_MCG_STATUS;
 327                 break;
 328               case 0x17B:
 329                 regNum = MISCREG_MCG_CTL;
 330                 break;
 331               case 0x1D9:
 332                 regNum = MISCREG_DEBUG_CTL_MSR;
 333                 break;
 334               case 0x1DB:
 335                 regNum = MISCREG_LAST_BRANCH_FROM_IP;
 336                 break;
 337               case 0x1DC:
 338                 regNum = MISCREG_LAST_BRANCH_TO_IP;
 339                 break;
 340               case 0x1DD:
 341                 regNum = MISCREG_LAST_EXCEPTION_FROM_IP;
 342                 break;
 343               case 0x1DE:
 344                 regNum = MISCREG_LAST_EXCEPTION_TO_IP;
 345                 break;
 346               case 0x200:
 347                 regNum = MISCREG_MTRR_PHYS_BASE_0;
 348                 break;
 349               case 0x201:
 350                 regNum = MISCREG_MTRR_PHYS_MASK_0;
 351                 break;
 352               case 0x202:
 353                 regNum = MISCREG_MTRR_PHYS_BASE_1;
 354                 break;
 355               case 0x203:
 356                 regNum = MISCREG_MTRR_PHYS_MASK_1;
 357                 break;
 358               case 0x204:
 359                 regNum = MISCREG_MTRR_PHYS_BASE_2;
 360                 break;
 361               case 0x205:
 362                 regNum = MISCREG_MTRR_PHYS_MASK_2;
 363                 break;
 364               case 0x206:
 365                 regNum = MISCREG_MTRR_PHYS_BASE_3;
 366                 break;
 367               case 0x207:
 368                 regNum = MISCREG_MTRR_PHYS_MASK_3;
 369                 break;
 370               case 0x208:
 371                 regNum = MISCREG_MTRR_PHYS_BASE_4;
 372                 break;
 373               case 0x209:
 374                 regNum = MISCREG_MTRR_PHYS_MASK_4;
 375                 break;
 376               case 0x20A:
 377                 regNum = MISCREG_MTRR_PHYS_BASE_5;
 378                 break;
 379               case 0x20B:
 380                 regNum = MISCREG_MTRR_PHYS_MASK_5;
 381                 break;
 382               case 0x20C:
 383                 regNum = MISCREG_MTRR_PHYS_BASE_6;
 384                 break;
 385               case 0x20D:
 386                 regNum = MISCREG_MTRR_PHYS_MASK_6;
 387                 break;
 388               case 0x20E:
 389                 regNum = MISCREG_MTRR_PHYS_BASE_7;
 390                 break;
 391               case 0x20F:
 392                 regNum = MISCREG_MTRR_PHYS_MASK_7;
 393                 break;
 394               case 0x250:
 395                 regNum = MISCREG_MTRR_FIX_64K_00000;
 396                 break;
 397               case 0x258:
 398                 regNum = MISCREG_MTRR_FIX_16K_80000;
 399                 break;
 400               case 0x259:
 401                 regNum = MISCREG_MTRR_FIX_16K_A0000;
 402                 break;
 403               case 0x268:
 404                 regNum = MISCREG_MTRR_FIX_4K_C0000;
 405                 break;
 406               case 0x269:
 407                 regNum = MISCREG_MTRR_FIX_4K_C8000;
 408                 break;
 409               case 0x26A:
 410                 regNum = MISCREG_MTRR_FIX_4K_D0000;
 411                 break;
 412               case 0x26B:
 413                 regNum = MISCREG_MTRR_FIX_4K_D8000;
 414                 break;
 415               case 0x26C:
 416                 regNum = MISCREG_MTRR_FIX_4K_E0000;
 417                 break;
 418               case 0x26D:
 419                 regNum = MISCREG_MTRR_FIX_4K_E8000;
 420                 break;
 421               case 0x26E:
 422                 regNum = MISCREG_MTRR_FIX_4K_F0000;
 423                 break;
 424               case 0x26F:
 425                 regNum = MISCREG_MTRR_FIX_4K_F8000;
 426                 break;
 427               case 0x277:
 428                 regNum = MISCREG_PAT;
 429                 break;
 430               case 0x2FF:
 431                 regNum = MISCREG_DEF_TYPE;
 432                 break;
 433               case 0x400:
 434                 regNum = MISCREG_MC0_CTL;
 435                 break;
 436               case 0x404:
 437                 regNum = MISCREG_MC1_CTL;
 438                 break;
 439               case 0x408:
 440                 regNum = MISCREG_MC2_CTL;
 441                 break;
 442               case 0x40C:
 443                 regNum = MISCREG_MC3_CTL;
 444                 break;
 445               case 0x410:
 446                 regNum = MISCREG_MC4_CTL;
 447                 break;
 448               case 0x414:
 449                 regNum = MISCREG_MC5_CTL;
 450                 break;
 451               case 0x418:
 452                 regNum = MISCREG_MC6_CTL;
 453                 break;
 454               case 0x41C:
 455                 regNum = MISCREG_MC7_CTL;
 456                 break;
 457               case 0x401:
 458                 regNum = MISCREG_MC0_STATUS;
 459                 break;
 460               case 0x405:
 461                 regNum = MISCREG_MC1_STATUS;
 462                 break;
 463               case 0x409:
 464                 regNum = MISCREG_MC2_STATUS;
 465                 break;
 466               case 0x40D:
 467                 regNum = MISCREG_MC3_STATUS;
 468                 break;
 469               case 0x411:
 470                 regNum = MISCREG_MC4_STATUS;
 471                 break;
 472               case 0x415:
 473                 regNum = MISCREG_MC5_STATUS;
 474                 break;
 475               case 0x419:
 476                 regNum = MISCREG_MC6_STATUS;
 477                 break;
 478               case 0x41D:
 479                 regNum = MISCREG_MC7_STATUS;
 480                 break;
 481               case 0x402:
 482                 regNum = MISCREG_MC0_ADDR;
 483                 break;
 484               case 0x406:
 485                 regNum = MISCREG_MC1_ADDR;
 486                 break;
 487               case 0x40A:
 488                 regNum = MISCREG_MC2_ADDR;
 489                 break;
 490               case 0x40E:
 491                 regNum = MISCREG_MC3_ADDR;
 492                 break;
 493               case 0x412:
 494                 regNum = MISCREG_MC4_ADDR;
 495                 break;
 496               case 0x416:
 497                 regNum = MISCREG_MC5_ADDR;
 498                 break;
 499               case 0x41A:
 500                 regNum = MISCREG_MC6_ADDR;
 501                 break;
 502               case 0x41E:
 503                 regNum = MISCREG_MC7_ADDR;
 504                 break;
 505               case 0x403:
 506                 regNum = MISCREG_MC0_MISC;
 507                 break;
 508               case 0x407:
 509                 regNum = MISCREG_MC1_MISC;
 510                 break;
 511               case 0x40B:
 512                 regNum = MISCREG_MC2_MISC;
 513                 break;
 514               case 0x40F:
 515                 regNum = MISCREG_MC3_MISC;
 516                 break;
 517               case 0x413:
 518                 regNum = MISCREG_MC4_MISC;
 519                 break;
 520               case 0x417:
 521                 regNum = MISCREG_MC5_MISC;
 522                 break;
 523               case 0x41B:
 524                 regNum = MISCREG_MC6_MISC;
 525                 break;
 526               case 0x41F:
 527                 regNum = MISCREG_MC7_MISC;
 528                 break;
 529               case 0xC0000080:
 530                 regNum = MISCREG_EFER;
 531                 break;
 532               case 0xC0000081:
 533                 regNum = MISCREG_STAR;
 534                 break;
 535               case 0xC0000082:
 536                 regNum = MISCREG_LSTAR;
 537                 break;
 538               case 0xC0000083:
 539                 regNum = MISCREG_CSTAR;
 540                 break;
 541               case 0xC0000084:
 542                 regNum = MISCREG_SF_MASK;
 543                 break;
 544               case 0xC0000100:
 545                 regNum = MISCREG_FS_BASE;
 546                 break;
 547               case 0xC0000101:
 548                 regNum = MISCREG_GS_BASE;
 549                 break;
 550               case 0xC0000102:
 551                 regNum = MISCREG_KERNEL_GS_BASE;
 552                 break;
 553               case 0xC0000103:
 554                 regNum = MISCREG_TSC_AUX;
 555                 break;
 556               case 0xC0010000:
 557                 regNum = MISCREG_PERF_EVT_SEL0;
 558                 break;
 559               case 0xC0010001:
 560                 regNum = MISCREG_PERF_EVT_SEL1;
 561                 break;
 562               case 0xC0010002:
 563                 regNum = MISCREG_PERF_EVT_SEL2;
 564                 break;
 565               case 0xC0010003:
 566                 regNum = MISCREG_PERF_EVT_SEL3;
 567                 break;
 568               case 0xC0010004:
 569                 regNum = MISCREG_PERF_EVT_CTR0;
 570                 break;
 571               case 0xC0010005:
 572                 regNum = MISCREG_PERF_EVT_CTR1;
 573                 break;
 574               case 0xC0010006:
 575                 regNum = MISCREG_PERF_EVT_CTR2;
 576                 break;
 577               case 0xC0010007:
 578                 regNum = MISCREG_PERF_EVT_CTR3;
 579                 break;
 580               case 0xC0010010:
 581                 regNum = MISCREG_SYSCFG;
 582                 break;
 583               case 0xC0010016:
 584                 regNum = MISCREG_IORR_BASE0;
 585                 break;
 586               case 0xC0010017:
 587                 regNum = MISCREG_IORR_BASE1;
 588                 break;
 589               case 0xC0010018:
 590                 regNum = MISCREG_IORR_MASK0;
 591                 break;
 592               case 0xC0010019:
 593                 regNum = MISCREG_IORR_MASK1;
 594                 break;
 595               case 0xC001001A:
 596                 regNum = MISCREG_TOP_MEM;
 597                 break;
 598               case 0xC001001D:
 599                 regNum = MISCREG_TOP_MEM2;
 600                 break;
 601               case 0xC0010114:
 602                 regNum = MISCREG_VM_CR;
 603                 break;
 604               case 0xC0010115:
 605                 regNum = MISCREG_IGNNE;
 606                 break;
 607               case 0xC0010116:
 608                 regNum = MISCREG_SMM_CTL;
 609                 break;
 610               case 0xC0010117:
 611                 regNum = MISCREG_VM_HSAVE_PA;
 612                 break;
 613               default:
 614                 return std::make_shared<GeneralProtection>(0);
 615             }
 616             //The index is multiplied by the size of a MiscReg so that
 617             //any memory dependence calculations will not see these as
 618             //overlapping.
 619             req->setPaddr(regNum * sizeof(MiscReg));
 620             return NoFault;
 621         } else if (prefix == IntAddrPrefixIO) {
 622             // TODO If CPL > IOPL or in virtual mode, check the I/O permission
 623             // bitmap in the TSS.
 624
 625             Addr IOPort = vaddr & ~IntAddrPrefixMask;
 626             // Make sure the address fits in the expected 16 bit IO address
 627             // space.
 628             assert(!(IOPort & ~0xFFFF));
 629
 630             if (IOPort == 0xCF8 && req->getSize() == 4) {
 631                 req->setFlags(Request::MMAPPED_IPR);
 632                 req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg));
 633             } else if ((IOPort & ~mask(2)) == 0xCFC) {
 634                 req->setFlags(Request::UNCACHEABLE);
 635
 636                 Addr configAddress =
 637                     tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
 638
 639                 if (bits(configAddress, 31, 31)) {
 640                     req->setPaddr(PhysAddrPrefixPciConfig |
 641                                   mbits(configAddress, 30, 2) |
 642                                   (IOPort & mask(2)));
 643                 } else {
 644                     req->setPaddr(PhysAddrPrefixIO | IOPort);
 645                 }
 646             } else {
 647                 req->setFlags(Request::UNCACHEABLE);
 648                 req->setPaddr(PhysAddrPrefixIO | IOPort);
 649             }
 650             return NoFault;
 651         } else {
 652             panic("Access to unrecognized internal address space %#x.\n",
 653                   prefix);
 654         }
 655     }
 656
 657     /**
 658      * TLB_lookup will only perform a TLB lookup returning true on a TLB hit
 659      * and false on a TLB miss.
 660      * Many of the checks about different modes have been converted to
 661      * assertions, since these parts of the code are not really used.
 662      * On a hit it will update the LRU stack.
 663      */
 664     bool
 665     GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats)
 666     {
 667         bool tlb_hit = false;
 668     #ifndef NDEBUG
 669         uint32_t flags = req->getFlags();
 670         int seg = flags & SegmentFlagMask;
 671     #endif
 672
 673         assert(seg != SEGMENT_REG_MS);
 674         Addr vaddr = req->getVaddr();
 675         DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
 676         HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
 677
 678         if (m5Reg.prot) {
 679             DPRINTF(GPUTLB, "In protected mode.\n");
 680             // make sure we are in 64-bit mode
 681             assert(m5Reg.mode == LongMode);
 682
 683             // If paging is enabled, do the translation.
 684             if (m5Reg.paging) {
 685                 DPRINTF(GPUTLB, "Paging enabled.\n");
 686                 //update LRU stack on a hit
 687                 GpuTlbEntry *entry = lookup(vaddr, true);
 688
 689                 if (entry)
 690                     tlb_hit = true;
 691
 692                 if (!update_stats) {
 693                     // functional tlb access for memory initialization
 694                     // i.e., memory seeding or instr. seeding -> don't update
 695                     // TLB and stats
 696                     return tlb_hit;
 697                 }
 698
 699                 localNumTLBAccesses++;
 700
 701                 if (!entry) {
 702                     localNumTLBMisses++;
 703                 } else {
 704                     localNumTLBHits++;
 705                 }
 706             }
 707         }
 708
 709         return tlb_hit;
 710     }
 711
 712     Fault
 713     GpuTLB::translate(RequestPtr req, ThreadContext *tc,
 714                       Translation *translation, Mode mode,
 715                       bool &delayedResponse, bool timing, int &latency)
 716     {
 717         uint32_t flags = req->getFlags();
 718         int seg = flags & SegmentFlagMask;
 719         bool storeCheck = flags & (StoreCheck << FlagShift);
 720
 721         // If this is true, we're dealing with a request
 722         // to a non-memory address space.
 723         if (seg == SEGMENT_REG_MS) {
 724             return translateInt(req, tc);
 725         }
 726
 727         delayedResponse = false;
 728         Addr vaddr = req->getVaddr();
 729         DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
 730
 731         HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
 732
 733         // If protected mode has been enabled...
 734         if (m5Reg.prot) {
 735             DPRINTF(GPUTLB, "In protected mode.\n");
 736             // If we're not in 64-bit mode, do protection/limit checks
 737             if (m5Reg.mode != LongMode) {
 738                 DPRINTF(GPUTLB, "Not in long mode. Checking segment "
 739                         "protection.\n");
 740
 741                 // Check for a null segment selector.
 742                 if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
 743                     seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
 744                     && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
 745                     return std::make_shared<GeneralProtection>(0);
 746                 }
 747
 748                 bool expandDown = false;
 749                 SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
 750
 751                 if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
 752                     if (!attr.writable && (mode == BaseTLB::Write ||
 753                         storeCheck))
 754                         return std::make_shared<GeneralProtection>(0);
 755
 756                     if (!attr.readable && mode == BaseTLB::Read)
 757                         return std::make_shared<GeneralProtection>(0);
 758
 759                     expandDown = attr.expandDown;
 760
 761                 }
 762
 763                 Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
 764                 Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
 765                 // This assumes we're not in 64 bit mode. If we were, the
 766                 // default address size is 64 bits, overridable to 32.
 767                 int size = 32;
 768                 bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
 769                 SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
 770
 771                 if ((csAttr.defaultSize && sizeOverride) ||
 772                     (!csAttr.defaultSize && !sizeOverride)) {
 773                     size = 16;
 774                 }
 775
 776                 Addr offset = bits(vaddr - base, size - 1, 0);
 777                 Addr endOffset = offset + req->getSize() - 1;
 778
 779                 if (expandDown) {
 780                     DPRINTF(GPUTLB, "Checking an expand down segment.\n");
 781                     warn_once("Expand down segments are untested.\n");
 782
 783                     if (offset <= limit || endOffset <= limit)
 784                         return std::make_shared<GeneralProtection>(0);
 785                 } else {
 786                     if (offset > limit || endOffset > limit)
 787                         return std::make_shared<GeneralProtection>(0);
 788                 }
 789             }
 790
 791             // If paging is enabled, do the translation.
 792             if (m5Reg.paging) {
 793                 DPRINTF(GPUTLB, "Paging enabled.\n");
 794                 // The vaddr already has the segment base applied.
 795                 GpuTlbEntry *entry = lookup(vaddr);
 796                 localNumTLBAccesses++;
 797
 798                 if (!entry) {
 799                     localNumTLBMisses++;
 800                     if (timing) {
 801                         latency = missLatency1;
 802                     }
 803
 804                     if (FullSystem) {
 805                         fatal("GpuTLB doesn't support full-system mode\n");
 806                     } else {
 807                         DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
 808                                 "at pc %#x.\n", vaddr, tc->instAddr());
 809
 810                         Process *p = tc->getProcessPtr();
 811                         const EmulationPageTable::Entry *pte =
 812                             p->pTable->lookup(vaddr);
 813
 814                         if (!pte && mode != BaseTLB::Execute) {
 815                             // penalize a "page fault" more
 816                             if (timing)
 817                                 latency += missLatency2;
 818
 819                             if (p->fixupStackFault(vaddr))
 820                                 pte = p->pTable->lookup(vaddr);
 821                         }
 822
 823                         if (!pte) {
 824                             return std::make_shared<PageFault>(vaddr, true,
 825                                                                mode, true,
 826                                                                false);
 827                         } else {
 828                             Addr alignedVaddr = p->pTable->pageAlign(vaddr);
 829
 830                             DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
 831                                     alignedVaddr, pte->paddr);
 832
 833                             GpuTlbEntry gpuEntry(
 834                                 p->pTable->pid(), alignedVaddr,
 835                                 pte->paddr, true);
 836                             entry = insert(alignedVaddr, gpuEntry);
 837                         }
 838
 839                         DPRINTF(GPUTLB, "Miss was serviced.\n");
 840                     }
 841                 } else {
 842                     localNumTLBHits++;
 843
 844                     if (timing) {
 845                         latency = hitLatency;
 846                     }
 847                 }
 848
 849                 // Do paging protection checks.
 850                 bool inUser = (m5Reg.cpl == 3 &&
 851                                !(flags & (CPL0FlagBit << FlagShift)));
 852
 853                 CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
 854                 bool badWrite = (!entry->writable && (inUser || cr0.wp));
 855
 856                 if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
 857                      badWrite)) {
 858                     // The page must have been present to get into the TLB in
 859                     // the first place. We'll assume the reserved bits are
 860                     // fine even though we're not checking them.
 861                     return std::make_shared<PageFault>(vaddr, true, mode,
 862                                                        inUser, false);
 863                 }
 864
 865                 if (storeCheck && badWrite) {
 866                     // This would fault if this were a write, so return a page
 867                     // fault that reflects that happening.
 868                     return std::make_shared<PageFault>(vaddr, true,
 869                                                        BaseTLB::Write,
 870                                                        inUser, false);
 871                 }
 872
 873
 874                 DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
 875                         "checks.\n", entry->paddr);
 876
 877                 int page_size = entry->size();
 878                 Addr paddr = entry->paddr | (vaddr & (page_size - 1));
 879                 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
 880                 req->setPaddr(paddr);
 881
 882                 if (entry->uncacheable)
 883                     req->setFlags(Request::UNCACHEABLE);
 884             } else {
 885                 //Use the address which already has segmentation applied.
 886                 DPRINTF(GPUTLB, "Paging disabled.\n");
 887                 DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
 888                 req->setPaddr(vaddr);
 889             }
 890         } else {
 891             // Real mode
 892             DPRINTF(GPUTLB, "In real mode.\n");
 893             DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
 894             req->setPaddr(vaddr);
 895         }
 896
 897         // Check for an access to the local APIC
 898         if (FullSystem) {
 899             LocalApicBase localApicBase =
 900                 tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
 901
 902             Addr baseAddr = localApicBase.base * PageBytes;
 903             Addr paddr = req->getPaddr();
 904
 905             if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
 906                 // Force the access to be uncacheable.
 907                 req->setFlags(Request::UNCACHEABLE);
 908                 req->setPaddr(x86LocalAPICAddress(tc->contextId(),
 909                                                   paddr - baseAddr));
 910             }
 911         }
 912
 913         return NoFault;
 914     };
 915
 916     Fault
 917     GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
 918                             int &latency)
 919     {
 920         bool delayedResponse;
 921
 922         return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
 923                                  latency);
 924     }
 925
 926     void
 927     GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc,
 928             Translation *translation, Mode mode, int &latency)
 929     {
 930         bool delayedResponse;
 931         assert(translation);
 932
 933         Fault fault = GpuTLB::translate(req, tc, translation, mode,
 934                                         delayedResponse, true, latency);
 935
 936         if (!delayedResponse)
 937             translation->finish(fault, req, tc, mode);
 938     }
 939
 940     Walker*
 941     GpuTLB::getWalker()
 942     {
 943         return walker;
 944     }
 945
 946
 947     void
 948     GpuTLB::serialize(CheckpointOut &cp) const
 949     {
 950     }
 951
 952     void
 953     GpuTLB::unserialize(CheckpointIn &cp)
 954     {
 955     }
 956
 957     void
 958     GpuTLB::regStats()
 959     {
 960         MemObject::regStats();
 961
 962         localNumTLBAccesses
 963             .name(name() + ".local_TLB_accesses")
 964             .desc("Number of TLB accesses")
 965             ;
 966
 967         localNumTLBHits
 968             .name(name() + ".local_TLB_hits")
 969             .desc("Number of TLB hits")
 970             ;
 971
 972         localNumTLBMisses
 973             .name(name() + ".local_TLB_misses")
 974             .desc("Number of TLB misses")
 975             ;
 976
 977         localTLBMissRate
 978             .name(name() + ".local_TLB_miss_rate")
 979             .desc("TLB miss rate")
 980             ;
 981
 982         accessCycles
 983             .name(name() + ".access_cycles")
 984             .desc("Cycles spent accessing this TLB level")
 985             ;
 986
 987         pageTableCycles
 988             .name(name() + ".page_table_cycles")
 989             .desc("Cycles spent accessing the page table")
 990             ;
 991
 992         localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
 993
 994         numUniquePages
 995             .name(name() + ".unique_pages")
 996             .desc("Number of unique pages touched")
 997             ;
 998
 999         localCycles
1000             .name(name() + ".local_cycles")
1001             .desc("Number of cycles spent in queue for all incoming reqs")
1002             ;
1003
1004         localLatency
1005             .name(name() + ".local_latency")
1006             .desc("Avg. latency over incoming coalesced reqs")
1007             ;
1008
1009         localLatency = localCycles / localNumTLBAccesses;
1010
1011         globalNumTLBAccesses
1012             .name(name() + ".global_TLB_accesses")
1013             .desc("Number of TLB accesses")
1014             ;
1015
1016         globalNumTLBHits
1017             .name(name() + ".global_TLB_hits")
1018             .desc("Number of TLB hits")
1019             ;
1020
1021         globalNumTLBMisses
1022             .name(name() + ".global_TLB_misses")
1023             .desc("Number of TLB misses")
1024             ;
1025
1026         globalTLBMissRate
1027             .name(name() + ".global_TLB_miss_rate")
1028             .desc("TLB miss rate")
1029             ;
1030
1031         globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
1032
1033         avgReuseDistance
1034             .name(name() + ".avg_reuse_distance")
1035             .desc("avg. reuse distance over all pages (in ticks)")
1036             ;
1037
1038     }
1039
1040     /**
1041      * Do the TLB lookup for this coalesced request and schedule
1042      * another event <TLB access latency> cycles later.
1043      */
1044
1045     void
1046     GpuTLB::issueTLBLookup(PacketPtr pkt)
1047     {
1048         assert(pkt);
1049         assert(pkt->senderState);
1050
1051         Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1052                                         TheISA::PageBytes);
1053
1054         TranslationState *sender_state =
1055                 safe_cast<TranslationState*>(pkt->senderState);
1056
1057         bool update_stats = !sender_state->prefetch;
1058         ThreadContext * tmp_tc = sender_state->tc;
1059
1060         DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
1061                 virt_page_addr);
1062
1063         int req_cnt = sender_state->reqCnt.back();
1064
1065         if (update_stats) {
1066             accessCycles -= (curTick() * req_cnt);
1067             localCycles -= curTick();
1068             updatePageFootprint(virt_page_addr);
1069             globalNumTLBAccesses += req_cnt;
1070         }
1071
1072         tlbOutcome lookup_outcome = TLB_MISS;
1073         RequestPtr tmp_req = pkt->req;
1074
1075         // Access the TLB and figure out if it's a hit or a miss.
1076         bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
1077
1078         if (success) {
1079             lookup_outcome = TLB_HIT;
1080             // Put the entry in SenderState
1081             GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false);
1082             assert(entry);
1083
1084             sender_state->tlbEntry =
1085                 new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
1086
1087             if (update_stats) {
1088                 // the reqCnt has an entry per level, so its size tells us
1089                 // which level we are in
1090                 sender_state->hitLevel = sender_state->reqCnt.size();
1091                 globalNumTLBHits += req_cnt;
1092             }
1093         } else {
1094             if (update_stats)
1095                 globalNumTLBMisses += req_cnt;
1096         }
1097
1098         /*
1099          * We now know the TLB lookup outcome (if it's a hit or a miss), as well
1100          * as the TLB access latency.
1101          *
1102          * We create and schedule a new TLBEvent which will help us take the
1103          * appropriate actions (e.g., update TLB on a hit, send request to lower
1104          * level TLB on a miss, or start a page walk if this was the last-level
1105          * TLB)
1106          */
1107         TLBEvent *tlb_event =
1108             new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
1109
1110         if (translationReturnEvent.count(virt_page_addr)) {
1111             panic("Virtual Page Address %#x already has a return event\n",
1112                   virt_page_addr);
1113         }
1114
1115         translationReturnEvent[virt_page_addr] = tlb_event;
1116         assert(tlb_event);
1117
1118         DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
1119                 curTick() + this->ticks(hitLatency));
1120
1121         schedule(tlb_event, curTick() + this->ticks(hitLatency));
1122     }
1123
1124     GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
1125                                PacketPtr _pkt)
1126         : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
1127         outcome(tlb_outcome), pkt(_pkt)
1128     {
1129     }
1130
1131     /**
1132      * Do Paging protection checks. If we encounter a page fault, then
1133      * an assertion is fired.
1134      */
1135     void
1136     GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
1137             GpuTlbEntry * tlb_entry, Mode mode)
1138     {
1139         HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
1140         uint32_t flags = pkt->req->getFlags();
1141         bool storeCheck = flags & (StoreCheck << FlagShift);
1142
1143         // Do paging protection checks.
1144         bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
1145         CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
1146
1147         bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
1148
1149         if ((inUser && !tlb_entry->user) ||
1150             (mode == BaseTLB::Write && badWrite)) {
1151            // The page must have been present to get into the TLB in
1152            // the first place. We'll assume the reserved bits are
1153            // fine even though we're not checking them.
1154            assert(false);
1155         }
1156
1157         if (storeCheck && badWrite) {
1158            // This would fault if this were a write, so return a page
1159            // fault that reflects that happening.
1160            assert(false);
1161         }
1162     }
1163
1164     /**
1165      * handleTranslationReturn is called on a TLB hit,
1166      * when a TLB miss returns or when a page fault returns.
1167      * The latter calls handelHit with TLB miss as tlbOutcome.
1168      */
1169     void
1170     GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
1171             PacketPtr pkt)
1172     {
1173
1174         assert(pkt);
1175         Addr vaddr = pkt->req->getVaddr();
1176
1177         TranslationState *sender_state =
1178             safe_cast<TranslationState*>(pkt->senderState);
1179
1180         ThreadContext *tc = sender_state->tc;
1181         Mode mode = sender_state->tlbMode;
1182
1183         GpuTlbEntry *local_entry, *new_entry;
1184
1185         if (tlb_outcome == TLB_HIT) {
1186             DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
1187             local_entry = sender_state->tlbEntry;
1188         } else {
1189             DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
1190                     vaddr);
1191
1192             // We are returning either from a page walk or from a hit at a lower
1193             // TLB level. The senderState should be "carrying" a pointer to the
1194             // correct TLBEntry.
1195             new_entry = sender_state->tlbEntry;
1196             assert(new_entry);
1197             local_entry = new_entry;
1198
1199             if (allocationPolicy) {
1200                 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
1201                         virt_page_addr);
1202
1203                 local_entry = insert(virt_page_addr, *new_entry);
1204             }
1205
1206             assert(local_entry);
1207         }
1208
1209         /**
1210          * At this point the packet carries an up-to-date tlbEntry pointer
1211          * in its senderState.
1212          * Next step is to do the paging protection checks.
1213          */
1214         DPRINTF(GPUTLB, "Entry found with vaddr %#x,  doing protection checks "
1215                 "while paddr was %#x.\n", local_entry->vaddr,
1216                 local_entry->paddr);
1217
1218         pagingProtectionChecks(tc, pkt, local_entry, mode);
1219         int page_size = local_entry->size();
1220         Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
1221         DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
1222
1223         // Since this packet will be sent through the cpu side slave port,
1224         // it must be converted to a response pkt if it is not one already
1225         if (pkt->isRequest()) {
1226             pkt->makeTimingResponse();
1227         }
1228
1229         pkt->req->setPaddr(paddr);
1230
1231         if (local_entry->uncacheable) {
1232              pkt->req->setFlags(Request::UNCACHEABLE);
1233         }
1234
1235         //send packet back to coalescer
1236         cpuSidePort[0]->sendTimingResp(pkt);
1237         //schedule cleanup event
1238         cleanupQueue.push(virt_page_addr);
1239
1240         // schedule this only once per cycle.
1241         // The check is required because we might have multiple translations
1242         // returning the same cycle
1243         // this is a maximum priority event and must be on the same cycle
1244         // as the cleanup event in TLBCoalescer to avoid a race with
1245         // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
1246         if (!cleanupEvent.scheduled())
1247             schedule(cleanupEvent, curTick());
1248     }
1249
1250     /**
1251      * Here we take the appropriate actions based on the result of the
1252      * TLB lookup.
1253      */
1254     void
1255     GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
1256                               PacketPtr pkt)
1257     {
1258         DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
1259
1260         assert(translationReturnEvent[virtPageAddr]);
1261         assert(pkt);
1262
1263         TranslationState *tmp_sender_state =
1264             safe_cast<TranslationState*>(pkt->senderState);
1265
1266         int req_cnt = tmp_sender_state->reqCnt.back();
1267         bool update_stats = !tmp_sender_state->prefetch;
1268
1269
1270         if (outcome == TLB_HIT) {
1271             handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
1272
1273             if (update_stats) {
1274                 accessCycles += (req_cnt * curTick());
1275                 localCycles += curTick();
1276             }
1277
1278         } else if (outcome == TLB_MISS) {
1279
1280             DPRINTF(GPUTLB, "This is a TLB miss\n");
1281             if (update_stats) {
1282                 accessCycles += (req_cnt*curTick());
1283                 localCycles += curTick();
1284             }
1285
1286             if (hasMemSidePort) {
1287                 // the one cyle added here represent the delay from when we get
1288                 // the reply back till when we propagate it to the coalescer
1289                 // above.
1290                 if (update_stats) {
1291                     accessCycles += (req_cnt * 1);
1292                     localCycles += 1;
1293                 }
1294
1295                 /**
1296                  * There is a TLB below. Send the coalesced request.
1297                  * We actually send the very first packet of all the
1298                  * pending packets for this virtual page address.
1299                  */
1300                 if (!memSidePort[0]->sendTimingReq(pkt)) {
1301                     DPRINTF(GPUTLB, "Failed sending translation request to "
1302                             "lower level TLB for addr %#x\n", virtPageAddr);
1303
1304                     memSidePort[0]->retries.push_back(pkt);
1305                 } else {
1306                     DPRINTF(GPUTLB, "Sent translation request to lower level "
1307                             "TLB for addr %#x\n", virtPageAddr);
1308                 }
1309             } else {
1310                 //this is the last level TLB. Start a page walk
1311                 DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
1312                         "addr %#x\n", virtPageAddr);
1313
1314                 if (update_stats)
1315                     pageTableCycles -= (req_cnt*curTick());
1316
1317                 TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
1318                 assert(tlb_event);
1319                 tlb_event->updateOutcome(PAGE_WALK);
1320                 schedule(tlb_event, curTick() + ticks(missLatency2));
1321             }
1322         } else if (outcome == PAGE_WALK) {
1323             if (update_stats)
1324                 pageTableCycles += (req_cnt*curTick());
1325
1326             // Need to access the page table and update the TLB
1327             DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1328                     virtPageAddr);
1329
1330             TranslationState *sender_state =
1331                 safe_cast<TranslationState*>(pkt->senderState);
1332
1333             Process *p = sender_state->tc->getProcessPtr();
1334             Addr vaddr = pkt->req->getVaddr();
1335     #ifndef NDEBUG
1336             Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1337             assert(alignedVaddr == virtPageAddr);
1338     #endif
1339             const EmulationPageTable::Entry *pte = p->pTable->lookup(vaddr);
1340             if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1341                     p->fixupStackFault(vaddr)) {
1342                 pte = p->pTable->lookup(vaddr);
1343             }
1344
1345             if (pte) {
1346                 DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1347                         pte->paddr);
1348
1349                 sender_state->tlbEntry =
1350                     new GpuTlbEntry(0, virtPageAddr, pte->paddr, true);
1351             } else {
1352                 sender_state->tlbEntry =
1353                     new GpuTlbEntry(0, 0, 0, false);
1354             }
1355
1356             handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1357         } else if (outcome == MISS_RETURN) {
1358             /** we add an extra cycle in the return path of the translation
1359              * requests in between the various TLB levels.
1360              */
1361             handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
1362         } else {
1363             assert(false);
1364         }
1365     }
1366
1367     void
1368     GpuTLB::TLBEvent::process()
1369     {
1370         tlb->translationReturn(virtPageAddr, outcome, pkt);
1371     }
1372
1373     const char*
1374     GpuTLB::TLBEvent::description() const
1375     {
1376         return "trigger translationDoneEvent";
1377     }
1378
1379     void
1380     GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
1381     {
1382         outcome = _outcome;
1383     }
1384
1385     Addr
1386     GpuTLB::TLBEvent::getTLBEventVaddr()
1387     {
1388         return virtPageAddr;
1389     }
1390
1391     /*
1392      * recvTiming receives a coalesced timing request from a TLBCoalescer
1393      * and it calls issueTLBLookup()
1394      * It only rejects the packet if we have exceeded the max
1395      * outstanding number of requests for the TLB
1396      */
1397     bool
1398     GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
1399     {
1400         if (tlb->outstandingReqs < tlb->maxCoalescedReqs) {
1401             tlb->issueTLBLookup(pkt);
1402             // update number of outstanding translation requests
1403             tlb->outstandingReqs++;
1404             return true;
1405          } else {
1406             DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
1407                     tlb->outstandingReqs);
1408             return false;
1409          }
1410     }
1411
1412     /**
1413      * handleFuncTranslationReturn is called on a TLB hit,
1414      * when a TLB miss returns or when a page fault returns.
1415      * It updates LRU, inserts the TLB entry on a miss
1416      * depending on the allocation policy and does the required
1417      * protection checks. It does NOT create a new packet to
1418      * update the packet's addr; this is done in hsail-gpu code.
1419      */
1420     void
1421     GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
1422     {
1423         TranslationState *sender_state =
1424             safe_cast<TranslationState*>(pkt->senderState);
1425
1426         ThreadContext *tc = sender_state->tc;
1427         Mode mode = sender_state->tlbMode;
1428         Addr vaddr = pkt->req->getVaddr();
1429
1430         GpuTlbEntry *local_entry, *new_entry;
1431
1432         if (tlb_outcome == TLB_HIT) {
1433             DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
1434                     "%#x\n", vaddr);
1435
1436             local_entry = sender_state->tlbEntry;
1437         } else {
1438             DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
1439                     "%#x\n", vaddr);
1440
1441             // We are returning either from a page walk or from a hit at a lower
1442             // TLB level. The senderState should be "carrying" a pointer to the
1443             // correct TLBEntry.
1444             new_entry = sender_state->tlbEntry;
1445             assert(new_entry);
1446             local_entry = new_entry;
1447
1448             if (allocationPolicy) {
1449                 Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
1450
1451                 DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
1452                         virt_page_addr);
1453
1454                 local_entry = insert(virt_page_addr, *new_entry);
1455             }
1456
1457             assert(local_entry);
1458         }
1459
1460         DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
1461                 "while paddr was %#x.\n", local_entry->vaddr,
1462                 local_entry->paddr);
1463
1464         // Do paging checks if it's a normal functional access.  If it's for a
1465         // prefetch, then sometimes you can try to prefetch something that won't
1466         // pass protection. We don't actually want to fault becuase there is no
1467         // demand access to deem this a violation.  Just put it in the TLB and
1468         // it will fault if indeed a future demand access touches it in
1469         // violation.
1470         if (!sender_state->prefetch && sender_state->tlbEntry->valid)
1471             pagingProtectionChecks(tc, pkt, local_entry, mode);
1472
1473         int page_size = local_entry->size();
1474         Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
1475         DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
1476
1477         pkt->req->setPaddr(paddr);
1478
1479         if (local_entry->uncacheable)
1480              pkt->req->setFlags(Request::UNCACHEABLE);
1481     }
1482
1483     // This is used for atomic translations. Need to
1484     // make it all happen during the same cycle.
1485     void
1486     GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
1487     {
1488         TranslationState *sender_state =
1489             safe_cast<TranslationState*>(pkt->senderState);
1490
1491         ThreadContext *tc = sender_state->tc;
1492         bool update_stats = !sender_state->prefetch;
1493
1494         Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1495                                         TheISA::PageBytes);
1496
1497         if (update_stats)
1498             tlb->updatePageFootprint(virt_page_addr);
1499
1500         // do the TLB lookup without updating the stats
1501         bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
1502         tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
1503
1504         // functional mode means no coalescing
1505         // global metrics are the same as the local metrics
1506         if (update_stats) {
1507             tlb->globalNumTLBAccesses++;
1508
1509             if (success) {
1510                 sender_state->hitLevel = sender_state->reqCnt.size();
1511                 tlb->globalNumTLBHits++;
1512             }
1513         }
1514
1515         if (!success) {
1516             if (update_stats)
1517                 tlb->globalNumTLBMisses++;
1518             if (tlb->hasMemSidePort) {
1519                 // there is a TLB below -> propagate down the TLB hierarchy
1520                 tlb->memSidePort[0]->sendFunctional(pkt);
1521                 // If no valid translation from a prefetch, then just return
1522                 if (sender_state->prefetch && !pkt->req->hasPaddr())
1523                     return;
1524             } else {
1525                 // Need to access the page table and update the TLB
1526                 DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
1527                         virt_page_addr);
1528
1529                 Process *p = tc->getProcessPtr();
1530
1531                 Addr vaddr = pkt->req->getVaddr();
1532     #ifndef NDEBUG
1533                 Addr alignedVaddr = p->pTable->pageAlign(vaddr);
1534                 assert(alignedVaddr == virt_page_addr);
1535     #endif
1536
1537                 const EmulationPageTable::Entry *pte =
1538                         p->pTable->lookup(vaddr);
1539                 if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
1540                         p->fixupStackFault(vaddr)) {
1541                     pte = p->pTable->lookup(vaddr);
1542                 }
1543
1544                 if (!sender_state->prefetch) {
1545                     // no PageFaults are permitted after
1546                     // the second page table lookup
1547                     assert(pte);
1548
1549                     DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1550                             pte->paddr);
1551
1552                     sender_state->tlbEntry =
1553                         new GpuTlbEntry(0, virt_page_addr,
1554                                         pte->paddr, true);
1555                 } else {
1556                     // If this was a prefetch, then do the normal thing if it
1557                     // was a successful translation.  Otherwise, send an empty
1558                     // TLB entry back so that it can be figured out as empty and
1559                     // handled accordingly.
1560                     if (pte) {
1561                         DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
1562                                 pte->paddr);
1563
1564                         sender_state->tlbEntry =
1565                             new GpuTlbEntry(0, virt_page_addr,
1566                                             pte->paddr, true);
1567                     } else {
1568                         DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
1569                                 alignedVaddr);
1570
1571                         sender_state->tlbEntry = new GpuTlbEntry();
1572
1573                         return;
1574                     }
1575                 }
1576             }
1577         } else {
1578             DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
1579                     tlb->lookup(pkt->req->getVaddr()));
1580
1581             GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
1582                                              update_stats);
1583
1584             assert(entry);
1585
1586             sender_state->tlbEntry =
1587                 new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
1588         }
1589         // This is the function that would populate pkt->req with the paddr of
1590         // the translation. But if no translation happens (i.e Prefetch fails)
1591         // then the early returns in the above code wiill keep this function
1592         // from executing.
1593         tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
1594     }
1595
1596     void
1597     GpuTLB::CpuSidePort::recvReqRetry()
1598     {
1599         // The CPUSidePort never sends anything but replies. No retries
1600         // expected.
1601         assert(false);
1602     }
1603
1604     AddrRangeList
1605     GpuTLB::CpuSidePort::getAddrRanges() const
1606     {
1607         // currently not checked by the master
1608         AddrRangeList ranges;
1609
1610         return ranges;
1611     }
1612
1613     /**
1614      * MemSidePort receives the packet back.
1615      * We need to call the handleTranslationReturn
1616      * and propagate up the hierarchy.
1617      */
1618     bool
1619     GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
1620     {
1621         Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
1622                                         TheISA::PageBytes);
1623
1624         DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
1625                 virt_page_addr);
1626
1627         TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
1628         assert(tlb_event);
1629         assert(virt_page_addr == tlb_event->getTLBEventVaddr());
1630
1631         tlb_event->updateOutcome(MISS_RETURN);
1632         tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
1633
1634         return true;
1635     }
1636
1637     void
1638     GpuTLB::MemSidePort::recvReqRetry()
1639     {
1640         // No retries should reach the TLB. The retries
1641         // should only reach the TLBCoalescer.
1642         assert(false);
1643     }
1644
1645     void
1646     GpuTLB::cleanup()
1647     {
1648         while (!cleanupQueue.empty()) {
1649             Addr cleanup_addr = cleanupQueue.front();
1650             cleanupQueue.pop();
1651
1652             // delete TLBEvent
1653             TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
1654             delete old_tlb_event;
1655             translationReturnEvent.erase(cleanup_addr);
1656
1657             // update number of outstanding requests
1658             outstandingReqs--;
1659         }
1660
1661         /** the higher level coalescer should retry if it has
1662          * any pending requests.
1663          */
1664         for (int i = 0; i < cpuSidePort.size(); ++i) {
1665             cpuSidePort[i]->sendRetryReq();
1666         }
1667     }
1668
1669     void
1670     GpuTLB::updatePageFootprint(Addr virt_page_addr)
1671     {
1672
1673         std::pair<AccessPatternTable::iterator, bool> ret;
1674
1675         AccessInfo tmp_access_info;
1676         tmp_access_info.lastTimeAccessed = 0;
1677         tmp_access_info.accessesPerPage = 0;
1678         tmp_access_info.totalReuseDistance = 0;
1679         tmp_access_info.sumDistance = 0;
1680         tmp_access_info.meanDistance = 0;
1681
1682         ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
1683                                   tmp_access_info));
1684
1685         bool first_page_access = ret.second;
1686
1687         if (first_page_access) {
1688             numUniquePages++;
1689         } else  {
1690             int accessed_before;
1691             accessed_before  = curTick() - ret.first->second.lastTimeAccessed;
1692             ret.first->second.totalReuseDistance += accessed_before;
1693         }
1694
1695         ret.first->second.accessesPerPage++;
1696         ret.first->second.lastTimeAccessed = curTick();
1697
1698         if (accessDistance) {
1699             ret.first->second.localTLBAccesses
1700                 .push_back(localNumTLBAccesses.value());
1701         }
1702     }
1703
1704     void
1705     GpuTLB::exitCallback()
1706     {
1707         std::ostream *page_stat_file = nullptr;
1708
1709         if (accessDistance) {
1710
1711             // print per page statistics to a separate file (.csv format)
1712             // simout is the gem5 output directory (default is m5out or the one
1713             // specified with -d
1714             page_stat_file = simout.create(name().c_str())->stream();
1715
1716             // print header
1717             *page_stat_file << "page,max_access_distance,mean_access_distance, "
1718                             << "stddev_distance" << std::endl;
1719         }
1720
1721         // update avg. reuse distance footprint
1722         AccessPatternTable::iterator iter, iter_begin, iter_end;
1723         unsigned int sum_avg_reuse_distance_per_page = 0;
1724
1725         // iterate through all pages seen by this TLB
1726         for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
1727             sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
1728                                                iter->second.accessesPerPage;
1729
1730             if (accessDistance) {
1731                 unsigned int tmp = iter->second.localTLBAccesses[0];
1732                 unsigned int prev = tmp;
1733
1734                 for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
1735                     if (i) {
1736                         tmp = prev + 1;
1737                     }
1738
1739                     prev = iter->second.localTLBAccesses[i];
1740                     // update the localTLBAccesses value
1741                     // with the actual differece
1742                     iter->second.localTLBAccesses[i] -= tmp;
1743                     // compute the sum of AccessDistance per page
1744                     // used later for mean
1745                     iter->second.sumDistance +=
1746                         iter->second.localTLBAccesses[i];
1747                 }
1748
1749                 iter->second.meanDistance =
1750                     iter->second.sumDistance / iter->second.accessesPerPage;
1751
1752                 // compute std_dev and max  (we need a second round because we
1753                 // need to know the mean value
1754                 unsigned int max_distance = 0;
1755                 unsigned int stddev_distance = 0;
1756
1757                 for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
1758                     unsigned int tmp_access_distance =
1759                         iter->second.localTLBAccesses[i];
1760
1761                     if (tmp_access_distance > max_distance) {
1762                         max_distance = tmp_access_distance;
1763                     }
1764
1765                     unsigned int diff =
1766                         tmp_access_distance - iter->second.meanDistance;
1767                     stddev_distance += pow(diff, 2);
1768
1769                 }
1770
1771                 stddev_distance =
1772                     sqrt(stddev_distance/iter->second.accessesPerPage);
1773
1774                 if (page_stat_file) {
1775                     *page_stat_file << std::hex << iter->first << ",";
1776                     *page_stat_file << std::dec << max_distance << ",";
1777                     *page_stat_file << std::dec << iter->second.meanDistance
1778                                     << ",";
1779                     *page_stat_file << std::dec << stddev_distance;
1780                     *page_stat_file << std::endl;
1781                 }
1782
1783                 // erase the localTLBAccesses array
1784                 iter->second.localTLBAccesses.clear();
1785             }
1786         }
1787
1788         if (!TLBFootprint.empty()) {
1789             avgReuseDistance =
1790                 sum_avg_reuse_distance_per_page / TLBFootprint.size();
1791         }
1792
1793         //clear the TLBFootprint map
1794         TLBFootprint.clear();
1795     }
1796 } // namespace X86ISA
1797
1798 X86ISA::GpuTLB*
1799 X86GPUTLBParams::create()
1800 {
1801     return new X86ISA::GpuTLB(this);
1802 }
1803