8e7ba9ad589bc71d2359de4f430fc4c09ab7ba0f
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
5 * For use for simulation and test purposes only
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
10 * 1. Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
17 * 3. Neither the name of the copyright holder nor the names of its
18 * contributors may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
33 * Authors: Steve Reinhardt
36 #include "gpu-compute/shader.hh"
40 #include "arch/x86/linux/linux.hh"
41 #include "base/chunk_generator.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/HSAIL.hh"
45 #include "gpu-compute/dispatcher.hh"
46 #include "gpu-compute/gpu_static_inst.hh"
47 #include "gpu-compute/qstruct.hh"
48 #include "gpu-compute/wavefront.hh"
49 #include "mem/packet.hh"
50 #include "mem/ruby/system/RubySystem.hh"
51 #include "sim/sim_exit.hh"
// Constructor: wires the Shader up to its params — CPU pointer, CU list,
// wavefront count, and scheduling state. NOTE(review): this chunk is
// missing interior lines (e.g. the body opening and any container
// resizing between the init list and the loop) — do not edit blind.
53 Shader::Shader(const Params
*p
)
54 : ClockedObject(p
// clock is cached from the clock domain's period at construction time.
), clock(p
->clk_domain
->clockPeriod()),
55 cpuThread(nullptr), gpuTc(nullptr), cpuPointer(p
->cpu_pointer
),
// tickEvent runs processTick() at CPU-tick priority each scheduled cycle.
56 tickEvent([this]{ processTick(); }, "Shader tick",
57 false, Event::CPU_Tick_Pri
),
58 timingSim(p
->timing
), hsail_mode(SIMT
),
59 impl_kern_boundary_sync(p
->impl_kern_boundary_sync
),
60 separate_acquire_release(p
->separate_acquire_release
), coissue_return(1),
// n_cu derived from the size of the CUs vector param.
61 trace_vgpr_all(1), n_cu((p
->CUs
).size()), n_wf(p
->n_wf
),
62 globalMemSize(p
->globalmem
), nextSchedCu(0), sa_n(0), tick_cnt(0),
63 box_tick_cnt(0), start_tick_cnt(0)
// Register each CU with this shader; the assert checks param ordering
// matches each CU's cu_id.
68 for (int i
= 0; i
< n_cu
; ++i
) {
69 cuList
[i
] = p
->CUs
[i
];
70 assert(i
== cuList
[i
]->cu_id
);
71 cuList
[i
]->shader
= this;
// Allocate a page-aligned region in the host process's mmap area and
// return its start address (the declaration of `start` and the return
// statement fall outside this view). Grows the mmap region up or down
// depending on the process's convention.
76 Shader::mmap(int length
)
81 // round up length to the next page
82 length
= roundUp(length
, TheISA::PageBytes
);
// Memory state belongs to the process running on the attached CPU thread.
84 Process
*proc
= gpuTc
->getProcessPtr();
85 auto mem_state
= proc
->memState
;
87 if (proc
->mmapGrowsDown()) {
88 DPRINTF(HSAIL
, "GROWS DOWN");
// New region sits just below the current mmap end.
89 start
= mem_state
->getMmapEnd() - length
;
90 mem_state
->setMmapEnd(start
);
// (else branch — the `else` keyword itself is missing from this view)
92 DPRINTF(HSAIL
, "GROWS UP");
93 start
= mem_state
->getMmapEnd();
94 mem_state
->setMmapEnd(start
+ length
);
96 // assertion to make sure we don't overwrite the stack (it grows down)
97 assert(mem_state
->getStackBase() - mem_state
->getMaxStackSize() >
98 mem_state
->getMmapEnd());
101 DPRINTF(HSAIL
,"Shader::mmap start= %#x, %#x\n", start
, length
);
// Back the new virtual range with physical memory.
103 proc
->allocateMem(start
, length
);
// Fragment of an init-style routine (signature not visible in this view):
// binds gpuTc to hardware context 0 of the attached CPU, then iterates
// over the CUs (loop body not visible here).
111 // grab the threadContext of the thread running on the CPU
113 gpuTc
= cpuPointer
->getContext(0);
119 for (int j
= 0; j
< n_cu
; ++j
)
// Rebind gpuTc to the CPU hardware context `cid` — the context of the
// thread that dispatched the current work.
124 Shader::updateContext(int cid
) {
125 // context of the thread which dispatched work
127 gpuTc
= cpuPointer
->getContext(cid
);
// Wake the host CPU this shader is bound to: if `cpu` is our CPU and its
// GPU thread context is suspended, reactivate it. Any other CPU is a
// programming error (panic).
132 Shader::hostWakeUp(BaseCPU
*cpu
) {
133 if (cpuPointer
== cpu
) {
134 if (gpuTc
->status() == ThreadContext::Suspended
)
135 cpu
->activateContext(gpuTc
->threadId());
137 //Make sure both dispatcher and shader are trying to
138 //wakeup same host. Hack here to enable kernel launch
140 panic("Dispatcher wants to wakeup a different host");
// Standard gem5 params factory: constructs the Shader SimObject from its
// generated params struct. NOTE(review): raw `new` is the gem5 SimObject
// ownership convention, not a leak.
145 ShaderParams::create()
147 return new Shader(this);
// Fragment of the shader's per-cycle exec routine (signature not visible):
// snapshots tick counters, applies any ScheduleAdd deferred additions that
// have come due, then clocks each CU (loop body not visible here).
153 tick_cnt
= curTick();
154 box_tick_cnt
= curTick() - start_tick_cnt
;
156 // apply any scheduled adds
// NOTE(review): erasing from all three vectors while iterating with `i`
// skips the element that shifts into slot i — confirm the loop also
// adjusts `i`/`sa_n` in the lines missing from this view.
157 for (int i
= 0; i
< sa_n
; ++i
) {
158 if (sa_when
[i
] <= tick_cnt
) {
159 *sa_val
[i
] += sa_x
[i
];
160 sa_val
.erase(sa_val
.begin() + i
);
161 sa_x
.erase(sa_x
.begin() + i
);
162 sa_when
.erase(sa_when
.begin() + i
);
168 // clock all of the cu's
169 for (int i
= 0; i
< n_cu
; ++i
)
// Round-robin dispatch of the NDRange's remaining workgroups across CUs,
// starting at nextSchedCu. Returns true if at least one workgroup was
// scheduled this call. Workgroup-ID advance logic between the dispatch
// and the exhaustion checks is partially missing from this view.
174 Shader::dispatch_workgroups(NDRange
*ndr
)
176 bool scheduledSomething
= false;
178 int curCu
= nextSchedCu
;
180 while (cuCount
< n_cu
) {
181 //Every time we try a CU, update nextSchedCu
182 nextSchedCu
= (nextSchedCu
+ 1) % n_cu
;
184 // dispatch workgroup iff the following two conditions are met:
185 // (a) wg_rem is true - there are unassigned workgroups in the grid
186 // (b) there are enough free slots in cu cuList[i] for this wg
187 if (ndr
->wg_disp_rem
&& cuList
[curCu
]->ReadyWorkgroup(ndr
)) {
188 scheduledSomething
= true;
189 DPRINTF(GPUDisp
, "Dispatching a workgroup to CU %d\n", curCu
);
191 // ticks() member function translates cycles to simulation ticks.
// Kick the shader's tick event if it isn't already pending, so the
// newly dispatched work starts executing.
192 if (!tickEvent
.scheduled()) {
193 schedule(tickEvent
, curTick() + this->ticks(1));
196 cuList
[curCu
]->StartWorkgroup(ndr
);
// Nested exhaustion checks: when the workgroup index in each dimension
// has covered the grid size, mark the NDRange fully dispatched.
199 if (ndr
->wgId
[0] * ndr
->q
.wgSize
[0] >= ndr
->q
.gdSize
[0]) {
203 if (ndr
->wgId
[1] * ndr
->q
.wgSize
[1] >= ndr
->q
.gdSize
[1]) {
207 if (ndr
->wgId
[2] * ndr
->q
.wgSize
[2] >= ndr
->q
.gdSize
[2]) {
208 ndr
->wg_disp_rem
= false;
219 return scheduledSomething
;
// Store the back-pointer to the GPU dispatcher so the shader can route
// TLB/wakeup traffic through it later.
223 Shader::handshake(GpuDispatcher
*_dispatcher
)
225 dispatcher
= _dispatcher
;
// Perform a functional (non-timing) memory access on behalf of CU
// `cu_id`, translating via the functional TLB path. Accesses that cross
// a cache-line boundary are split into two requests/packets.
229 Shader::doFunctionalAccess(RequestPtr req
, MemCmd cmd
, void *data
,
230 bool suppress_func_errors
, int cu_id
)
232 int block_size
= cuList
.at(cu_id
)->cacheLineSize();
233 unsigned size
= req
->getSize();
// Map the memory command onto a TLB access mode; only plain reads and
// writes are supported here.
236 BaseTLB::Mode trans_mode
;
238 if (cmd
== MemCmd::ReadReq
) {
239 trans_mode
= BaseTLB::Read
;
240 } else if (cmd
== MemCmd::WriteReq
) {
241 trans_mode
= BaseTLB::Write
;
// NOTE(review): "unexcepted" is a typo for "unexpected" in this runtime
// fatal() message — left as-is here (string is program output).
243 fatal("unexcepted MemCmd\n");
246 tmp_addr
= req
->getVaddr();
// split_addr is the line-aligned address of the access's LAST byte; if it
// is above the start address the access straddles a line boundary.
247 Addr split_addr
= roundDown(tmp_addr
+ size
- 1, block_size
);
249 assert(split_addr
<= tmp_addr
|| split_addr
- tmp_addr
< block_size
);
252 if (split_addr
> tmp_addr
) {
// req1 = lower portion, req2 = upper portion per splitOnVaddr.
253 RequestPtr req1
, req2
;
254 req
->splitOnVaddr(split_addr
, req1
, req2
);
// NOTE(review): pkt1 wraps req2 and pkt2 wraps req1, yet new_pkt1 (built
// from pkt1->req) gets the start of `data` and new_pkt2 gets
// data + req1->getSize(). That pairs the upper-half request with the
// lower-half buffer — looks inverted; verify against splitOnVaddr
// semantics before changing.
257 PacketPtr pkt1
= new Packet(req2
, cmd
);
258 PacketPtr pkt2
= new Packet(req1
, cmd
);
260 functionalTLBAccess(pkt1
, cu_id
, trans_mode
);
261 functionalTLBAccess(pkt2
, cu_id
, trans_mode
);
// Rebuild packets from the (now translated) requests for the actual
// functional memory access.
263 PacketPtr new_pkt1
= new Packet(pkt1
->req
, cmd
);
264 PacketPtr new_pkt2
= new Packet(pkt2
->req
, cmd
);
266 new_pkt1
->dataStatic(data
);
267 new_pkt2
->dataStatic((uint8_t*)data
+ req1
->getSize());
269 if (suppress_func_errors
) {
270 new_pkt1
->setSuppressFuncError();
271 new_pkt2
->setSuppressFuncError();
274 // fixme: this should be cuList[cu_id] if cu_id != n_cu
275 // The latter requires a memPort in the dispatcher
276 cuList
[0]->memPort
[0]->sendFunctional(new_pkt1
);
277 cuList
[0]->memPort
[0]->sendFunctional(new_pkt2
);
// Non-split path: single translate + single functional access.
284 PacketPtr pkt
= new Packet(req
, cmd
);
285 functionalTLBAccess(pkt
, cu_id
, trans_mode
);
286 PacketPtr new_pkt
= new Packet(pkt
->req
, cmd
);
287 new_pkt
->dataStatic(data
);
289 if (suppress_func_errors
) {
290 new_pkt
->setSuppressFuncError();
293 // fixme: this should be cuList[cu_id] if cu_id != n_cu
294 // The latter requires a memPort in the dispatcher
295 cuList
[0]->memPort
[0]->sendFunctional(new_pkt
);
// Fragment (enclosing function signature not visible): scan all CUs and
// branch when any CU is still busy — presumably an all-CUs-done check;
// verify against the full file.
305 for (int i_cu
= 0; i_cu
< n_cu
; ++i_cu
) {
306 if (!cuList
[i_cu
]->isDone()) {
// Schedule a deferred "add x to *val" to be applied once simulated time
// reaches tick_cnt + when (applied by the tick loop). The matching
// sa_x.push_back / sa_n update lines are missing from this view.
315 Shader::ScheduleAdd(uint32_t *val
,Tick when
,int x
)
317 sa_val
.push_back(val
);
318 sa_when
.push_back(tick_cnt
+ when
);
// Tick handler: re-arms tickEvent one shader cycle in the future. The
// surrounding condition/exec call (orig. lines 326-328) is not visible
// in this view.
325 Shader::processTick()
329 schedule(tickEvent
, curTick() + ticks(1));
// Functionally read or write `size` bytes at virtual `address`, chunked
// on cache-line boundaries of CU `cu_id`, advancing the host buffer
// between chunks.
334 Shader::AccessMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
335 MemCmd cmd
, bool suppress_func_errors
)
337 uint8_t *data_buf
= (uint8_t*)ptr
;
// ChunkGenerator yields line-aligned (addr, size) pieces of the range.
339 for (ChunkGenerator
gen(address
, size
, cuList
.at(cu_id
)->cacheLineSize());
340 !gen
.done(); gen
.next()) {
// NOTE(review): the Request is raw-new'd and never deleted in the visible
// code — possible leak; confirm ownership in doFunctionalAccess.
341 RequestPtr req
= new Request(0, gen
.addr(), gen
.size(), 0,
342 cuList
[0]->masterId(), 0, 0, 0);
344 doFunctionalAccess(req
, cmd
, data_buf
, suppress_func_errors
, cu_id
);
345 data_buf
+= gen
.size();
// Convenience wrapper: functional read with functional-error suppression
// disabled.
351 Shader::ReadMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
)
353 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::ReadReq
, false);
// Convenience wrapper: functional read with caller-controlled suppression
// of functional-access errors.
357 Shader::ReadMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
358 bool suppress_func_errors
)
360 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::ReadReq
, suppress_func_errors
);
// Convenience wrapper: functional write with functional-error suppression
// disabled.
364 Shader::WriteMem(uint64_t address
, void *ptr
,uint32_t size
, int cu_id
)
366 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::WriteReq
, false);
// Convenience wrapper: functional write with caller-controlled suppression
// of functional-access errors.
370 Shader::WriteMem(uint64_t address
, void *ptr
, uint32_t size
, int cu_id
,
371 bool suppress_func_errors
)
373 AccessMem(address
, ptr
, size
, cu_id
, MemCmd::WriteReq
,
374 suppress_func_errors
);
// (Original header comment, opening /** not visible in this view.)
378 * Send a packet through the appropriate TLB functional port.
379 * If cu_id=n_cu, then this is the dispatcher's TLB.
380 * Otherwise it's the TLB of the cu_id compute unit.
383 Shader::functionalTLBAccess(PacketPtr pkt
, int cu_id
, BaseTLB::Mode mode
)
385 // update senderState. Need to know the gpuTc and the TLB mode
// Attach translation state (mode + thread context); the assignment's
// left-hand side (pkt->senderState =) is among the lines missing here.
387 new TheISA::GpuTLB::TranslationState(mode
, gpuTc
, false);
// Dispatcher path (the guarding cu_id == n_cu test is not visible here).
390 dispatcher
->tlbPort
->sendFunctional(pkt
);
392 // even when the perLaneTLB flag is turned on
393 // it's ok tp send all accesses through lane 0
394 // since the lane # is not known here,
395 // This isn't important since these are functional accesses.
396 cuList
[cu_id
]->tlbPort
[0]->sendFunctional(pkt
);
399 /* safe_cast the senderState */
400 TheISA::GpuTLB::TranslationState
*sender_state
=
401 safe_cast
<TheISA::GpuTLB::TranslationState
*>(pkt
->senderState
);
// Clean up the translation state attached above so the packet can be
// reused/freed by the caller.
403 delete sender_state
->tlbEntry
;
404 delete pkt
->senderState
;