gpu-compute: AMD's baseline GPU model
[gem5.git] / src / gpu-compute / shader.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Steve Reinhardt
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/HSAIL.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/qstruct.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

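// Build the shader: latch the clock period from the clock domain, copy the
// simulation parameters, and adopt every compute unit handed in through the
// parameter list, checking that each CU's id matches its position in the list.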
Shader::Shader(const Params *p) : SimObject(p),
    clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
    cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
    hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
    separate_acquire_release(p->separate_acquire_release), coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
    box_tick_cnt(0), start_tick_cnt(0)
{
    cuList.resize(n_cu);

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
    }
}

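// Reserve 'length' bytes (rounded up to a whole page) in the host process's
// mmap region on behalf of the GPU and back the range with memory.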
Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    if (X86Linux64::mmapGrowsDown()) {
        DPRINTF(HSAIL, "GROWS DOWN");
        start = gpuTc->getProcessPtr()->mmap_end - length;
        gpuTc->getProcessPtr()->mmap_end = start;
    } else {
        DPRINTF(HSAIL, "GROWS UP");
        start = gpuTc->getProcessPtr()->mmap_end;
        gpuTc->getProcessPtr()->mmap_end += length;

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(gpuTc->getProcessPtr()->mmap_end <
               gpuTc->getProcessPtr()->stack_base -
               gpuTc->getProcessPtr()->max_stack_size);
    }

    DPRINTF(HSAIL, "Shader::mmap start = %#x, %#x\n", start, length);

    gpuTc->getProcessPtr()->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateThreadContext(int tid)
{
    // thread context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(tid);
    assert(gpuTc);
}

void
Shader::hostWakeUp(BaseCPU *cpu)
{
    if (cpuPointer == cpu) {
        if (gpuTc->status() == ThreadContext::Suspended)
            cpu->activateContext(gpuTc->threadId());
    } else {
        // Make sure the dispatcher and the shader are trying to wake up the
        // same host CPU. This is a temporary hack to enable kernel launches
        // from multiple CPUs.
        panic("Dispatcher wants to wakeup a different host");
    }
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

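// Advance the shader by one cycle: apply any scheduled-add updates whose
// trigger time has arrived, then clock every compute unit.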
void
Shader::exec()
{
    tick_cnt = curTick();
    box_tick_cnt = curTick() - start_tick_cnt;

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= tick_cnt) {
            *sa_val[i] += sa_x[i];
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }

    // clock all of the cu's
    for (int i = 0; i < n_cu; ++i)
        cuList[i]->exec();
}

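// Try to hand out workgroups of the ND-range to the compute units in
// round-robin order, starting from nextSchedCu. Returns true if at least one
// workgroup was dispatched by this call.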
bool
Shader::dispatch_workgroups(NDRange *ndr)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        // every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch a workgroup iff the following two conditions are met:
        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
        // (b) cuList[curCu] has enough free slots for this workgroup
        if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);

            // ticks() member function translates cycles to simulation ticks.
            if (!tickEvent.scheduled()) {
                schedule(tickEvent, curTick() + this->ticks(1));
            }

            cuList[curCu]->StartWorkgroup(ndr);
            ndr->wgId[0]++;
            ndr->globalWgId++;
            if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
                ndr->wgId[0] = 0;
                ndr->wgId[1]++;

                if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
                    ndr->wgId[1] = 0;
                    ndr->wgId[2]++;

                    if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
                        ndr->wg_disp_rem = false;
                        break;
                    }
                }
            }
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::handshake(GpuDispatcher *_dispatcher)
{
    dispatcher = _dispatcher;
}

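// Perform one functional (timing-free) memory access on behalf of the GPU.
// The virtual address is translated through the functional TLB port first;
// if the access straddles a cache-block boundary it is split into two
// packets, one per block.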
void
Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    unsigned block_size = RubySystem::getBlockSizeBytes();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access: split into two requests. req1 covers the lower
    // portion of the data buffer, req2 the remainder above split_addr.
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt1);
        cuList[0]->memPort[0]->sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0]->sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

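// The shader is busy as long as any of its compute units still has work in
// flight.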
bool
Shader::busy()
{
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        if (!cuList[i_cu]->isDone()) {
            return true;
        }
    }

    return false;
}

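// Register a deferred update: add 'x' to '*val' once 'when' ticks have
// elapsed relative to the current shader tick. Pending adds are applied in
// Shader::exec().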
void
Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
{
    sa_val.push_back(val);
    sa_when.push_back(tick_cnt + when);
    sa_x.push_back(x);
    ++sa_n;
}

Shader::TickEvent::TickEvent(Shader *_shader)
    : Event(CPU_Tick_Pri), shader(_shader)
{
}

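// On each tick, execute one shader cycle and reschedule the event as long as
// any compute unit is still busy.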
void
Shader::TickEvent::process()
{
    if (shader->busy()) {
        shader->exec();
        shader->schedule(this, curTick() + shader->ticks(1));
    }
}

const char*
Shader::TickEvent::description() const
{
    return "Shader tick";
}

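// Break an arbitrarily sized access into cache-block-sized chunks and issue
// each chunk as a separate functional request.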
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
         !gen.done(); gen.next()) {
        Request *req = new Request(0, gen.addr(), gen.size(), 0,
                                   cuList[0]->masterId(), 0, 0, 0);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
        delete req;
    }
}

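// ReadMem and WriteMem are thin convenience wrappers around AccessMem that
// fix the memory command and, optionally, suppress functional-access errors.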
void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id == n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    if (cu_id == n_cu) {
        dispatcher->tlbPort->sendFunctional(pkt);
    } else {
        // Even when the perLaneTLB flag is turned on it is ok to send all
        // accesses through lane 0, since the lane number is not known here.
        // This does not matter because these are functional accesses.
        cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
    }

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}