gpu-compute: Dropping fetches when no entry is reserved in the buffer
[gem5.git] / src / gpu-compute / lds_state.cc
/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/lds_state.hh"

#include <array>
#include <cstdio>
#include <cstdlib>

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"

/**
 * the default constructor that works with SWIG
 */
LdsState::LdsState(const Params *params) :
    ClockedObject(params),
    tickEvent(this),
    cuPort(name() + ".port", this),
    maximumSize(params->size),
    range(params->range),
    bankConflictPenalty(params->bankConflictPenalty),
    banks(params->banks)
{
    fatal_if(params->banks <= 0,
             "Number of LDS banks should be a positive number");
    fatal_if((params->banks & (params->banks - 1)) != 0,
             "Number of LDS banks should be a power of 2");
    fatal_if(params->size <= 0,
             "cannot allocate an LDS with a size less than 1");
    fatal_if(params->size % 2,
             "the LDS size should be an even number");
}

/**
 * Needed by the SWIG compiler
 */
LdsState *
LdsStateParams::create()
{
    return new LdsState(this);
}

/**
 * set the parent and name based on the parent
 */
void
LdsState::setParent(ComputeUnit *x_parent)
{
    // the parent must be valid and should only be set once
    fatal_if(!x_parent, "x_parent should not be nullptr");
    fatal_if(x_parent == parent,
             "should not be setting the parent twice");

    parent = x_parent;
    _name = x_parent->name() + ".LdsState";
}

/**
 * recover the GPU dynamic instruction from the packet's sender state chain,
 * then count the bank conflicts it causes
 */
unsigned
LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
{
    Packet::SenderState *baseSenderState = packet->senderState;
    while (baseSenderState->predecessor) {
        baseSenderState = baseSenderState->predecessor;
    }
    const ComputeUnit::LDSPort::SenderState *senderState =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);

    fatal_if(!senderState,
             "did not get the right sort of sender state");

    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

    return countBankConflicts(gpuDynInst, bankAccesses);
}

// Count the total number of bank conflicts for the local memory packet
unsigned
LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
                             unsigned *numBankAccesses)
{
    int bank_conflicts = 0;
    std::vector<int> bank;
    // the number of LDS banks being touched by the memory instruction
    int numBanks = std::min(parent->wfSize(), banks);
    // if the wavefront size is larger than the number of LDS banks, we
    // need to iterate over all work items to calculate the total
    // number of bank conflicts
    int groups = (parent->wfSize() > numBanks) ?
        (parent->wfSize() / numBanks) : 1;
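    // for example, a 64-lane wavefront accessing 32 LDS banks would be
    // processed as two groups of 32 lanes (illustrative sizes, not defaults)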
    for (int i = 0; i < groups; i++) {
        // Address Array holding all the work item addresses of an instruction
        std::vector<Addr> addr_array;
        addr_array.resize(numBanks, 0);
        bank.clear();
        bank.resize(banks, 0);
        int max_bank = 0;

        // populate the address array for all active work items
        for (int j = 0; j < numBanks; j++) {
            if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
                addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
            } else {
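                // inactive lanes get a sentinel address (max Addr) so they
                // are ignored by the conflict calculation below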
                addr_array[j] = std::numeric_limits<Addr>::max();
            }
        }

        if (gpuDynInst->isLoad() || gpuDynInst->isStore()) {
            // mask identical addresses
            for (int j = 0; j < numBanks; ++j) {
                for (int j0 = 0; j0 < j; j0++) {
                    if (addr_array[j] != std::numeric_limits<Addr>::max()
                            && addr_array[j] == addr_array[j0]) {
                        addr_array[j] = std::numeric_limits<Addr>::max();
                    }
                }
            }
        }
        // calculate bank conflicts
        for (int j = 0; j < numBanks; ++j) {
            if (addr_array[j] != std::numeric_limits<Addr>::max()) {
                int bankId = addr_array[j] % banks;
                bank[bankId]++;
                max_bank = std::max(max_bank, bank[bankId]);
                // Count the number of LDS banks accessed.
                // Since we have masked identical addresses all remaining
                // accesses will need to be serialized if they access
                // the same bank (bank conflict).
                (*numBankAccesses)++;
            }
        }
        bank_conflicts += max_bank;
    }
    panic_if(bank_conflicts > parent->wfSize(),
             "bank conflicts should not exceed the number of work items "
             "per instruction");
    return bank_conflicts;
}

/**
 * receive the packet from the CU
 */
bool
LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
{
    return ownerLds->processPacket(packet);
}

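/**
 * extract the GPU dynamic instruction carried in the packet's sender state
 */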
GPUDynInstPtr
LdsState::getDynInstr(PacketPtr packet)
{
    ComputeUnit::LDSPort::SenderState *ss =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
            packet->senderState);
    return ss->getMemInst();
}

/**
 * process an incoming packet, add it to the return queue
 */
bool
LdsState::processPacket(PacketPtr packet)
{
    unsigned bankAccesses = 0;
    // the number of conflicts this packet will have when accessing the LDS
    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
    // count the total number of physical LDS banks accessed
    parent->ldsBankAccesses += bankAccesses;
    // record the bank conflict distribution; a bankConflicts value of 1
    // means at most one access per bank, i.e., no conflicts, hence the -1
    parent->ldsBankConflictDist.sample(bankConflicts-1);

    GPUDynInstPtr dynInst = getDynInstr(packet);
    // account for the LDS bank conflict overhead
    int busLength = (dynInst->isLoad()) ? parent->loadBusLength() :
        (dynInst->isStore()) ? parent->storeBusLength() :
        parent->loadBusLength();
    // delay for accessing the LDS
    Tick processingTime =
        parent->cyclesToTicks(Cycles(bankConflicts * bankConflictPenalty)) +
        parent->cyclesToTicks(Cycles(busLength));
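    // as a sketch of this cost model: assuming a 2-cycle bankConflictPenalty
    // and a 4-cycle bus length, a packet with 3 bank conflicts would incur
    // 3 * 2 + 4 = 10 cycles of latency (illustrative values, not defaults)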
    // the return time is either (last packet in queue + delay) or
    // (now + delay), as determined by earliestReturnTime()
    Tick doneAt = earliestReturnTime() + processingTime;
    // then store it for processing
    return returnQueuePush(std::make_pair(doneAt, packet));
}

/**
 * add this to the queue of packets to be returned
 */
bool
LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
{
    // TODO add time limits (e.g. one packet per cycle) and queue size limits
    // and implement flow control
    returnQueue.push(thePair);

    // if no wakeup is scheduled yet, process the queue now
    if (!tickEvent.scheduled()) {
        process();
    }

    return true;
}

/**
 * receive a packet in functional mode
 */
void
LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("not implemented");
}

/**
 * receive a retry for a response
 */
void
LdsState::CuSidePort::recvRespRetry()
{
    // TODO verify that this is the right way to do this
    assert(ownerLds->isRetryResp());
    ownerLds->setRetryResp(false);
    ownerLds->process();
}

/**
 * receive a retry
 */
void
LdsState::CuSidePort::recvRetry()
{
    fatal("not implemented");
}

/**
 * look for packets to return at this time
 */
bool
LdsState::process()
{
    Tick now = clockEdge();

    // send back completed packets
    while (!returnQueue.empty() && returnQueue.front().first <= now) {
        PacketPtr packet = returnQueue.front().second;

        ComputeUnit::LDSPort::SenderState *ss =
            dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
                packet->senderState);

        GPUDynInstPtr gpuDynInst = ss->getMemInst();

        gpuDynInst->initiateAcc(gpuDynInst);

        packet->makeTimingResponse();

        returnQueue.pop();

        bool success = cuPort.sendTimingResp(packet);

        if (!success) {
            retryResp = true;
            panic("have not handled timing responses being NACK'd when sent "
                  "back");
        }
    }

    // determine the next wakeup time
    if (!returnQueue.empty()) {
        Tick next = returnQueue.front().first;

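        // a wakeup may already be pending for a later packet; pull it
        // forward if the head of the queue is due sooner, otherwise
        // schedule a fresh wakeup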
        if (tickEvent.scheduled()) {
            if (next < tickEvent.when()) {
                tickEvent.deschedule();
                tickEvent.schedule(next);
            }
        } else {
            tickEvent.schedule(next);
        }
    }

    return true;
}

/**
 * wake up at this time and perform specified actions
 */
void
LdsState::TickEvent::process()
{
    ldsState->process();
}