--- /dev/null
+/*
+ * Copyright (c) 2018 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Matt Sinclair
+ */
+
+#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__
+#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__
+
+#include "arch/gcn3/insts/gpu_static_inst.hh"
+#include "arch/gcn3/insts/op_encodings.hh"
+#include "debug/GPUMem.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+
+/**
+ * Helper function for the vector memory instructions declared in
+ * op_encodings. It builds the memory request(s) for a given instruction,
+ * then submits one request per active lane, or two requests per lane when
+ * the access is misaligned (i.e., it straddles a cache-line boundary). A
+ * usage sketch follows the function body below.
+ */
+template<typename T, int N>
+inline void
+initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
+ bool is_atomic=false)
+{
+ // local variables
+ int req_size = N * sizeof(T);
+ int block_size = gpuDynInst->computeUnit()->cacheLineSize();
+ Addr vaddr = 0, split_addr = 0;
+ bool misaligned_acc = false;
+ RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
+ PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
+
+ gpuDynInst->resetEntireStatusVector();
+ for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {
+ if (gpuDynInst->exec_mask[lane]) {
+ vaddr = gpuDynInst->addr[lane];
+
+ /**
+ * the base address of the cache line where the last byte of the
+ * request will be stored.
+ */
+ split_addr = roundDown(vaddr + req_size - 1, block_size);
+
+ assert(split_addr <= vaddr || split_addr - vaddr < block_size);
+ /**
+ * if the cache line containing the last byte starts at a higher
+ * address than the first byte (vaddr), the access spans two cache
+ * lines and is misaligned.
+ */
+ misaligned_acc = split_addr > vaddr;
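+ // Illustrative example (not from the original patch): with a 64B
+ // cache line, vaddr = 0x3c and req_size = 8 put the last byte at
+ // 0x43, so split_addr = roundDown(0x43, 64) = 0x40 > 0x3c and the
+ // access is split into two 4-byte requests.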
+
+ if (is_atomic) {
+ req = std::make_shared<Request>(vaddr, sizeof(T), 0,
+ gpuDynInst->computeUnit()->masterId(), 0,
+ gpuDynInst->wfDynId,
+ gpuDynInst->makeAtomicOpFunctor<T>(
+ &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
+ &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
+ } else {
+ req = std::make_shared<Request>(vaddr, req_size, 0,
+ gpuDynInst->computeUnit()->masterId(), 0,
+ gpuDynInst->wfDynId);
+ }
+
+ if (misaligned_acc) {
+ gpuDynInst->setStatusVector(lane, 2);
+ req->splitOnVaddr(split_addr, req1, req2);
+ gpuDynInst->setRequestFlags(req1);
+ gpuDynInst->setRequestFlags(req2);
+ pkt1 = new Packet(req1, mem_req_type);
+ pkt2 = new Packet(req2, mem_req_type);
+ pkt1->dataStatic(&(reinterpret_cast<T*>(
+ gpuDynInst->d_data))[lane * N]);
+ // req1->getSize() is in bytes, so offset the second packet's data
+ // pointer at byte granularity within this lane's slot of d_data
+ pkt2->dataStatic(reinterpret_cast<uint8_t*>(gpuDynInst->d_data) +
+ lane * N * sizeof(T) + req1->getSize());
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
+ "request for %#x\n", gpuDynInst->cu_id,
+ gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
+ split_addr);
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
+ } else {
+ gpuDynInst->setStatusVector(lane, 1);
+ gpuDynInst->setRequestFlags(req);
+ pkt = new Packet(req, mem_req_type);
+ pkt->dataStatic(&(reinterpret_cast<T*>(
+ gpuDynInst->d_data))[lane * N]);
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
+ }
+ } else { // if lane is not active, then no pending requests
+ gpuDynInst->setStatusVector(lane, 0);
+ }
+ }
+}
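+
+/**
+ * Usage sketch (illustrative only; it mirrors the op_encodings call sites
+ * updated later in this patch): a multi-dword vector load issues
+ *
+ *     initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
+ *
+ * while an atomic issues
+ *
+ *     initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
+ */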
+
+/**
+ * Helper function for the scalar memory instructions declared in
+ * op_encodings. It builds the memory request for the wavefront's scalar
+ * access, then submits one request, or two requests when the access is
+ * misaligned (i.e., it straddles a cache-line boundary). A usage sketch
+ * follows the function body below.
+ */
+template<typename T, int N>
+inline void
+initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
+{
+ int req_size = N * sizeof(T);
+ int block_size = gpuDynInst->computeUnit()->cacheLineSize();
+ Addr vaddr = gpuDynInst->scalarAddr;
+
+ /**
+ * the base address of the cache line where the last byte of the
+ * request will be stored.
+ */
+ Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
+
+ assert(split_addr <= vaddr || split_addr - vaddr < block_size);
+ /**
+ * if the cache line containing the last byte starts at a higher
+ * address than the first byte (vaddr), the access spans two cache
+ * lines and is misaligned.
+ */
+ bool misaligned_acc = split_addr > vaddr;
+
+ RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
+ gpuDynInst->computeUnit()->masterId(), 0,
+ gpuDynInst->wfDynId);
+
+ if (misaligned_acc) {
+ RequestPtr req1, req2;
+ req->splitOnVaddr(split_addr, req1, req2);
+ gpuDynInst->numScalarReqs = 2;
+ gpuDynInst->setRequestFlags(req1);
+ gpuDynInst->setRequestFlags(req2);
+ PacketPtr pkt1 = new Packet(req1, mem_req_type);
+ PacketPtr pkt2 = new Packet(req2, mem_req_type);
+ pkt1->dataStatic(gpuDynInst->scalar_data);
+ pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
+ " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, split_addr);
+ gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
+ gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
+ } else {
+ gpuDynInst->numScalarReqs = 1;
+ gpuDynInst->setRequestFlags(req);
+ PacketPtr pkt = new Packet(req, mem_req_type);
+ pkt->dataStatic(gpuDynInst->scalar_data);
+ gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
+ }
+}
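+
+/**
+ * Usage sketch (illustrative only; it mirrors the SMEM call sites updated
+ * later in this patch):
+ *
+ *     initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst, MemCmd::ReadReq);
+ */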
+
+#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__
#define __ARCH_GCN3_INSTS_OP_ENCODINGS_HH__
#include "arch/gcn3/gpu_decoder.hh"
+#include "arch/gcn3/gpu_mem_helpers.hh"
#include "arch/gcn3/insts/gpu_static_inst.hh"
#include "arch/gcn3/operand.hh"
#include "debug/GPUExec.hh"
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- int block_size = gpuDynInst->computeUnit()->cacheLineSize();
- int req_size = N * sizeof(ScalarRegU32);
- Addr vaddr = gpuDynInst->scalarAddr;
-
- /**
- * the base address of the cache line where the the last byte of
- * the request will be stored.
- */
- Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
-
- assert(split_addr <= vaddr || split_addr - vaddr < block_size);
- /**
- * if the base cache line address of the last byte is greater
- * than the address of the first byte then we have a misaligned
- * access.
- */
- bool misaligned_acc = split_addr > vaddr;
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- if (misaligned_acc) {
- RequestPtr req1, req2;
- req->splitOnVaddr(split_addr, req1, req2);
- gpuDynInst->numScalarReqs = 2;
- gpuDynInst->setRequestFlags(req1);
- gpuDynInst->setRequestFlags(req2);
- PacketPtr pkt1 = new Packet(req1, MemCmd::ReadReq);
- PacketPtr pkt2 = new Packet(req2, MemCmd::ReadReq);
- pkt1->dataStatic(gpuDynInst->scalar_data);
- pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
- } else {
- gpuDynInst->numScalarReqs = 1;
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(gpuDynInst->scalar_data);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
- }
+ initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
+ MemCmd::ReadReq);
}
/**
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- int block_size = gpuDynInst->computeUnit()->cacheLineSize();
- int req_size = N * sizeof(ScalarRegU32);
- Addr vaddr = gpuDynInst->scalarAddr;
-
- /**
- * the base address of the cache line where the the last byte of
- * the request will be stored.
- */
- Addr split_addr = roundDown(vaddr + req_size - 1, block_size);
-
- assert(split_addr <= vaddr || split_addr - vaddr < block_size);
- /**
- * if the base cache line address of the last byte is greater
- * than the address of the first byte then we have a misaligned
- * access.
- */
- bool misaligned_acc = split_addr > vaddr;
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- if (misaligned_acc) {
- RequestPtr req1, req2;
- req->splitOnVaddr(split_addr, req1, req2);
- gpuDynInst->numScalarReqs = 2;
- gpuDynInst->setRequestFlags(req1);
- gpuDynInst->setRequestFlags(req2);
- PacketPtr pkt1 = new Packet(req1, MemCmd::WriteReq);
- PacketPtr pkt2 = new Packet(req2, MemCmd::WriteReq);
- pkt1->dataStatic(gpuDynInst->scalar_data);
- pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
- } else {
- gpuDynInst->numScalarReqs = 1;
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(gpuDynInst->scalar_data);
- gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
- }
+ initMemReqScalarHelper<ScalarRegU32, N>(gpuDynInst,
+ MemCmd::WriteReq);
}
void
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
-
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
-
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(),
- 0, gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
}
void
injectGlobalMemFence(GPUDynInstPtr gpuDynInst)
{
// create request and set flags
- gpuDynInst->statusBitVector = VectorMask(1);
+ gpuDynInst->resetEntireStatusVector();
+ gpuDynInst->setStatusVector(0, 1);
RequestPtr req = std::make_shared<Request>(0, 0, 0,
gpuDynInst->computeUnit()->
masterId(), 0,
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
- gpuDynInst->computeUnit()
- ->sendRequest(gpuDynInst, lane, pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::ReadReq);
}
template<int N>
void
initMemRead(GPUDynInstPtr gpuDynInst)
{
- int req_size = N * sizeof(VecElemU32);
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size,
- 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
- pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
- gpuDynInst->d_data))[lane * N]);
- gpuDynInst->computeUnit()
- ->sendRequest(gpuDynInst, lane, pkt);
- }
- }
+ initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::ReadReq);
}
template<typename T>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(),
- 0, gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::WriteReq);
}
template<int N>
void
initMemWrite(GPUDynInstPtr gpuDynInst)
{
- int req_size = N * sizeof(VecElemU32);
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr, req_size,
- 0,
- gpuDynInst->computeUnit()->masterId(),
- 0, gpuDynInst->wfDynId);
-
- gpuDynInst->setRequestFlags(req);
- PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
- pkt->dataStatic(&(reinterpret_cast<VecElemU32*>(
- gpuDynInst->d_data))[lane * N]);
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<VecElemU32, N>(gpuDynInst, MemCmd::WriteReq);
}
template<typename T>
void
initAtomicAccess(GPUDynInstPtr gpuDynInst)
{
- gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
-
- for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
- if (gpuDynInst->exec_mask[lane]) {
- Addr vaddr = gpuDynInst->addr[lane];
-
- RequestPtr req = std::make_shared<Request>(vaddr,
- sizeof(T), 0,
- gpuDynInst->computeUnit()->masterId(), 0,
- gpuDynInst->wfDynId,
- gpuDynInst->makeAtomicOpFunctor<T>(
- &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
- &(reinterpret_cast<T*>(
- gpuDynInst->x_data))[lane]));
-
- gpuDynInst->setRequestFlags(req);
-
- PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
- pkt->dataStatic(&(reinterpret_cast<T*>(
- gpuDynInst->d_data))[lane]);
-
- gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
- pkt);
- }
- }
+ initMemReqHelper<T, 1>(gpuDynInst, MemCmd::SwapReq, true);
}
void
gpuDynInst->wfSlotId, gpuDynInst->wfDynId,
gpuDynInst->disassemble(), w->outstandingReqs,
w->outstandingReqs - 1);
- if (gpuDynInst->statusBitVector.none()) {
+ if (gpuDynInst->allLanesZero()) {
// ask gm pipe to decrement request counters, instead of directly
// performing here, to avoid asynchronous counter update and
// instruction retirement (which may hurt waincnt effects)
gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
gpuDynInst->tlbHitLevel[index] = hit_level;
-
// translation is done. Schedule the mem_req_event at the
// appropriate cycle to send the timing memory request to ruby
EventFunctionWrapper *mem_req_event =
}
} else {
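+ // a memory-sync packet carries no per-lane data, so clear the
+ // entire status vector; any other packet retires one of this
+ // lane's pending requests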
if (pkt->cmd == MemCmd::MemSyncReq) {
- gpuDynInst->statusBitVector = VectorMask(0);
+ gpuDynInst->resetEntireStatusVector();
} else {
- gpuDynInst->statusBitVector &= (~(1ll << index));
+ gpuDynInst->decrementStatusVector(index);
}
// New SenderState for the memory access
gpuDynInst->memStatusVector[paddr].pop_back();
gpuDynInst->pAddr = pkt->req->getPaddr();
- gpuDynInst->statusBitVector &= (~(1ULL << index));
-
- DPRINTF(GPUMem, "bitvector is now %#x\n",
- gpuDynInst->statusBitVector);
+ gpuDynInst->decrementStatusVector(index);
+ DPRINTF(GPUMem, "bitvector is now %s\n",
+ gpuDynInst->printStatusVector());
- if (gpuDynInst->statusBitVector == VectorMask(0)) {
+ if (gpuDynInst->allLanesZero()) {
auto iter = gpuDynInst->memStatusVector.begin();
auto end = gpuDynInst->memStatusVector.end();
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *static_inst, InstSeqNum instSeqNum)
: GPUExecContext(_cu, _wf), scalarAddr(0), addr(computeUnit()->wfSize(),
- (Addr)0), statusBitVector(0), numScalarReqs(0), isSaveRestore(false),
+ (Addr)0), numScalarReqs(0), isSaveRestore(false),
_staticInst(static_inst), _seqNum(instSeqNum)
{
+ statusVector.assign(TheGpuISA::NumVecElemPerVecReg, 0);
tlbHitLevel.assign(computeUnit()->wfSize(), -1);
// vector instructions can have up to 4 source/destination operands
d_data = new uint8_t[computeUnit()->wfSize() * 4 * sizeof(double)];
#include "base/amo.hh"
#include "base/logging.hh"
+#include "base/trace.hh"
+#include "debug/GPUMem.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"
}
}
+ // reset the number of pending memory requests for all lanes
+ void
+ resetEntireStatusVector()
+ {
+ assert(statusVector.size() == TheGpuISA::NumVecElemPerVecReg);
+ for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+ resetStatusVector(lane);
+ }
+ }
+
+ // reset the number of pending memory requests for the given lane
+ void
+ resetStatusVector(int lane)
+ {
+ setStatusVector(lane, 0);
+ }
+
+ // set the number of pending memory requests for the given lane
+ void
+ setStatusVector(int lane, int newVal)
+ {
+ // currently we can have up to 2 memory requests per lane (if the
+ // lane's request goes across multiple cache lines)
+ assert((newVal >= 0) && (newVal <= 2));
+ statusVector[lane] = newVal;
+ }
+
+ // decrement the number of pending memory requests for the given lane
+ // by 1
+ void
+ decrementStatusVector(int lane)
+ {
+ // this lane may have multiple requests, so only subtract one for
+ // this request
+ assert(statusVector[lane] >= 1);
+ statusVector[lane]--;
+ }
+
+ // return the current number of pending memory requests for the given
+ // lane
+ int
+ getLaneStatus(int lane) const
+ {
+ return statusVector[lane];
+ }
+
+ // returns true if all memory requests from all lanes have been received,
+ // else returns false
+ bool
+ allLanesZero() const
+ {
+ // local variables
+ bool allZero = true;
+
+ // iterate over all lanes, checking the number of pending memory
+ // requests they have
+ for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+ // if any lane still has pending requests, return false
+ if (statusVector[lane] > 0) {
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: lane: %d has %d pending "
+ "request(s) for %#x\n", cu_id, simdId, wfSlotId, lane,
+ statusVector[lane], addr[lane]);
+ allZero = false;
+ }
+ }
+
+ if (allZero) {
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: all lanes have no pending"
+ " requests for %#x\n", cu_id, simdId, wfSlotId, addr[0]);
+ }
+ return allZero;
+ }
+
+ // returns a string representing the current state of the statusVector
+ std::string
+ printStatusVector() const
+ {
+ std::string statusVec_str = "[";
+
+ // iterate over all lanes, adding the current number of pending
+ // requests for this lane to the string
+ for (int lane = 0; lane < TheGpuISA::NumVecElemPerVecReg; ++lane) {
+ statusVec_str += std::to_string(statusVector[lane]);
+ }
+ statusVec_str += "]";
+
+ return statusVec_str;
+ }
+
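+ // Typical lifecycle of statusVector (illustrative): initMemReqHelper
+ // sets a lane's entry to 1 (aligned) or 2 (split across cache lines),
+ // each returned packet decrements it via decrementStatusVector(), and
+ // allLanesZero() reports when all of the instruction's memory requests
+ // have completed.
+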
// Map returned packets and the addresses they satisfy with which lane they
// were requested from
typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
StatusVector memStatusVector;
- // Track the status of memory requests per lane, a bit per lane
- VectorMask statusBitVector;
+ // Track the number of pending memory requests per lane; an int per lane
+ // (rather than a single bit) is needed because a misaligned access can
+ // generate two requests for a single lane
+ std::vector<int> statusVector;
// for ld_v# or st_v#
std::vector<int> tlbHitLevel;
void
DataBlock::setData(const uint8_t *data, int offset, int len)
{
- assert(offset + len <= RubySystem::getBlockSizeBytes());
memcpy(&m_data[offset], data, len);
}
curTick() + rs->clockPeriod());
return true;
}
-
- assert(getOffset(pkt->getAddr()) + pkt->getSize() <=
- RubySystem::getBlockSizeBytes());
}
// Save the port in the sender state object to be used later to