#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
#define __ARCH_HSAIL_INSTS_MEM_HH__
+#include <type_traits>
+
#include "arch/hsail/insts/decl.hh"
#include "arch/hsail/insts/gpu_static_inst.hh"
#include "arch/hsail/operand.hh"
+#include "gpu-compute/compute_unit.hh"
namespace HsailISA
{
gpuDynInst->updateStats();
}
+ void
+ completeAcc(GPUDynInstPtr gpuDynInst) override
+ {
+ typedef typename MemDataType::CType c1;
+
+ constexpr bool is_vt_32 = DestDataType::vgprType == VT_32;
+
+ /**
+ * completes a load: for every destination operand, the value
+ * of each lane that is set in exec_mask is copied from the
+ * d_data buffer into the destination VGPR. afterwards the VRF
+ * write is scheduled and the value returned by the VRF's
+ * exec() is added to the global or local memory pipeline.
+ *
+ * this code essentially replaces the long if-else chain
+ * that was used in GlobalMemPipeline::exec() to infer the
+ * size (single/double) and type (floating point/integer) of
+ * the destination register. this is needed for load
+ * instructions because the loaded value and the
+ * destination type can be of different sizes, and we also
+ * need to know if the value we're writing back is floating
+ * point and signed/unsigned, so we can properly cast the
+ * writeback value
+ *
+ * c1 is the element type used to read d_data; c0 is the type
+ * handed to the VRF write and to remap(). c0 is selected as:
+ * - VT_32 destination: float if c1 is floating point,
+ * otherwise int32_t if c1 is signed, else uint32_t
+ * - any other destination: double / int64_t / uint64_t,
+ * chosen by the same rule
+ *
+ * NOTE(review): std::is_floating_point and std::is_signed are
+ * only true for builtin arithmetic types. if any
+ * MemDataType::CType is a class type (e.g. a wrapper used for
+ * 16-bit floats), c0 resolves to an unsigned integer type
+ * rather than float/double - confirm this is intended.
+ */
+ typedef typename std::conditional<is_vt_32,
+ typename std::conditional<std::is_floating_point<c1>::value,
+ float, typename std::conditional<std::is_signed<c1>::value,
+ int32_t, uint32_t>::type>::type,
+ typename std::conditional<std::is_floating_point<c1>::value,
+ double, typename std::conditional<std::is_signed<c1>::value,
+ int64_t, uint64_t>::type>::type>::type c0;
+
+
+ Wavefront *w = gpuDynInst->wavefront();
+
+ std::vector<uint32_t> regVec;
+ // iterate over number of destination register operands since
+ // this is a load
+ for (int k = 0; k < num_dest_operands; ++k) {
+ assert((sizeof(c1) * num_dest_operands)
+ <= MAX_WIDTH_FOR_MEM_INST); // does not depend on k
+
+ int dst = this->dest.regIndex() + k;
+ if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ dst = dest_vect[k].regIndex(); // take the index from dest_vect instead of dest + k
+ // virtual->physical VGPR mapping
+ int physVgpr = w->remap(dst, sizeof(c0), 1);
+ // save the physical VGPR index for the VRF exec() call below
+ regVec.push_back(physVgpr);
+
+ c1 *p1 = // operand k's lane values start k * wfSize() elements in
+ &((c1*)gpuDynInst->d_data)[k * w->computeUnit->wfSize()];
+
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
+ if (gpuDynInst->exec_mask[i]) {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+ "$%s%d <- %d global ld done (src = wavefront "
+ "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
+ dst, *p1); // NOTE(review): text says "global ld" but this path also handles local memory - confirm wording
+ // write the value into the physical VGPR. This is a
+ // purely functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+ *p1, i);
+ }
+ ++p1; // advanced for every lane, masked or not, so p1 stays in step with i
+ }
+ }
+
+ // Schedule the write operation of the load data on the VRF.
+ // This simply models the timing aspect of the VRF write operation.
+ // It does not modify the physical VGPR.
+ int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
+ vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
+ sizeof(c0), gpuDynInst->time);
+
+ if (this->isGlobalMem()) { // add the cycles to the matching memory pipeline's counter
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+ } else {
+ assert(this->isLocalMem());
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+ }
+ }
+
private:
void
execLdAcq(GPUDynInstPtr gpuDynInst) override
execSt(gpuDynInst);
}
+ // stores don't write anything back, so there is nothing
+ // to do here. we only override this method, with an empty body,
+ // to avoid the fatal in the base class implementation
+ void completeAcc(GPUDynInstPtr gpuDynInst) override { }
+
private:
// execSt may be called through a continuation
// if the store had release semantics. see comment for
}
+ void
+ completeAcc(GPUDynInstPtr gpuDynInst) override
+ {
+ // completes an atomic: only an atomic return op writes a value
+ // back to the VRF; for any other atomic there is nothing to do.
+ if (this->isAtomicRet()) {
+ // the size of the src operands and the memory being operated
+ // on must match for HSAIL atomics, so a single type (CType) is
+ // used both to read d_data and to write the register - this
+ // assumption may not apply to all ISAs
+ typedef typename MemDataType::CType CType;
+
+ Wavefront *w = gpuDynInst->wavefront();
+ int dst = this->dest.regIndex();
+ std::vector<uint32_t> regVec;
+ // virtual->physical VGPR mapping
+ int physVgpr = w->remap(dst, sizeof(CType), 1);
+ regVec.push_back(physVgpr);
+ CType *p1 = &((CType*)gpuDynInst->d_data)[0]; // one element per lane, starting at lane 0
+
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
+ if (gpuDynInst->exec_mask[i]) {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+ "$%s%d <- %d global ld done (src = wavefront "
+ "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, i, sizeof(CType) == 4 ? "s" : "d",
+ dst, *p1); // NOTE(review): text says "global ld" but this is the atomic path and may be local memory - confirm wording
+ // write the value into the physical VGPR. This is a
+ // purely functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write<CType>(physVgpr, *p1, i);
+ }
+ ++p1; // advanced for every lane, masked or not, so p1 stays in step with i
+ }
+
+ // Schedule the write operation of the returned data on the VRF.
+ // This simply models the timing aspect of the VRF write operation.
+ // It does not modify the physical VGPR.
+ int loadVrfBankConflictCycles = gpuDynInst->computeUnit()->
+ vrf[w->simdId]->exec(gpuDynInst->seqNum(), w, regVec,
+ sizeof(CType), gpuDynInst->time);
+
+ if (this->isGlobalMem()) { // add the cycles to the matching memory pipeline's counter
+ gpuDynInst->computeUnit()->globalMemoryPipe
+ .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+ } else {
+ assert(this->isLocalMem());
+ gpuDynInst->computeUnit()->localMemoryPipe
+ .incLoadVRFBankConflictCycles(loadVrfBankConflictCycles);
+ }
+ }
+ }
+
void execute(GPUDynInstPtr gpuDynInst) override;
private:
!gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
bool accessVrf = true;
+ Wavefront *w = nullptr;
+
// check the VRF to see if the operands of a load (or load component
// of an atomic) are accessible
if ((m) && (m->isLoad() || m->isAtomicRet())) {
- Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+ w = m->wavefront();
accessVrf =
- w->computeUnit->vrf[m->simdId]->
+ w->computeUnit->vrf[w->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m,
VrfAccessType::WRITE);
}
(computeUnit->shader->coissue_return ||
computeUnit->wfWait.at(m->pipeId).rdy())) {
- if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
- doGmReturn<uint32_t, uint8_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
- doGmReturn<uint32_t, uint16_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
- doGmReturn<uint32_t, uint32_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
- doGmReturn<int32_t, int8_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
- doGmReturn<int32_t, int16_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
- doGmReturn<int32_t, int32_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
- doGmReturn<float, Float16>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
- doGmReturn<float, float>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
- doGmReturn<uint64_t, uint8_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
- doGmReturn<uint64_t, uint16_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
- doGmReturn<uint64_t, uint32_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
- doGmReturn<uint64_t, uint64_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
- doGmReturn<int64_t, int8_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
- doGmReturn<int64_t, int16_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
- doGmReturn<int64_t, int32_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
- doGmReturn<int64_t, int64_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
- doGmReturn<double, Float16>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
- doGmReturn<double, float>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
- doGmReturn<double, double>(m);
+ w = m->wavefront();
+
+ m->completeAcc(m);
+
+ if (m->isLoad() || m->isAtomic()) {
+ gmReturnedLoads.pop();
+ assert(inflightLoads > 0);
+ --inflightLoads;
+ } else {
+ assert(m->isStore());
+ gmReturnedStores.pop();
+ assert(inflightStores > 0);
+ --inflightStores;
+ }
+
+ // Decrement outstanding request count
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+ if (m->isStore() || m->isAtomic()) {
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
+ m->time, -1);
+ }
+
+ if (m->isLoad() || m->isAtomic()) {
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
+ m->time, -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->glbMemToVrfBus.set(m->time);
+ if (!computeUnit->shader->coissue_return)
+ w->computeUnit->wfWait.at(m->pipeId).set(m->time);
}
// If pipeline has executed a global memory instruction
}
}
-template<typename c0, typename c1>
-void
-GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
-{
- Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
-
- // Return data to registers
- if (m->isLoad() || m->isAtomic()) {
- gmReturnedLoads.pop();
- assert(inflightLoads > 0);
- --inflightLoads;
-
- if (m->isLoad() || m->isAtomicRet()) {
- std::vector<uint32_t> regVec;
- // iterate over number of destination register operands since
- // this is a load or atomic operation
- for (int k = 0; k < m->n_reg; ++k) {
- assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
- int dst = m->dst_reg + k;
-
- if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
- dst = m->dst_reg_vec[k];
- // virtual->physical VGPR mapping
- int physVgpr = w->remap(dst, sizeof(c0), 1);
- // save the physical VGPR index
- regVec.push_back(physVgpr);
- c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
-
- for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
- if (m->exec_mask[i]) {
- DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
- "$%s%d <- %d global ld done (src = wavefront "
- "ld inst)\n", w->computeUnit->cu_id, w->simdId,
- w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
- dst, *p1);
- // write the value into the physical VGPR. This is a
- // purely functional operation. No timing is modeled.
- w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
- *p1, i);
- }
- ++p1;
- }
- }
-
- // Schedule the write operation of the load data on the VRF.
- // This simply models the timing aspect of the VRF write operation.
- // It does not modify the physical VGPR.
- loadVrfBankConflictCycles +=
- w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
- w, regVec, sizeof(c0),
- m->time);
- }
- } else {
- gmReturnedStores.pop();
- assert(inflightStores > 0);
- --inflightStores;
- }
-
- // Decrement outstanding register count
- computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-
- if (m->isStore() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time,
- -1);
- }
-
- if (m->isLoad() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time,
- -1);
- }
-
- // Mark write bus busy for appropriate amount of time
- computeUnit->glbMemToVrfBus.set(m->time);
- if (!computeUnit->shader->coissue_return)
- w->computeUnit->wfWait.at(m->pipeId).set(m->time);
-}
-
void
GlobalMemPipeline::regStats()
{
void init(ComputeUnit *cu);
void exec();
- template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
-
std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
const std::string &name() const { return _name; }
void regStats();
+ void
+ incLoadVRFBankConflictCycles(int num_cycles)
+ {
+ loadVrfBankConflictCycles += num_cycles; // add the caller-supplied cycle count to this pipeline's running total
+ }
+
private:
ComputeUnit *computeUnit;
std::string _name;
time = 0;
}
+void
+GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
+{
+ _staticInst->completeAcc(gpuDynInst); // forward to the underlying static instruction's completeAcc()
+}
+
/**
* accessor methods for the attributes of
* the underlying GPU static instruction
// Initiate the specified memory operation, by creating a
// memory request and sending it off to the memory system.
void initiateAcc(GPUDynInstPtr gpuDynInst);
+ // Complete the specified memory operation: for a load or an
+ // atomic return, write the value back to the RF; for a store,
+ // do nothing
+ void completeAcc(GPUDynInstPtr gpuDynInst);
void updateStats();
lmReturnedRequests.front() : nullptr;
bool accessVrf = true;
+ Wavefront *w = nullptr;
+
if ((m) && (m->isLoad() || m->isAtomicRet())) {
- Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+ w = m->wavefront();
accessVrf =
- w->computeUnit->vrf[m->simdId]->
+ w->computeUnit->vrf[w->simdId]->
vrfOperandAccessReady(m->seqNum(), w, m,
VrfAccessType::WRITE);
}
if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
|| computeUnit->wfWait.at(m->pipeId).rdy())) {
- if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
- doSmReturn<uint32_t, uint8_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
- doSmReturn<uint32_t, uint16_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
- doSmReturn<uint32_t, uint32_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
- doSmReturn<int32_t, int8_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
- doSmReturn<int32_t, int16_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
- doSmReturn<int32_t, int32_t>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
- doSmReturn<float, Float16>(m);
- else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
- doSmReturn<float, float>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
- doSmReturn<uint64_t, uint8_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
- doSmReturn<uint64_t, uint16_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
- doSmReturn<uint64_t, uint32_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
- doSmReturn<uint64_t, uint64_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
- doSmReturn<int64_t, int8_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
- doSmReturn<int64_t, int16_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
- doSmReturn<int64_t, int32_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
- doSmReturn<int64_t, int64_t>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
- doSmReturn<double, Float16>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
- doSmReturn<double, float>(m);
- else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
- doSmReturn<double, double>(m);
+
+ lmReturnedRequests.pop();
+ w = m->wavefront();
+
+ m->completeAcc(m);
+
+ // Decrement outstanding request count
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
+
+ if (m->isStore() || m->isAtomic()) {
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
+ m->time, -1);
+ }
+
+ if (m->isLoad() || m->isAtomic()) {
+ computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
+ m->time, -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->locMemToVrfBus.set(m->time);
+ if (computeUnit->shader->coissue_return == 0)
+ w->computeUnit->wfWait.at(m->pipeId).set(m->time);
}
// If pipeline has executed a local memory instruction
}
}
-template<typename c0, typename c1>
-void
-LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
-{
- lmReturnedRequests.pop();
- Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
-
- // Return data to registers
- if (m->isLoad() || m->isAtomicRet()) {
- std::vector<uint32_t> regVec;
- for (int k = 0; k < m->n_reg; ++k) {
- int dst = m->dst_reg+k;
-
- if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
- dst = m->dst_reg_vec[k];
- // virtual->physical VGPR mapping
- int physVgpr = w->remap(dst,sizeof(c0),1);
- // save the physical VGPR index
- regVec.push_back(physVgpr);
- c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
-
- for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
- if (m->exec_mask[i]) {
- // write the value into the physical VGPR. This is a purely
- // functional operation. No timing is modeled.
- w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
- *p1, i);
- }
- ++p1;
- }
- }
-
- // Schedule the write operation of the load data on the VRF. This simply
- // models the timing aspect of the VRF write operation. It does not
- // modify the physical VGPR.
- loadVrfBankConflictCycles +=
- w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
- regVec, sizeof(c0), m->time);
- }
-
- // Decrement outstanding request count
- computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-
- if (m->isStore() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm,
- m->time, -1);
- }
-
- if (m->isLoad() || m->isAtomic()) {
- computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm,
- m->time, -1);
- }
-
- // Mark write bus busy for appropriate amount of time
- computeUnit->locMemToVrfBus.set(m->time);
- if (computeUnit->shader->coissue_return == 0)
- w->computeUnit->wfWait.at(m->pipeId).set(m->time);
-}
-
void
LocalMemPipeline::regStats()
{
void init(ComputeUnit *cu);
void exec();
- template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
-
std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
const std::string& name() const { return _name; }
void regStats();
+ void
+ incLoadVRFBankConflictCycles(int num_cycles)
+ {
+ loadVrfBankConflictCycles += num_cycles; // add the caller-supplied cycle count to this pipeline's running total
+ }
+
private:
ComputeUnit *computeUnit;
std::string _name;