vrfs = []
for j in xrange(options.simds_per_cu):
for k in xrange(shader.n_wf):
- wavefronts.append(Wavefront(simdId = j, wf_slot_id = k))
+ wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
+ wfSize = options.wf_size))
vrfs.append(VectorRegisterFile(simd_id=j,
num_regs_per_simd=options.vreg_file_size))
compute_units[-1].wavefronts = wavefronts
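For reference, a minimal sketch of how the new parameter would be plumbed from the command line; the --wf-size option name and the VectorRegisterFile wiring are assumptions for illustration, not part of this patch:

    # sketch only: assumed option plumbing for the wavefront size
    parser.add_option('--wf-size', type='int', default=64,
                      help='wavefront size in work items')
    # VectorRegisterFile also gains a wfSize param (see below), so it
    # would presumably be wired the same way:
    vrfs.append(VectorRegisterFile(simd_id=j, wfSize=options.wf_size,
                                   num_regs_per_simd=options.vreg_file_size))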
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
DestCType dest_val = $expr;
this->dest.set(w, lane, dest_val);
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
DestCType dest_val = $expr;
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType dest_val;
if ($dest_is_src_flag) {
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType dest_val;
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
DestT dest_val;
if ($dest_is_src_flag) {
Wavefront *w = gpuDynInst->wavefront();
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType dest_val;
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
DestCType dest_val;
SrcCType src_val[$num_srcs];
// taken branch
const uint32_t true_pc = getTargetPc();
VectorMask true_mask;
- for (unsigned int lane = 0; lane < VSZ; ++lane) {
+ for (unsigned int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
}
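Every converted loop now evaluates w->computeUnit->wfSize() in its condition. A minimal sketch of hoisting the bound once per loop, assuming the size is invariant while the wavefront executes:

    const int wf_size = w->computeUnit->wfSize(); // loop-invariant bound
    for (int lane = 0; lane < wf_size; ++lane) {
        if (mask[lane]) {
            // ... per-lane work as in the hunks above ...
        }
    }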
const VectorMask &mask = w->get_pred();
// mask off completed work-items
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
w->init_mask[lane] = 0;
}
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
if (num_dest_operands > 1) {
- for (int i = 0; i < VSZ; ++i)
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
if (gpuDynInst->exec_mask[i])
gpuDynInst->statusVector.push_back(num_dest_operands);
else
for (int k = 0; k < num_dest_operands; ++k) {
- c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+ c0 *d = &((c0*)gpuDynInst->d_data)
+ [k * gpuDynInst->computeUnit()->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
if (num_src_operands > 1) {
- for (int i = 0; i < VSZ; ++i)
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i)
if (gpuDynInst->exec_mask[i])
gpuDynInst->statusVector.push_back(num_src_operands);
else
}
for (int k = 0; k < num_src_operands; ++k) {
- c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+ c0 *d = &((c0*)gpuDynInst->d_data)
+ [k * gpuDynInst->computeUnit()->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
c0 *e = &((c0*) gpuDynInst->a_data)[0];
c0 *f = &((c0*) gpuDynInst->x_data)[0];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < gpuDynInst->computeUnit()->wfSize(); ++i) {
if (gpuDynInst->exec_mask[i]) {
Addr vaddr = gpuDynInst->addr[i];
typedef typename DestDataType::CType CType M5_VAR_USED;
const VectorMask &mask = w->get_pred();
- uint64_t addr_vec[VSZ];
+ std::vector<Addr> addr_vec;
+ addr_vec.resize(w->computeUnit->wfSize(), (Addr)0);
this->addr.calcVector(w, addr_vec);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
this->dest.set(w, lane, addr_vec[lane]);
}
}
+ addr_vec.clear();
}
template<typename MemDataType, typename DestDataType,
i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
assert(se);
- return w->wfSlotId * w->privSizePerItem * VSZ +
- se->offset * VSZ +
+ return w->wfSlotId * w->privSizePerItem * w->computeUnit->wfSize() +
+ se->offset * w->computeUnit->wfSize() +
lane * se->size;
*/
Addr addr_div8 = addr / 8;
Addr addr_mod8 = addr % 8;
- Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
+ Addr ret = addr_div8 * 8 * w->computeUnit->wfSize() + lane * 8 +
+ addr_mod8 + w->privBase;
- assert(ret < w->privBase + (w->privSizePerItem * VSZ));
+ assert(ret < w->privBase +
+ (w->privSizePerItem * w->computeUnit->wfSize()));
return ret;
}
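A worked example of the interleaving computed above, assuming wfSize() == 64:

    // item-local address 13 in lane 2:
    //   addr_div8 = 13 / 8 = 1,  addr_mod8 = 13 % 8 = 5
    //   ret = 1 * 8 * 64 + 2 * 8 + 5 + privBase = 533 + privBase
    // i.e., consecutive 8-byte chunks of one work-item's private space
    // are strided wfSize() * 8 bytes apart, interleaved across lanes.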
DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
this->dest.set(w, lane, val);
}
return;
} else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
uint64_t address = this->addr.calcUniform();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
MemCType val = w->readCallArgMem<MemCType>(lane, address);
// this is a complete hack to get around a compiler bug
// (the compiler currently generates global access for private
// addresses (starting from 0). We need to add the private offset)
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (m->addr[lane] < w->privSizePerItem) {
if (mask[lane]) {
// what is the size of the object we are accessing?
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
// note: this calculation will NOT WORK if the compiler
// ever generates loads/stores to the same address with
// different widths (e.g., a ld_u32 addr and a ld_u16 addr)
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
m->addr[lane] += w->roBase;
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->privSizePerItem);
if (this->segment == Brig::BRIG_SEGMENT_ARG) {
uint64_t address = this->addr.calcUniform();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
CType data = this->src.template get<CType>(w, lane);
DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
this->addr.calcVector(w, m->addr);
if (num_src_operands == 1) {
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
((CType*)m->d_data)[lane] =
this->src.template get<CType>(w, lane);
}
} else {
for (int k= 0; k < num_src_operands; ++k) {
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
- ((CType*)m->d_data)[k * VSZ + lane] =
+ ((CType*)m->d_data)[k * w->computeUnit->wfSize() + lane] =
this->src_vect[k].template get<CType>(w, lane);
}
}
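The index k * wfSize() + lane fixes an operand-major layout for d_data: all lanes of operand k are contiguous. A sketch of the addressing rule (helper name hypothetical):

    // element for (operand k, lane l) lives at d_data[k * wf_size + l]
    inline int
    flatIndex(int k, int lane, int wf_size)
    {
        return k * wf_size + lane;
    }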
// this is a complete hack to get around a compiler bug
// (the compiler currently generates global access for private
// addresses (starting from 0). We need to add the private offset)
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
if (m->addr[lane] < w->privSizePerItem) {
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->spillSizePerItem);
m->pipeId = GLBMEM_PIPE;
m->latency.set(w->computeUnit->shader->ticks(1));
{
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
assert(m->addr[lane] < w->privSizePerItem);
m->addr[lane] = m->addr[lane] + lane *
this->addr.calcVector(w, m->addr);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((CType *)m->a_data)[lane] =
this->src[0].template get<CType>(w, lane);
}
// load second source operand for CAS
if (NumSrcOperands > 1) {
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((CType*)m->x_data)[lane] =
this->src[1].template get<CType>(w, lane);
}
int op = 0;
bool got_op = false;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val0 = src1.get<int>(w, lane, 0);
if (got_op) {
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
{
#if TRACING_ON
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
int src_val3 = -1;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 3)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
int src_val3 = -1;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 3)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
std::string res_str;
res_str = csprintf("krl_prt (%s)\n", disassemble());
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (!(lane & 7)) {
res_str += csprintf("DB%03d: ", (int)w->wfDynId);
}
res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
res_str += csprintf(" Exec mask: ");
- for (int i = VSZ - 1; i >= 0; --i) {
+ for (int i = w->computeUnit->wfSize() - 1; i >= 0; --i) {
if (w->execMask(i))
res_str += "1";
else
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
dest.set<int>(w, lane, res);
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
res += src_val1;
}
}
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
if (src_val1) {
- if (lane < (VSZ/2)) {
+ if (lane < (w->computeUnit->wfSize()/2)) {
res = res | ((uint32_t)(1) << lane);
}
}
}
}
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
{
const VectorMask &mask = w->get_pred();
int res = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
if (src_val1) {
- if (lane >= (VSZ/2)) {
- res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+ if (lane >= (w->computeUnit->wfSize()/2)) {
+ res = res | ((uint32_t)(1) <<
+ (lane - (w->computeUnit->wfSize()/2)));
}
}
}
}
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
dest.set<int>(w, lane, res);
}
const VectorMask &mask = w->get_pred();
int max_cnt = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
w->bar_cnt[lane]++;
const VectorMask &mask = w->get_pred();
int max_cnt = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
w->bar_cnt[lane]--;
}
{
const VectorMask &mask = w->get_pred();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
int src_val1 = src1.get<int>(w, lane, 1);
panic("OpenCL Code failed assertion #%d. Triggered by lane %s",
Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
{
// the address is in src1 | src2
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
int src_val1 = src1.get<int>(w, lane, 1);
int src_val2 = src1.get<int>(w, lane, 2);
Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
calcAddr(w, m);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
}
GPUDynInstPtr m = gpuDynInst;
calcAddr(w, m);
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
}
const VectorMask &mask = w->get_pred();
int src_val1 = 0;
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (mask[lane]) {
src_val1 = src1.get<int>(w, lane, 1);
break;
const VectorMask &mask = w->get_pred();
unsigned mst = true;
- for (int lane = VSZ - 1; lane >= 0; --lane) {
+ for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
if (mask[lane]) {
dest.set<int>(w, lane, mst);
mst = false;
int res = 0;
bool got_res = false;
- for (int lane = VSZ - 1; lane >= 0; --lane) {
+ for (int lane = w->computeUnit->wfSize() - 1; lane >= 0; --lane) {
if (mask[lane]) {
if (!got_res) {
res = src1.get<int>(w, lane, 1);
* Defines classes encapsulating HSAIL instruction operands.
*/
+#include <limits>
#include <string>
#include "arch/hsail/Brig.h"
template<typename T>
class ImmOperand : public BaseOperand
{
+ private:
+ uint16_t kind;
public:
T bits;
template<typename OperandType>
OperandType
- get()
+ get(Wavefront *w)
{
assert(sizeof(OperandType) <= sizeof(T));
+ panic_if(w == nullptr, "WF pointer needs to be set");
+
+ switch (kind) {
+ // immediate operand is WF size
+ case Brig::BRIG_KIND_OPERAND_WAVESIZE:
+ return (OperandType)w->computeUnit->wfSize();
+ break;
- return *(OperandType*)&bits;
+ default:
+ return *(OperandType*)&bits;
+ break;
+ }
}
// This version of get() takes a WF* and a lane id for
OperandType
get(Wavefront *w, int lane)
{
- return get<OperandType>();
+ return get<OperandType>(w);
}
};
auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
bits = *((T*)(obj->getData(cbptr->bytes + 4)));
-
+ kind = brigOp->kind;
return true;
}
break;
case Brig::BRIG_KIND_OPERAND_WAVESIZE:
- bits = VSZ;
+ kind = brigOp->kind;
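+        // The real wavefront size is a per-CU runtime value, so store
+        // the host maximum here as a placeholder; get(Wavefront*)
+        // resolves the actual value when the operand kind is
+        // BRIG_KIND_OPERAND_WAVESIZE.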
+ bits = std::numeric_limits<unsigned long long>::digits;
return true;
default:
+ kind = Brig::BRIG_KIND_NONE;
return false;
}
}
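A minimal usage sketch of the new accessor; the operand and wavefront names are hypothetical:

    ImmOperand<uint64_t> op;           // assume init() already succeeded
    uint32_t n = op.get<uint32_t>(w);  // equals w->computeUnit->wfSize()
                                       // for a WAVESIZE operand, else the
                                       // stored immediate bits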
const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+ kind = Brig::BRIG_KIND_NONE;
return false;
}
(const Brig::BrigOperand *)obj->getOperand(*data_offset);
if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+ kind = Brig::BRIG_KIND_NONE;
return false;
}
OperandType
get(Wavefront *w, int lane)
{
- return is_imm ? imm_op.template get<OperandType>() :
+ return is_imm ? imm_op.template get<OperandType>(w) :
reg_op.template get<OperandType>(w, lane);
}
uint64_t calcUniformBase();
public:
- virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
+ virtual void calcVector(Wavefront *w, std::vector<Addr> &addrVec) = 0;
virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
uint64_t offset;
RegOperandType reg;
void init(unsigned opOffset, const BrigObject *obj);
uint64_t calcUniform();
- void calcVector(Wavefront *w, uint64_t *addrVec);
+ void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
uint64_t calcLane(Wavefront *w, int lane=0);
uint32_t opSize() { return reg.opSize(); }
bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
template<typename RegOperandType>
void
-RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
+RegAddrOperand<RegOperandType>::calcVector(Wavefront *w,
+ std::vector<Addr> &addrVec)
{
Addr address = calcUniformBase();
- for (int lane = 0; lane < VSZ; ++lane) {
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane) {
if (w->execMask(lane)) {
if (reg.regFileChar == 's') {
addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
public:
void init(unsigned opOffset, const BrigObject *obj);
uint64_t calcUniform();
- void calcVector(Wavefront *w, uint64_t *addrVec);
+ void calcVector(Wavefront *w, std::vector<Addr> &addrVec);
uint64_t calcLane(Wavefront *w, int lane=0);
std::string disassemble();
};
}
inline void
-NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
+NoRegAddrOperand::calcVector(Wavefront *w, std::vector<Addr> &addrVec)
{
uint64_t address = calcUniformBase();
- for (int lane = 0; lane < VSZ; ++lane)
+ for (int lane = 0; lane < w->computeUnit->wfSize(); ++lane)
addrVec[lane] = address;
}
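Callers own the address storage now; as the ld_addr hunk earlier shows, the contract is to size the vector to wfSize() before calling. A sketch with a hypothetical operand name:

    std::vector<Addr> addr_vec(w->computeUnit->wfSize(), 0);
    addrOp.calcVector(w, addr_vec); // fills exactly wfSize() entries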
simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
class Wavefront(SimObject):
simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
class ComputeUnit(MemObject):
type = 'ComputeUnit'
case HSA_GET_VSZ:
{
BufferArg buf(buf_addr, sizeof(uint32_t));
- *((uint32_t*)buf.bufferPtr()) = VSZ;
+ *((uint32_t*)buf.bufferPtr()) = dispatcher->wfSize();
buf.copyOut(tc->getMemProxy());
}
break;
*
* Author: John Kalamatianos, Anthony Gutierrez
*/
-
#include "gpu-compute/compute_unit.hh"
+#include <limits>
+
#include "base/output.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUExec.hh"
_masterId(p->system->getMasterId(name() + ".ComputeUnit")),
lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
{
- // this check will be eliminated once we have wavefront size support added
- fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+    /**
+     * This check is necessary because std::bitset only provides conversion
+     * to unsigned long or unsigned long long via to_ulong() or to_ullong().
+     * There are a few places in the code where to_ullong() is used, and if
+     * the wavefront size is larger than the host can support, bitset will
+     * throw a runtime exception. We should remove all use of to_ulong() and
+     * to_ullong() so we can have wavefront sizes greater than 64 lanes, but
+     * until that is done this check is required.
+     */
+    fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
+             p->wfSize <= 0,
+             "WF size is larger than the host can support or is not positive");
+ fatal_if(!isPowerOf2(wavefrontSize),
+ "Wavefront size should be a power of 2");
// calculate how many cycles a vector load or store will need to transfer
// its data over the corresponding buses
- numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
- / (double)vrfToCoalescerBusWidth);
+ numCyclesPerStoreTransfer =
+ (uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
+ (double)vrfToCoalescerBusWidth);
- numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+ numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
/ coalescerToVrfBusWidth;
lastVaddrWF.resize(numSIMDs);
lastVaddrWF[j].resize(p->n_wf);
for (int i = 0; i < p->n_wf; ++i) {
- lastVaddrWF[j][i].resize(VSZ);
+ lastVaddrWF[j][i].resize(wfSize());
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
wfList[j][i]->setParent(this);
- for (int k = 0; k < VSZ; ++k) {
+ for (int k = 0; k < wfSize(); ++k) {
lastVaddrWF[j][i][k] = 0;
}
}
}
- lastVaddrPhase.resize(numSIMDs);
+ lastVaddrSimd.resize(numSIMDs);
for (int i = 0; i < numSIMDs; ++i) {
- lastVaddrPhase[i] = LastVaddrWave();
+ lastVaddrSimd[i].resize(wfSize(), 0);
}
- lastVaddrCU = LastVaddrWave();
+ lastVaddrCU.resize(wfSize());
lds.setParent(this);
fatal("Invalid WF execution policy (CU)\n");
}
- memPort.resize(VSZ);
+ memPort.resize(wfSize());
// resize the tlbPort vectorArray
- int tlbPort_width = perLaneTLB ? VSZ : 1;
+ int tlbPort_width = perLaneTLB ? wfSize() : 1;
tlbPort.resize(tlbPort_width);
cuExitCallback = new CUExitCallback(this);
ComputeUnit::~ComputeUnit()
{
// Delete wavefront slots
-
- for (int j = 0; j < numSIMDs; ++j)
+ for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
delete wfList[j][i];
}
-
+ lastVaddrSimd[j].clear();
+ }
+ lastVaddrCU.clear();
readyList.clear();
waveStatusList.clear();
dispatchList.clear();
VectorMask init_mask;
init_mask.reset();
- for (int k = 0; k < VSZ; ++k) {
- if (k + cnt * VSZ < trueWgSizeTotal)
+ for (int k = 0; k < wfSize(); ++k) {
+ if (k + cnt * wfSize() < trueWgSizeTotal)
init_mask[k] = 1;
}
wfCtx->init_mask = init_mask.to_ullong();
wfCtx->exec_mask = init_mask.to_ullong();
- for (int i = 0; i < VSZ; ++i) {
- wfCtx->bar_cnt[i] = 0;
- }
+ wfCtx->bar_cnt.resize(wfSize(), 0);
wfCtx->max_bar_cnt = 0;
wfCtx->old_barrier_cnt = 0;
wfCtx->barrier_cnt = 0;
wfCtx->privBase = ndr->q.privMemStart;
- ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+ ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
wfCtx->spillBase = ndr->q.spillMemStart;
- ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+ ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
wfCtx->pc = 0;
wfCtx->rpc = UINT32_MAX;
w->dynwaveid = cnt;
w->init_mask = wfCtx->init_mask;
- for (int k = 0; k < VSZ; ++k) {
- w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
- w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
- w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+ for (int k = 0; k < wfSize(); ++k) {
+ w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
+ w->workitemid[1][k] =
+ ((k + cnt * wfSize()) / trueWgSize[0]) % trueWgSize[1];
+ w->workitemid[2][k] =
+ (k + cnt * wfSize()) / (trueWgSize[0] * trueWgSize[1]);
w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
w->barrier_cnt = wfCtx->barrier_cnt;
- w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+ w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < wfSize(); ++i) {
w->bar_cnt[i] = wfCtx->bar_cnt[i];
}
// is this the last wavefront in the workgroup
// if set the spillWidth to be the remaining work-items
// so that the vector access is correct
- if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
- w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+ if ((cnt + 1) * wfSize() >= trueWgSizeTotal) {
+ w->spillWidth = trueWgSizeTotal - (cnt * wfSize());
} else {
- w->spillWidth = VSZ;
+ w->spillWidth = wfSize();
}
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
w->start(++_n_wave, ndr->q.code_ptr);
+ wfCtx->bar_cnt.clear();
}
void
// Send L1 cache acquire
// isKernel + isAcquire = Kernel Begin
if (shader->impl_kern_boundary_sync) {
- GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(this,
nullptr,
nullptr, 0);
if (w->status == Wavefront::S_STOPPED) {
// if we have scheduled all work items then stop
// scheduling wavefronts
- if (cnt * VSZ >= trueWgSizeTotal)
+ if (cnt * wfSize() >= trueWgSizeTotal)
break;
// reserve vector registers for the scheduled wavefront
// work item of the work group
int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
bool vregAvail = true;
- int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+ int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
int freeWfSlots = 0;
// check if the total number of VGPRs required by all WFs of the WG
// fit in the VRFs of all SIMD units
// Setup space for call args
for (int j = 0; j < numSIMDs; ++j) {
for (int i = 0; i < shader->n_wf; ++i) {
- wfList[j][i]->initCallArgMem(shader->funcargs_size);
+ wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
}
}
Addr last = 0;
switch(computeUnit->prefetchType) {
- case Enums::PF_CU:
+ case Enums::PF_CU:
last = computeUnit->lastVaddrCU[mp_index];
break;
- case Enums::PF_PHASE:
- last = computeUnit->lastVaddrPhase[simdId][mp_index];
+ case Enums::PF_PHASE:
+ last = computeUnit->lastVaddrSimd[simdId][mp_index];
break;
- case Enums::PF_WF:
+ case Enums::PF_WF:
last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
- default:
+ default:
break;
}
DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
computeUnit->lastVaddrCU[mp_index] = vaddr;
- computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+ computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
;
ldsBankConflictDist
- .init(0, VSZ, 2)
+ .init(0, wfSize(), 2)
.name(name() + ".lds_bank_conflicts")
.desc("Number of bank conflicts per LDS memory packet")
;
;
pageDivergenceDist
- // A wavefront can touch 1 to VSZ pages per memory instruction.
- // The number of pages per bin can be configured (here it's 4).
- .init(1, VSZ, 4)
+        // A wavefront can touch up to N pages per memory instruction,
+        // where N equals the wavefront size. The number of pages per
+        // bin can be configured (here it's 4).
+ .init(1, wfSize(), 4)
.name(name() + ".page_divergence_dist")
.desc("pages touched per wf (over all mem. instr.)")
;
controlFlowDivergenceDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".warp_execution_dist")
.desc("number of lanes active per instruction (oval all instructions)")
;
activeLanesPerGMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".gmem_lanes_execution_dist")
.desc("number of active lanes per global memory instruction")
;
activeLanesPerLMemInstrDist
- .init(1, VSZ, 4)
+ .init(1, wfSize(), 4)
.name(name() + ".lmem_lanes_execution_dist")
.desc("number of active lanes per local memory instruction")
;
numVecOpsExecuted
.name(name() + ".num_vec_ops_executed")
- .desc("number of vec ops executed (e.g. VSZ/inst)")
+ .desc("number of vec ops executed (e.g. WF size/inst)")
;
totalCycles
// if fixed-stride prefetching, this is the stride.
int prefetchStride;
- class LastVaddrWave
- {
- public:
- Addr vaddrs[VSZ];
- Addr& operator[](int idx) {
- return vaddrs[idx];
- }
-
- LastVaddrWave() {
- for (int i = 0; i < VSZ; ++i)
- vaddrs[i] = 0;
- }
- };
-
- LastVaddrWave lastVaddrCU;
- std::vector<LastVaddrWave> lastVaddrPhase;
+ std::vector<Addr> lastVaddrCU;
+ std::vector<std::vector<Addr>> lastVaddrSimd;
std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
Enums::PrefetchType prefetchType;
EXEC_POLICY exec_policy;
return shader->cuList.size();
}
+int
+GpuDispatcher::wfSize() const
+{
+ return shader->cuList[0]->wfSize();
+}
+
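Reading the size from cuList[0] assumes all CUs are configured with the same wavefront size; a defensive variant of the same accessor (a sketch, not part of the patch):

    int
    GpuDispatcher::wfSize() const
    {
        int sz = shader->cuList[0]->wfSize();
        for (auto *cu : shader->cuList)
            assert(cu->wfSize() == sz); // make homogeneity explicit
        return sz;
    }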
void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
// helper functions to retrieve/set GPU attributes
int getNumCUs();
+ int wfSize() const;
void setFuncargsSize(int funcargs_size);
};
int physVgpr = w->remap(dst, sizeof(c0), 1);
// save the physical VGPR index
regVec.push_back(physVgpr);
- c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+ c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (m->exec_mask[i]) {
DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
"$%s%d <- %d global ld done (src = wavefront "
GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
GPUStaticInst *_staticInst, uint64_t instSeqNum)
- : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+ : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0),
+ m_op(Enums::MO_UNDEF),
memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false),
statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
{
- tlbHitLevel.assign(VSZ, -1);
+ tlbHitLevel.assign(computeUnit()->wfSize(), -1);
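+    // Per-lane payload buffers sized to match the fixed arrays they
+    // replace: 16 bytes per lane for load/store data (d_data[VSZ * 16])
+    // and 8 bytes per lane for the two atomic operand buffers
+    // ({a,x}_data[VSZ * 8]).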
+ d_data = new uint8_t[computeUnit()->wfSize() * 16];
+ a_data = new uint8_t[computeUnit()->wfSize() * 8];
+ x_data = new uint8_t[computeUnit()->wfSize() * 8];
+ for (int i = 0; i < (computeUnit()->wfSize() * 8); ++i) {
+ a_data[i] = 0;
+ x_data[i] = 0;
+ }
+ for (int i = 0; i < (computeUnit()->wfSize() * 16); ++i) {
+ d_data[i] = 0;
+ }
+}
+
+GPUDynInst::~GPUDynInst()
+{
+ delete[] d_data;
+ delete[] a_data;
+ delete[] x_data;
}
void
public:
GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
uint64_t instSeqNum);
-
+ ~GPUDynInst();
void execute();
int numSrcRegOperands();
int numDstRegOperands();
Enums::StorageClassType executedAs();
// The address of the memory operation
- Addr addr[VSZ];
+ std::vector<Addr> addr;
Addr pAddr;
// The data to get written
- uint8_t d_data[VSZ * 16];
+ uint8_t *d_data;
// Additional data (for atomics)
- uint8_t a_data[VSZ * 8];
+ uint8_t *a_data;
// Additional data (for atomics)
- uint8_t x_data[VSZ * 8];
+ uint8_t *x_data;
// The execution mask
VectorMask exec_mask;
int physVgpr = w->remap(dst,sizeof(c0),1);
// save the physical VGPR index
regVec.push_back(physVgpr);
- c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+ c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
- for (int i = 0; i < VSZ; ++i) {
+ for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
if (m->exec_mask[i]) {
// write the value into the physical VGPR. This is a purely
// functional operation. No timing is modeled.
#define __MISC_HH__
#include <bitset>
+#include <limits>
#include <memory>
#include "base/misc.hh"
class GPUDynInst;
-// wavefront size of the machine
-static const int VSZ = 64;
-
-/*
- This check is necessary because std::bitset only provides conversion to
- unsigned long or unsigned long long via to_ulong() or to_ullong(). there are
- a few places in the code where to_ullong() is used, however if VSZ is larger
- than a value the host can support then bitset will throw a runtime exception.
-
- we should remove all use of to_long() or to_ullong() so we can have VSZ
- greater than 64b, however until that is done this assert is required.
- */
-static_assert(VSZ <= sizeof(unsigned long long) * 8,
- "VSZ is larger than the host can support");
-
-typedef std::bitset<VSZ> VectorMask;
+typedef std::bitset<std::numeric_limits<unsigned long long>::digits> VectorMask;
typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
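The mask width is now pinned to the host's unsigned long long width (typically 64) rather than tracking the wavefront size; smaller wavefronts leave the upper bits permanently clear, and the fatal_if in the ComputeUnit constructor rejects larger sizes. A sketch of the invariant, with wf_size standing in for the runtime parameter:

    VectorMask mask;                     // std::bitset<64> on typical hosts
    assert(wf_size <= (int)mask.size()); // enforced at CU construction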
class WaitClass
{
// 32 bit values
// barrier state
- int bar_cnt[VSZ];
+ std::vector<int> bar_cnt;
// id (which WF in the WG)
int cnt;
nxtBusy.clear();
nxtBusy.resize(numRegsPerSimd, 0);
- vgprState->init(numRegsPerSimd);
+ vgprState->init(numRegsPerSimd, p->wfSize);
}
void
#include "gpu-compute/vector_register_state.hh"
+#include <limits>
+
#include "gpu-compute/compute_unit.hh"
VecRegisterState::VecRegisterState() : computeUnit(nullptr)
}
void
-VecRegisterState::init(uint32_t _size)
+VecRegisterState::init(uint32_t _size, uint32_t wf_size)
{
s_reg.resize(_size);
+ fatal_if(wf_size > std::numeric_limits<unsigned long long>::digits ||
+ wf_size <= 0,
+ "WF size is larger than the host can support or is zero");
+ fatal_if((wf_size & (wf_size - 1)) != 0,
+ "Wavefront size should be a power of 2");
+ for (int i = 0; i < s_reg.size(); ++i) {
+ s_reg[i].resize(wf_size, 0);
+ }
d_reg.resize(_size);
+ for (int i = 0; i < d_reg.size(); ++i) {
+ d_reg[i].resize(wf_size, 0);
+ }
}
{
public:
VecRegisterState();
- void init(uint32_t _size);
+ void init(uint32_t _size, uint32_t wf_size);
const std::string& name() const { return _name; }
void setParent(ComputeUnit *_computeUnit);
ComputeUnit *computeUnit;
std::string _name;
// 32-bit Single Precision Vector Register State
- std::vector<std::array<uint32_t, VSZ>> s_reg;
+ std::vector<std::vector<uint32_t>> s_reg;
// 64-bit Double Precision Vector Register State
- std::vector<std::array<uint64_t, VSZ>> d_reg;
+ std::vector<std::vector<uint64_t>> d_reg;
};
#endif // __VECTOR_REGISTER_STATE_HH__
last_trace = 0;
simdId = p->simdId;
wfSlotId = p->wf_slot_id;
-
status = S_STOPPED;
reservedVectorRegs = 0;
startVgprIndex = 0;
mem_trace_busy = 0;
old_vgpr_tcnt = 0xffffffffffffffffll;
old_dgpr_tcnt = 0xffffffffffffffffll;
+ old_vgpr.resize(p->wfSize);
pendingFetch = false;
dropFetch = false;
condRegState = new ConditionRegisterState();
maxSpVgprs = 0;
maxDpVgprs = 0;
+ last_addr.resize(p->wfSize);
+ workitemFlatId.resize(p->wfSize);
+ old_dgpr.resize(p->wfSize);
+ bar_cnt.resize(p->wfSize);
+ for (int i = 0; i < 3; ++i) {
+ workitemid[i].resize(p->wfSize);
+ }
}
void
{
if (callArgMem)
delete callArgMem;
+ delete condRegState;
}
void
public:
// pointer to buffer for storing function arguments
uint8_t *mem;
+ int wfSize;
// size of function args
int funcArgsSizePerItem;
int
getLaneOffset(int lane, int addr)
{
- return addr * VSZ + sizeof(CType) * lane;
+ return addr * wfSize + sizeof(CType) * lane;
}
- CallArgMem(int func_args_size_per_item)
- : funcArgsSizePerItem(func_args_size_per_item)
+ CallArgMem(int func_args_size_per_item, int wf_size)
+ : wfSize(wf_size), funcArgsSizePerItem(func_args_size_per_item)
{
- mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+ mem = (uint8_t*)malloc(funcArgsSizePerItem * wfSize);
}
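getLaneOffset implies an argument-major layout: each byte of per-item argument storage is strided wfSize bytes apart. A worked example, assuming wfSize == 64 and CType == uint32_t:

    // getLaneOffset(/*lane=*/3, /*addr=*/8)
    //   == 8 * 64 + sizeof(uint32_t) * 3
    //   == 512 + 12 == 524 bytes into mem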
~CallArgMem()
bool isOldestInstALU();
bool isOldestInstBarrier();
// used for passing spill address to DDInstGPU
- uint64_t last_addr[VSZ];
- uint32_t workitemid[3][VSZ];
- uint32_t workitemFlatId[VSZ];
+ std::vector<Addr> last_addr;
+ std::vector<uint32_t> workitemid[3];
+ std::vector<uint32_t> workitemFlatId;
uint32_t workgroupid[3];
uint32_t workgroupsz[3];
uint32_t gridsz[3];
uint32_t startVgprIndex;
// Old value of destination gpr (for trace)
- uint32_t old_vgpr[VSZ];
+ std::vector<uint32_t> old_vgpr;
// Id of destination gpr (for trace)
uint32_t old_vgpr_id;
// Tick count of last old_vgpr copy
uint64_t old_vgpr_tcnt;
// Old value of destination gpr (for trace)
- uint64_t old_dgpr[VSZ];
+ std::vector<uint64_t> old_dgpr;
// Id of destination gpr (for trace)
uint32_t old_dgpr_id;
// Tick count of last old_vgpr copy
VectorMask init_mask;
// number of barriers this WF has joined
- int bar_cnt[VSZ];
+ std::vector<int> bar_cnt;
int max_bar_cnt;
// Flag to stall a wave on barrier
bool stalledAtBarrier;
// argument memory for hsail call instruction
CallArgMem *callArgMem;
void
- initCallArgMem(int func_args_size_per_item)
+ initCallArgMem(int func_args_size_per_item, int wf_size)
{
- callArgMem = new CallArgMem(func_args_size_per_item);
+ callArgMem = new CallArgMem(func_args_size_per_item, wf_size);
}
template<typename CType>
}
void start(uint64_t _wfDynId, uint64_t _base_ptr);
-
void exec();
void updateResources();
int ready(itype_e type);