From 7ac38849abaf6aeccf39137bc8acb9e44d192e82 Mon Sep 17 00:00:00 2001 From: Tony Gutierrez Date: Wed, 26 Oct 2016 22:47:11 -0400 Subject: [PATCH] gpu-compute: remove inst enums and use bit flag for attributes this patch removes the GPUStaticInst enums that were defined in GPU.py. instead, a simple set of attribute flags that can be set in the base instruction class are used. this will help unify the attributes of HSAIL and machine ISA instructions within the model itself. because the static instruction now carries the attributes, a GPUDynInst must carry a pointer to a valid GPUStaticInst so a new static kernel launch instruction is added, which carries the attributes needed to perform the kernel launch. --- src/arch/hsail/SConscript | 1 - src/arch/hsail/generic_types.cc | 47 -- src/arch/hsail/generic_types.hh | 16 - src/arch/hsail/insts/branch.hh | 14 +- src/arch/hsail/insts/decl.hh | 125 ++++- src/arch/hsail/insts/main.cc | 5 +- src/arch/hsail/insts/mem.cc | 63 --- src/arch/hsail/insts/mem.hh | 542 +++++++++++----------- src/arch/hsail/insts/mem_impl.hh | 25 - src/arch/hsail/insts/pseudo_inst.cc | 31 +- src/gpu-compute/GPU.py | 108 ----- src/gpu-compute/GPUStaticInstFlags.py | 111 +++++ src/gpu-compute/SConscript | 1 + src/gpu-compute/code_enums.hh | 116 ----- src/gpu-compute/compute_unit.cc | 26 +- src/gpu-compute/compute_unit.hh | 1 + src/gpu-compute/global_memory_pipeline.cc | 23 +- src/gpu-compute/gpu_dyn_inst.cc | 382 +++++++++++++-- src/gpu-compute/gpu_dyn_inst.hh | 219 +++++---- src/gpu-compute/gpu_static_inst.cc | 6 +- src/gpu-compute/gpu_static_inst.hh | 167 ++++++- src/gpu-compute/kernel_cfg.cc | 10 +- src/gpu-compute/lds_state.cc | 7 +- src/gpu-compute/lds_state.hh | 1 - src/gpu-compute/local_memory_pipeline.cc | 9 +- src/gpu-compute/shader.hh | 1 - src/gpu-compute/vector_register_file.cc | 5 +- src/gpu-compute/wavefront.cc | 207 +++------ 28 files changed, 1205 insertions(+), 1064 deletions(-) delete mode 100644 src/arch/hsail/generic_types.cc delete 
mode 100644 src/arch/hsail/generic_types.hh create mode 100644 src/gpu-compute/GPUStaticInstFlags.py delete mode 100644 src/gpu-compute/code_enums.hh diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript index 3455823a6..251c103fd 100644 --- a/src/arch/hsail/SConscript +++ b/src/arch/hsail/SConscript @@ -43,7 +43,6 @@ if env['TARGET_GPU_ISA'] == 'hsail': env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'], 'gen.py', '$SOURCE $TARGETS') - Source('generic_types.cc') Source('gpu_decoder.cc') Source('insts/branch.cc') Source('insts/gen_exec.cc') diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc deleted file mode 100644 index 0cd55d1d5..000000000 --- a/src/arch/hsail/generic_types.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "arch/hsail/generic_types.hh" -#include "base/misc.hh" - -using namespace Brig; - -namespace HsailISA -{ - Enums::GenericMemoryOrder - getGenericMemoryOrder(BrigMemoryOrder brig_memory_order) - { - switch(brig_memory_order) { - case BRIG_MEMORY_ORDER_NONE: - return Enums::MEMORY_ORDER_NONE; - case BRIG_MEMORY_ORDER_RELAXED: - return Enums::MEMORY_ORDER_RELAXED; - case BRIG_MEMORY_ORDER_SC_ACQUIRE: - return Enums::MEMORY_ORDER_SC_ACQUIRE; - case BRIG_MEMORY_ORDER_SC_RELEASE: - return Enums::MEMORY_ORDER_SC_RELEASE; - case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: - return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE; - default: - fatal("HsailISA::MemInst::getGenericMemoryOrder -> ", - "bad BrigMemoryOrder\n"); - } - } - - Enums::GenericMemoryScope - getGenericMemoryScope(BrigMemoryScope brig_memory_scope) - { - switch(brig_memory_scope) { - case BRIG_MEMORY_SCOPE_NONE: - return Enums::MEMORY_SCOPE_NONE; - case BRIG_MEMORY_SCOPE_WORKITEM: - return Enums::MEMORY_SCOPE_WORKITEM; - case BRIG_MEMORY_SCOPE_WORKGROUP: - return Enums::MEMORY_SCOPE_WORKGROUP; - case BRIG_MEMORY_SCOPE_AGENT: - return Enums::MEMORY_SCOPE_DEVICE; - case BRIG_MEMORY_SCOPE_SYSTEM: - return Enums::MEMORY_SCOPE_SYSTEM; - default: - 
fatal("HsailISA::MemInst::getGenericMemoryScope -> ", - "bad BrigMemoryScope\n"); - } - } -} // namespace HsailISA diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh deleted file mode 100644 index 50e430bef..000000000 --- a/src/arch/hsail/generic_types.hh +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__ -#define __ARCH_HSAIL_GENERIC_TYPES_HH__ - -#include "arch/hsail/Brig.h" -#include "enums/GenericMemoryOrder.hh" -#include "enums/GenericMemoryScope.hh" - -namespace HsailISA -{ - Enums::GenericMemoryOrder - getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order); - Enums::GenericMemoryScope - getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope); -} // namespace HsailISA - -#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__ diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh index 45cd876ad..89bcc1277 100644 --- a/src/arch/hsail/insts/branch.hh +++ b/src/arch/hsail/insts/branch.hh @@ -59,16 +59,15 @@ namespace HsailISA BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "brn") { - o_type = Enums::OT_BRANCH; + setFlag(Branch); + setFlag(UnconditionalJump); width = ((Brig::BrigInstBr*)ib)->width; unsigned op_offs = obj->getOperandPtr(ib->operands, 0); target.init(op_offs, obj); - o_type = Enums::OT_BRANCH; } uint32_t getTargetPc() override { return target.getTarget(0, 0); } - bool unconditionalJumpInstruction() override { return true; } bool isVectorRegister(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); return target.isVectorRegister(); @@ -175,13 +174,12 @@ namespace HsailISA CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "cbr") { - o_type = Enums::OT_BRANCH; + setFlag(Branch); width = ((Brig::BrigInstBr *)ib)->width; unsigned op_offs = obj->getOperandPtr(ib->operands, 0); cond.init(op_offs, obj); op_offs = obj->getOperandPtr(ib->operands, 1); 
target.init(op_offs, obj); - o_type = Enums::OT_BRANCH; } uint32_t getTargetPc() override { return target.getTarget(0, 0); } @@ -343,17 +341,15 @@ namespace HsailISA BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "br") { - o_type = Enums::OT_BRANCH; + setFlag(Branch); + setFlag(UnconditionalJump); width.init(((Brig::BrigInstBr *)ib)->width, obj); unsigned op_offs = obj->getOperandPtr(ib->operands, 0); target.init(op_offs, obj); - o_type = Enums::OT_BRANCH; } uint32_t getTargetPc() override { return target.getTarget(0, 0); } - bool unconditionalJumpInstruction() override { return true; } - void execute(GPUDynInstPtr gpuDynInst) override; bool isVectorRegister(int operandIndex) override { assert(operandIndex >= 0 && operandIndex < getNumOperands()); diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh index 48e022ff7..94f23ac1f 100644 --- a/src/arch/hsail/insts/decl.hh +++ b/src/arch/hsail/insts/decl.hh @@ -38,11 +38,9 @@ #include -#include "arch/hsail/generic_types.hh" #include "arch/hsail/insts/gpu_static_inst.hh" #include "arch/hsail/operand.hh" #include "debug/HSAIL.hh" -#include "enums/OpType.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -127,6 +125,8 @@ namespace HsailISA const char *opcode) : HsailGPUStaticInst(obj, opcode) { + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -240,6 +240,8 @@ namespace HsailISA const char *opcode) : HsailGPUStaticInst(obj, opcode) { + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -414,6 +416,8 @@ namespace HsailISA const BrigObject *obj, const char *opcode) : HsailGPUStaticInst(obj, opcode) { + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -818,6 +822,8 @@ namespace HsailISA const BrigObject *obj, const char *_opcode) : HsailGPUStaticInst(obj, _opcode) { + setFlag(ALU); + 
unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); @@ -874,7 +880,7 @@ namespace HsailISA Ret(const Brig::BrigInstBase *ib, const BrigObject *obj) : Base(ib, obj, "ret") { - o_type = Enums::OT_RET; + setFlag(GPUStaticInst::Return); } void execute(GPUDynInstPtr gpuDynInst); @@ -889,7 +895,7 @@ namespace HsailISA Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj) : Base(ib, obj, "barrier") { - o_type = Enums::OT_BARRIER; + setFlag(GPUStaticInst::MemBarrier); assert(ib->base.kind == Brig::BRIG_KIND_INST_BR); width = (uint8_t)((Brig::BrigInstBr*)ib)->width; } @@ -924,14 +930,105 @@ namespace HsailISA memFenceMemOrder = (Brig::BrigMemoryOrder) ((Brig::BrigInstMemFence*)ib)->memoryOrder; - // set o_type based on scopes + setFlag(MemoryRef); + setFlag(GPUStaticInst::MemFence); + + switch (memFenceMemOrder) { + case Brig::BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case Brig::BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case Brig::BRIG_MEMORY_ORDER_SC_ACQUIRE: + setFlag(Acquire); + break; + case Brig::BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case Brig::BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); + break; + default: + fatal("MemInst has bad BrigMemoryOrder\n"); + } + + // set inst flags based on scopes if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE && memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { - o_type = Enums::OT_BOTH_MEMFENCE; + setFlag(GPUStaticInst::GlobalSegment); + + /** + * A memory fence that has scope for + * both segments will use the global + * segment, and be executed in the + * global memory pipeline, therefore, + * we set the segment to match the + * global scope only + */ + switch (memFenceScopeSegGlobal) { + case Brig::BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + 
case Brig::BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case Brig::BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("MemFence has bad global scope type\n"); + } } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) { - o_type = Enums::OT_GLOBAL_MEMFENCE; + setFlag(GPUStaticInst::GlobalSegment); + + switch (memFenceScopeSegGlobal) { + case Brig::BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case Brig::BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case Brig::BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("MemFence has bad global scope type\n"); + } } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) { - o_type = Enums::OT_SHARED_MEMFENCE; + setFlag(GPUStaticInst::GroupSegment); + + switch (memFenceScopeSegGroup) { + case Brig::BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case Brig::BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case Brig::BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case Brig::BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("MemFence has bad group scope type\n"); + } } else { fatal("MemFence constructor: bad scope specifiers\n"); } @@ -955,18 +1052,13 @@ namespace HsailISA // etc.). We send a packet, tagged with the memory order and // scope, and let the GPU coalescer handle it. 
- if (o_type == Enums::OT_GLOBAL_MEMFENCE || - o_type == Enums::OT_BOTH_MEMFENCE) { + if (isGlobalSeg()) { gpuDynInst->simdId = w->simdId; gpuDynInst->wfSlotId = w->wfSlotId; gpuDynInst->wfDynId = w->wfDynId; gpuDynInst->kern_id = w->kernId; gpuDynInst->cu_id = w->computeUnit->cu_id; - gpuDynInst->memoryOrder = - getGenericMemoryOrder(memFenceMemOrder); - gpuDynInst->scope = - getGenericMemoryScope(memFenceScopeSegGlobal); gpuDynInst->useContinuation = false; GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe); gmp->getGMReqFIFO().push(gpuDynInst); @@ -975,10 +1067,10 @@ namespace HsailISA w->rdGmReqsInPipe--; w->memReqsInPipe--; w->outstandingReqs++; - } else if (o_type == Enums::OT_SHARED_MEMFENCE) { + } else if (isGroupSeg()) { // no-op } else { - fatal("MemFence execute: bad o_type\n"); + fatal("MemFence execute: bad op type\n"); } } }; @@ -1054,6 +1146,7 @@ namespace HsailISA Call(const Brig::BrigInstBase *ib, const BrigObject *obj) : HsailGPUStaticInst(obj, "call") { + setFlag(ALU); unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); op_offs = obj->getOperandPtr(ib->operands, 1); diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc index f1662430a..783689dd5 100644 --- a/src/arch/hsail/insts/main.cc +++ b/src/arch/hsail/insts/main.cc @@ -179,12 +179,13 @@ namespace HsailISA w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId); if (!refCount) { + setFlag(SystemScope); + setFlag(Release); + setFlag(GlobalSegment); // Notify Memory System of Kernel Completion // Kernel End = isKernel + isRelease w->status = Wavefront::S_RETURNING; GPUDynInstPtr local_mempacket = gpuDynInst; - local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE; - local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM; local_mempacket->useContinuation = false; local_mempacket->simdId = w->simdId; local_mempacket->wfSlotId = w->wfSlotId; diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc index 
97d4c902b..6a6928838 100644 --- a/src/arch/hsail/insts/mem.cc +++ b/src/arch/hsail/insts/mem.cc @@ -36,7 +36,6 @@ #include "arch/hsail/insts/mem.hh" #include "arch/hsail/Brig.h" -#include "enums/OpType.hh" using namespace Brig; @@ -44,68 +43,6 @@ namespace HsailISA { const char* atomicOpToString(BrigAtomicOperation brigOp); - Enums::MemOpType - brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp) - { - if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) { - switch (brigOp) { - case BRIG_ATOMIC_AND: - return Enums::MO_AAND; - case BRIG_ATOMIC_OR: - return Enums::MO_AOR; - case BRIG_ATOMIC_XOR: - return Enums::MO_AXOR; - case BRIG_ATOMIC_CAS: - return Enums::MO_ACAS; - case BRIG_ATOMIC_EXCH: - return Enums::MO_AEXCH; - case BRIG_ATOMIC_ADD: - return Enums::MO_AADD; - case BRIG_ATOMIC_WRAPINC: - return Enums::MO_AINC; - case BRIG_ATOMIC_WRAPDEC: - return Enums::MO_ADEC; - case BRIG_ATOMIC_MIN: - return Enums::MO_AMIN; - case BRIG_ATOMIC_MAX: - return Enums::MO_AMAX; - case BRIG_ATOMIC_SUB: - return Enums::MO_ASUB; - default: - fatal("Bad BrigAtomicOperation code %d\n", brigOp); - } - } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) { - switch (brigOp) { - case BRIG_ATOMIC_AND: - return Enums::MO_ANRAND; - case BRIG_ATOMIC_OR: - return Enums::MO_ANROR; - case BRIG_ATOMIC_XOR: - return Enums::MO_ANRXOR; - case BRIG_ATOMIC_CAS: - return Enums::MO_ANRCAS; - case BRIG_ATOMIC_EXCH: - return Enums::MO_ANREXCH; - case BRIG_ATOMIC_ADD: - return Enums::MO_ANRADD; - case BRIG_ATOMIC_WRAPINC: - return Enums::MO_ANRINC; - case BRIG_ATOMIC_WRAPDEC: - return Enums::MO_ANRDEC; - case BRIG_ATOMIC_MIN: - return Enums::MO_ANRMIN; - case BRIG_ATOMIC_MAX: - return Enums::MO_ANRMAX; - case BRIG_ATOMIC_SUB: - return Enums::MO_ANRSUB; - default: - fatal("Bad BrigAtomicOperation code %d\n", brigOp); - } - } else { - fatal("Bad BrigAtomicOpcode %d\n", brigOpCode); - } - } - const char* atomicOpToString(BrigAtomicOperation brigOp) { diff --git a/src/arch/hsail/insts/mem.hh 
b/src/arch/hsail/insts/mem.hh index acc8434be..e223c7cf5 100644 --- a/src/arch/hsail/insts/mem.hh +++ b/src/arch/hsail/insts/mem.hh @@ -96,6 +96,8 @@ namespace HsailISA { using namespace Brig; + setFlag(ALU); + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); dest.init(op_offs, obj); op_offs = obj->getOperandPtr(ib->operands, 1); @@ -211,143 +213,119 @@ namespace HsailISA Brig::BrigMemoryOrder memoryOrder; Brig::BrigMemoryScope memoryScope; unsigned int equivClass; - bool isArgLoad() - { - return segment == Brig::BRIG_SEGMENT_KERNARG || - segment == Brig::BRIG_SEGMENT_ARG; - } - void - initLd(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) + + LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) { using namespace Brig; - const BrigInstMem *ldst = (const BrigInstMem*)ib; + setFlag(MemoryRef); + setFlag(Load); - segment = (BrigSegment)ldst->segment; - memoryOrder = BRIG_MEMORY_ORDER_NONE; - memoryScope = BRIG_MEMORY_SCOPE_NONE; - equivClass = ldst->equivClass; + if (ib->opcode == BRIG_OPCODE_LD) { + const BrigInstMem *ldst = (const BrigInstMem*)ib; - switch (segment) { - case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_READ; - break; + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_READ; - break; + width = ldst->width; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_READ; - break; + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } else { + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_READ; - break; + 
segment = (BrigSegment)at->segment; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + memoryScope = (BrigMemoryScope)at->memoryScope; + equivClass = 0; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_READ; - break; + width = BRIG_WIDTH_1; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_READ; - break; + if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) + dest.init(op_offs, obj); - case BRIG_SEGMENT_KERNARG: - o_type = Enums::OT_KERN_READ; - break; + op_offs = obj->getOperandPtr(ib->operands,1); + addr.init(op_offs, obj); + } - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; + switch (memoryOrder) { + case BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + setFlag(Acquire); + break; + case BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); break; - default: - panic("Ld: segment %d not supported\n", segment); + fatal("LdInst has bad memory order type\n"); } - width = ldst->width; - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); - if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) - dest.init(op_offs, obj); - - op_offs = obj->getOperandPtr(ib->operands, 1); - addr.init(op_offs, obj); - } - - void - initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - { - using namespace Brig; - - const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - - segment = (BrigSegment)at->segment; - memoryOrder = (BrigMemoryOrder)at->memoryOrder; - memoryScope = (BrigMemoryScope)at->memoryScope; - equivClass = 0; + switch (memoryScope) { + case BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case 
BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("LdInst has bad memory scope type\n"); + } switch (segment) { case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_READ; + setFlag(GlobalSegment); break; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_READ; + setFlag(GroupSegment); break; - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_READ; + setFlag(PrivateSegment); break; - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_READ; + setFlag(ReadOnlySegment); break; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_READ; + setFlag(SpillSegment); break; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_READ; + setFlag(Flat); break; - case BRIG_SEGMENT_KERNARG: - o_type = Enums::OT_KERN_READ; + setFlag(KernArgSegment); break; - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; + setFlag(ArgSegment); break; - default: panic("Ld: segment %d not supported\n", segment); } - - width = BRIG_WIDTH_1; - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - const Brig::BrigOperand *brigOp = obj->getOperand(op_offs); - - if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER) - dest.init(op_offs, obj); - - op_offs = obj->getOperandPtr(ib->operands,1); - addr.init(op_offs, obj); - } - - LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - : HsailGPUStaticInst(obj, _opcode) - { - using namespace Brig; - - if (ib->opcode == BRIG_OPCODE_LD) { - initLd(ib, obj, _opcode); - } else { - initAtomicLd(ib, obj, _opcode); - } } int numSrcRegOperands() override @@ -473,7 +451,7 @@ namespace HsailISA if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); - if (isLocalMem()) { + if (this->isLocalMem()) { // load from shared memory *d = gpuDynInst->wavefront()->ldsChunk-> read(vaddr); @@ -488,8 +466,7 @@ namespace HsailISA if 
(gpuDynInst->computeUnit()->shader-> separate_acquire_release && - gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE) { + gpuDynInst->isAcquire()) { // if this load has acquire semantics, // set the response continuation function // to perform an Acquire request @@ -520,10 +497,9 @@ namespace HsailISA { // after the load has complete and if the load has acquire // semantics, issue an acquire request. - if (!isLocalMem()) { + if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE) { + && gpuDynInst->isAcquire()) { gpuDynInst->statusBitVector = VectorMask(1); gpuDynInst->useContinuation = false; // create request @@ -537,12 +513,6 @@ namespace HsailISA } public: - bool - isLocalMem() const override - { - return this->segment == Brig::BRIG_SEGMENT_GROUP; - } - bool isVectorRegister(int operandIndex) override { assert((operandIndex >= 0) && (operandIndex < getNumOperands())); @@ -731,128 +701,113 @@ namespace HsailISA Brig::BrigMemoryOrder memoryOrder; unsigned int equivClass; - void - initSt(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) + StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, + const char *_opcode) + : HsailGPUStaticInst(obj, _opcode) { using namespace Brig; - const BrigInstMem *ldst = (const BrigInstMem*)ib; + setFlag(MemoryRef); + setFlag(Store); - segment = (BrigSegment)ldst->segment; - memoryOrder = BRIG_MEMORY_ORDER_NONE; - memoryScope = BRIG_MEMORY_SCOPE_NONE; - equivClass = ldst->equivClass; + if (ib->opcode == BRIG_OPCODE_ST) { + const BrigInstMem *ldst = (const BrigInstMem*)ib; - switch (segment) { - case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_WRITE; - break; + segment = (BrigSegment)ldst->segment; + memoryOrder = BRIG_MEMORY_ORDER_NONE; + memoryScope = BRIG_MEMORY_SCOPE_NONE; + equivClass = ldst->equivClass; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_WRITE; - break; + 
unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + const BrigOperand *baseOp = obj->getOperand(op_offs); - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_WRITE; - break; + if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || + (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { + src.init(op_offs, obj); + } - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_WRITE; - break; + op_offs = obj->getOperandPtr(ib->operands, 1); + addr.init(op_offs, obj); + } else { + const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_WRITE; - break; + segment = (BrigSegment)at->segment; + memoryScope = (BrigMemoryScope)at->memoryScope; + memoryOrder = (BrigMemoryOrder)at->memoryOrder; + equivClass = 0; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_WRITE; - break; + unsigned op_offs = obj->getOperandPtr(ib->operands, 0); + addr.init(op_offs, obj); - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; - break; + op_offs = obj->getOperandPtr(ib->operands, 1); + src.init(op_offs, obj); + } + switch (memoryOrder) { + case BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: + setFlag(Acquire); + break; + case BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); + break; default: - panic("St: segment %d not supported\n", segment); + fatal("StInst has bad memory order type\n"); } - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - const BrigOperand *baseOp = obj->getOperand(op_offs); - - if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) || - (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) { - src.init(op_offs, obj); + switch (memoryScope) { + case BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case BRIG_MEMORY_SCOPE_WORKGROUP: + 
setFlag(WorkgroupScope); + break; + case BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("StInst has bad memory scope type\n"); } - op_offs = obj->getOperandPtr(ib->operands, 1); - addr.init(op_offs, obj); - } - - void - initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - { - using namespace Brig; - - const BrigInstAtomic *at = (const BrigInstAtomic*)ib; - - segment = (BrigSegment)at->segment; - memoryScope = (BrigMemoryScope)at->memoryScope; - memoryOrder = (BrigMemoryOrder)at->memoryOrder; - equivClass = 0; - switch (segment) { case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_WRITE; + setFlag(GlobalSegment); break; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_WRITE; + setFlag(GroupSegment); break; - case BRIG_SEGMENT_PRIVATE: - o_type = Enums::OT_PRIVATE_WRITE; + setFlag(PrivateSegment); break; - case BRIG_SEGMENT_READONLY: - o_type = Enums::OT_READONLY_WRITE; + setFlag(ReadOnlySegment); break; - case BRIG_SEGMENT_SPILL: - o_type = Enums::OT_SPILL_WRITE; + setFlag(SpillSegment); break; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_WRITE; + setFlag(Flat); break; - case BRIG_SEGMENT_ARG: - o_type = Enums::OT_ARG; + setFlag(ArgSegment); break; - default: panic("St: segment %d not supported\n", segment); } - - unsigned op_offs = obj->getOperandPtr(ib->operands, 0); - addr.init(op_offs, obj); - - op_offs = obj->getOperandPtr(ib->operands, 1); - src.init(op_offs, obj); - } - - StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, - const char *_opcode) - : HsailGPUStaticInst(obj, _opcode) - { - using namespace Brig; - - if (ib->opcode == BRIG_OPCODE_ST) { - initSt(ib, obj, _opcode); - } else { - initAtomicSt(ib, obj, _opcode); - } } int numDstRegOperands() override { return 0; } @@ -964,10 +919,9 @@ namespace HsailISA { // before performing a store, check if this store has // release semantics, and if so 
issue a release first - if (!isLocalMem()) { + if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_RELEASE) { + && gpuDynInst->isRelease()) { gpuDynInst->statusBitVector = VectorMask(1); gpuDynInst->execContinuation = &GPUStaticInst::execSt; @@ -987,12 +941,6 @@ namespace HsailISA execSt(gpuDynInst); } - bool - isLocalMem() const override - { - return this->segment == Brig::BRIG_SEGMENT_GROUP; - } - private: // execSt may be called through a continuation // if the store had release semantics. see comment for @@ -1020,7 +968,7 @@ namespace HsailISA if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0); - if (isLocalMem()) { + if (this->isLocalMem()) { //store to shared memory gpuDynInst->wavefront()->ldsChunk->write(vaddr, *d); @@ -1166,9 +1114,6 @@ namespace HsailISA } } - Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode, - Brig::BrigAtomicOperation brigOp); - template class AtomicInstBase : public HsailGPUStaticInst @@ -1183,7 +1128,6 @@ namespace HsailISA Brig::BrigAtomicOperation atomicOperation; Brig::BrigMemoryScope memoryScope; Brig::BrigOpcode opcode; - Enums::MemOpType opType; AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj, const char *_opcode) @@ -1198,21 +1142,106 @@ namespace HsailISA memoryOrder = (BrigMemoryOrder)at->memoryOrder; atomicOperation = (BrigAtomicOperation)at->atomicOperation; opcode = (BrigOpcode)ib->opcode; - opType = brigAtomicToMemOpType(opcode, atomicOperation); + + assert(opcode == Brig::BRIG_OPCODE_ATOMICNORET || + opcode == Brig::BRIG_OPCODE_ATOMIC); + + setFlag(MemoryRef); + + if (opcode == Brig::BRIG_OPCODE_ATOMIC) { + setFlag(AtomicReturn); + } else { + setFlag(AtomicNoReturn); + } + + switch (memoryOrder) { + case BRIG_MEMORY_ORDER_NONE: + setFlag(NoOrder); + break; + case BRIG_MEMORY_ORDER_RELAXED: + setFlag(RelaxedOrder); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE: 
+ setFlag(Acquire); + break; + case BRIG_MEMORY_ORDER_SC_RELEASE: + setFlag(Release); + break; + case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE: + setFlag(AcquireRelease); + break; + default: + fatal("AtomicInst has bad memory order type\n"); + } + + switch (memoryScope) { + case BRIG_MEMORY_SCOPE_NONE: + setFlag(NoScope); + break; + case BRIG_MEMORY_SCOPE_WORKITEM: + setFlag(WorkitemScope); + break; + case BRIG_MEMORY_SCOPE_WORKGROUP: + setFlag(WorkgroupScope); + break; + case BRIG_MEMORY_SCOPE_AGENT: + setFlag(DeviceScope); + break; + case BRIG_MEMORY_SCOPE_SYSTEM: + setFlag(SystemScope); + break; + default: + fatal("AtomicInst has bad memory scope type\n"); + } + + switch (atomicOperation) { + case Brig::BRIG_ATOMIC_AND: + setFlag(AtomicAnd); + break; + case Brig::BRIG_ATOMIC_OR: + setFlag(AtomicOr); + break; + case Brig::BRIG_ATOMIC_XOR: + setFlag(AtomicXor); + break; + case Brig::BRIG_ATOMIC_CAS: + setFlag(AtomicCAS); + break; + case Brig::BRIG_ATOMIC_EXCH: + setFlag(AtomicExch); + break; + case Brig::BRIG_ATOMIC_ADD: + setFlag(AtomicAdd); + break; + case Brig::BRIG_ATOMIC_WRAPINC: + setFlag(AtomicInc); + break; + case Brig::BRIG_ATOMIC_WRAPDEC: + setFlag(AtomicDec); + break; + case Brig::BRIG_ATOMIC_MIN: + setFlag(AtomicMin); + break; + case Brig::BRIG_ATOMIC_MAX: + setFlag(AtomicMax); + break; + case Brig::BRIG_ATOMIC_SUB: + setFlag(AtomicSub); + break; + default: + fatal("Bad BrigAtomicOperation code %d\n", atomicOperation); + } switch (segment) { case BRIG_SEGMENT_GLOBAL: - o_type = Enums::OT_GLOBAL_ATOMIC; + setFlag(GlobalSegment); break; - case BRIG_SEGMENT_GROUP: - o_type = Enums::OT_SHARED_ATOMIC; + setFlag(GroupSegment); break; - case BRIG_SEGMENT_FLAT: - o_type = Enums::OT_FLAT_ATOMIC; + setFlag(Flat); break; - default: panic("Atomic: segment %d not supported\n", segment); } @@ -1354,11 +1383,10 @@ namespace HsailISA { // before doing the RMW, check if this atomic has // release semantics, and if so issue a release first - if (!isLocalMem()) { + if 
(!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && (gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) { + && (gpuDynInst->isRelease() + || gpuDynInst->isAcquireRelease())) { gpuDynInst->statusBitVector = VectorMask(1); @@ -1383,12 +1411,6 @@ namespace HsailISA void execute(GPUDynInstPtr gpuDynInst) override; - bool - isLocalMem() const override - { - return this->segment == Brig::BRIG_SEGMENT_GROUP; - } - private: // execAtomic may be called through a continuation // if the RMW had release semantics. see comment for @@ -1408,72 +1430,48 @@ namespace HsailISA if (gpuDynInst->exec_mask[i]) { Addr vaddr = gpuDynInst->addr[i]; - if (isLocalMem()) { + if (this->isLocalMem()) { Wavefront *wavefront = gpuDynInst->wavefront(); *d = wavefront->ldsChunk->read(vaddr); - switch (this->opType) { - case Enums::MO_AADD: - case Enums::MO_ANRADD: + if (this->isAtomicAdd()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) + (*e)); - break; - case Enums::MO_ASUB: - case Enums::MO_ANRSUB: + } else if (this->isAtomicSub()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) - (*e)); - break; - case Enums::MO_AMAX: - case Enums::MO_ANRMAX: + } else if (this->isAtomicMax()) { wavefront->ldsChunk->write(vaddr, std::max(wavefront->ldsChunk->read(vaddr), (*e))); - break; - case Enums::MO_AMIN: - case Enums::MO_ANRMIN: + } else if (this->isAtomicMin()) { wavefront->ldsChunk->write(vaddr, std::min(wavefront->ldsChunk->read(vaddr), (*e))); - break; - case Enums::MO_AAND: - case Enums::MO_ANRAND: + } else if (this->isAtomicAnd()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) & (*e)); - break; - case Enums::MO_AOR: - case Enums::MO_ANROR: + } else if (this->isAtomicOr()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) | (*e)); - break; - case Enums::MO_AXOR: - case Enums::MO_ANRXOR: + } else 
if (this->isAtomicXor()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) ^ (*e)); - break; - case Enums::MO_AINC: - case Enums::MO_ANRINC: + } else if (this->isAtomicInc()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) + 1); - break; - case Enums::MO_ADEC: - case Enums::MO_ANRDEC: + } else if (this->isAtomicDec()) { wavefront->ldsChunk->write(vaddr, wavefront->ldsChunk->read(vaddr) - 1); - break; - case Enums::MO_AEXCH: - case Enums::MO_ANREXCH: + } else if (this->isAtomicExch()) { wavefront->ldsChunk->write(vaddr, (*e)); - break; - case Enums::MO_ACAS: - case Enums::MO_ANRCAS: + } else if (this->isAtomicCAS()) { wavefront->ldsChunk->write(vaddr, (wavefront->ldsChunk->read(vaddr) == (*e)) ? (*f) : wavefront->ldsChunk->read(vaddr)); - break; - default: + } else { fatal("Unrecognized or invalid HSAIL atomic op " "type.\n"); - break; } } else { Request *req = @@ -1481,7 +1479,7 @@ namespace HsailISA gpuDynInst->computeUnit()->masterId(), 0, gpuDynInst->wfDynId, gpuDynInst->makeAtomicOpFunctor(e, - f, this->opType)); + f)); gpuDynInst->setRequestFlags(req); PacketPtr pkt = new Packet(req, MemCmd::SwapReq); @@ -1489,8 +1487,7 @@ namespace HsailISA if (gpuDynInst->computeUnit()->shader-> separate_acquire_release && - (gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE)) { + (gpuDynInst->isAcquire())) { // if this atomic has acquire semantics, // schedule the continuation to perform an // acquire after the RMW completes @@ -1523,10 +1520,9 @@ namespace HsailISA { // after performing the RMW, check to see if this instruction // has acquire semantics, and if so, issue an acquire - if (!isLocalMem()) { + if (!this->isLocalMem()) { if (gpuDynInst->computeUnit()->shader->separate_acquire_release - && gpuDynInst->memoryOrder == - Enums::MEMORY_ORDER_SC_ACQUIRE) { + && gpuDynInst->isAcquire()) { gpuDynInst->statusBitVector = VectorMask(1); // the request will be finished when diff --git a/src/arch/hsail/insts/mem_impl.hh 
b/src/arch/hsail/insts/mem_impl.hh index e3529f914..c175f2782 100644 --- a/src/arch/hsail/insts/mem_impl.hh +++ b/src/arch/hsail/insts/mem_impl.hh @@ -33,7 +33,6 @@ * Author: Steve Reinhardt */ -#include "arch/hsail/generic_types.hh" #include "gpu-compute/hsail_code.hh" // defined in code.cc, but not worth sucking in all of code.h for this @@ -215,16 +214,12 @@ namespace HsailISA this->addr.calcVector(w, m->addr); - m->m_op = Enums::MO_LD; m->m_type = MemDataType::memType; m->v_type = DestDataType::vgprType; m->exec_mask = w->execMask(); m->statusBitVector = 0; m->equiv = this->equivClass; - m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); - - m->scope = getGenericMemoryScope(this->memoryScope); if (num_dest_operands == 1) { m->dst_reg = this->dest.regIndex(); @@ -245,7 +240,6 @@ namespace HsailISA switch (this->segment) { case Brig::BRIG_SEGMENT_GLOBAL: - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); @@ -276,7 +270,6 @@ namespace HsailISA case Brig::BRIG_SEGMENT_SPILL: assert(num_dest_operands == 1); - m->s_type = SEG_SPILL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -301,7 +294,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_GROUP: - m->s_type = SEG_SHARED; m->pipeId = LDSMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(24)); w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); @@ -310,7 +302,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_READONLY: - m->s_type = SEG_READONLY; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); @@ -327,7 +318,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_PRIVATE: - m->s_type = SEG_PRIVATE; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -408,7 +398,6 @@ namespace HsailISA } } - m->m_op = Enums::MO_ST; m->m_type = OperationType::memType; m->v_type = OperationType::vgprType; @@ -421,10 +410,6 @@ namespace HsailISA m->n_reg = num_src_operands; } - 
m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); - - m->scope = getGenericMemoryScope(this->memoryScope); - m->simdId = w->simdId; m->wfSlotId = w->wfSlotId; m->wfDynId = w->wfDynId; @@ -434,7 +419,6 @@ namespace HsailISA switch (this->segment) { case Brig::BRIG_SEGMENT_GLOBAL: - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); @@ -463,7 +447,6 @@ namespace HsailISA case Brig::BRIG_SEGMENT_SPILL: assert(num_src_operands == 1); - m->s_type = SEG_SPILL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -483,7 +466,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_GROUP: - m->s_type = SEG_SHARED; m->pipeId = LDSMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(24)); w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); @@ -492,7 +474,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_PRIVATE: - m->s_type = SEG_PRIVATE; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); { @@ -586,7 +567,6 @@ namespace HsailISA assert(NumSrcOperands <= 2); - m->m_op = this->opType; m->m_type = DataType::memType; m->v_type = DataType::vgprType; @@ -594,9 +574,6 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; // atomics don't have an equivalence class operand m->n_reg = 1; - m->memoryOrder = getGenericMemoryOrder(this->memoryOrder); - - m->scope = getGenericMemoryScope(this->memoryScope); if (HasDst) { m->dst_reg = this->dest.regIndex(); @@ -611,7 +588,6 @@ namespace HsailISA switch (this->segment) { case Brig::BRIG_SEGMENT_GLOBAL: - m->s_type = SEG_GLOBAL; m->latency.set(w->computeUnit->shader->ticks(64)); m->pipeId = GLBMEM_PIPE; @@ -623,7 +599,6 @@ namespace HsailISA break; case Brig::BRIG_SEGMENT_GROUP: - m->s_type = SEG_SHARED; m->pipeId = LDSMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(24)); w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m); diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc 
index 2bfc5aaad..bfffb7d8f 100644 --- a/src/arch/hsail/insts/pseudo_inst.cc +++ b/src/arch/hsail/insts/pseudo_inst.cc @@ -627,8 +627,12 @@ namespace HsailISA ((int*)m->a_data)[lane] = src1.get(w, lane, 3); } - m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, - Brig::BRIG_ATOMIC_ADD); + setFlag(AtomicNoReturn); + setFlag(AtomicAdd); + setFlag(NoScope); + setFlag(NoOrder); + setFlag(GlobalSegment); + m->m_type = U32::memType; m->v_type = U32::vgprType; @@ -636,15 +640,12 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; // atomics don't have an equivalence class operand m->n_reg = 1; - m->memoryOrder = Enums::MEMORY_ORDER_NONE; - m->scope = Enums::MEMORY_SCOPE_NONE; m->simdId = w->simdId; m->wfSlotId = w->wfSlotId; m->wfDynId = w->wfDynId; m->latency.init(&w->computeUnit->shader->tick_cnt); - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(64)); w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); @@ -666,8 +667,12 @@ namespace HsailISA ((int*)m->a_data)[lane] = src1.get(w, lane, 1); } - m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET, - Brig::BRIG_ATOMIC_ADD); + setFlag(AtomicNoReturn); + setFlag(AtomicAdd); + setFlag(NoScope); + setFlag(NoOrder); + setFlag(GlobalSegment); + m->m_type = U32::memType; m->v_type = U32::vgprType; @@ -675,15 +680,12 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; // atomics don't have an equivalence class operand m->n_reg = 1; - m->memoryOrder = Enums::MEMORY_ORDER_NONE; - m->scope = Enums::MEMORY_SCOPE_NONE; m->simdId = w->simdId; m->wfSlotId = w->wfSlotId; m->wfDynId = w->wfDynId; m->latency.init(&w->computeUnit->shader->tick_cnt); - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(64)); w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); @@ -702,7 +704,11 @@ namespace HsailISA // calculate the address calcAddr(w, m); - m->m_op = Enums::MO_LD; + setFlag(Load); + setFlag(NoScope); + 
setFlag(NoOrder); + setFlag(GlobalSegment); + m->m_type = U32::memType; //MemDataType::memType; m->v_type = U32::vgprType; //DestDataType::vgprType; @@ -710,8 +716,6 @@ namespace HsailISA m->statusBitVector = 0; m->equiv = 0; m->n_reg = 1; - m->memoryOrder = Enums::MEMORY_ORDER_NONE; - m->scope = Enums::MEMORY_SCOPE_NONE; // FIXME //m->dst_reg = this->dest.regIndex(); @@ -721,7 +725,6 @@ namespace HsailISA m->wfDynId = w->wfDynId; m->latency.init(&w->computeUnit->shader->tick_cnt); - m->s_type = SEG_GLOBAL; m->pipeId = GLBMEM_PIPE; m->latency.set(w->computeUnit->shader->ticks(1)); w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m); diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py index f580a09f7..b672f616c 100644 --- a/src/gpu-compute/GPU.py +++ b/src/gpu-compute/GPU.py @@ -171,56 +171,6 @@ class GpuDispatcher(DmaDevice): cl_driver = Param.ClDriver('pointer to driver') -class OpType(Enum): vals = [ - 'OT_NULL', - 'OT_ALU', - 'OT_SPECIAL', - 'OT_GLOBAL_READ', - 'OT_GLOBAL_WRITE', - 'OT_GLOBAL_ATOMIC', - 'OT_GLOBAL_HIST', - 'OT_GLOBAL_LDAS', - 'OT_SHARED_READ', - 'OT_SHARED_WRITE', - 'OT_SHARED_ATOMIC', - 'OT_SHARED_HIST', - 'OT_SHARED_LDAS', - 'OT_PRIVATE_READ', - 'OT_PRIVATE_WRITE', - 'OT_PRIVATE_ATOMIC', - 'OT_PRIVATE_HIST', - 'OT_PRIVATE_LDAS', - 'OT_SPILL_READ', - 'OT_SPILL_WRITE', - 'OT_SPILL_ATOMIC', - 'OT_SPILL_HIST', - 'OT_SPILL_LDAS', - 'OT_READONLY_READ', - 'OT_READONLY_WRITE', - 'OT_READONLY_ATOMIC', - 'OT_READONLY_HIST', - 'OT_READONLY_LDAS', - 'OT_FLAT_READ', - 'OT_FLAT_WRITE', - 'OT_FLAT_ATOMIC', - 'OT_FLAT_HIST', - 'OT_FLAT_LDAS', - 'OT_KERN_READ', - 'OT_BRANCH', - - # note: Only the OT_BOTH_MEMFENCE seems to be supported in the 1.0F version - # of the compiler. 
- 'OT_SHARED_MEMFENCE', - 'OT_GLOBAL_MEMFENCE', - 'OT_BOTH_MEMFENCE', - - 'OT_BARRIER', - 'OT_PRINT', - 'OT_RET', - 'OT_NOP', - 'OT_ARG' - ] - class MemType(Enum): vals = [ 'M_U8', 'M_U16', @@ -235,47 +185,6 @@ class MemType(Enum): vals = [ 'M_F64', ] -class MemOpType(Enum): vals = [ - 'MO_LD', - 'MO_ST', - 'MO_LDAS', - 'MO_LDA', - 'MO_AAND', - 'MO_AOR', - 'MO_AXOR', - 'MO_ACAS', - 'MO_AEXCH', - 'MO_AADD', - 'MO_ASUB', - 'MO_AINC', - 'MO_ADEC', - 'MO_AMAX', - 'MO_AMIN', - 'MO_ANRAND', - 'MO_ANROR', - 'MO_ANRXOR', - 'MO_ANRCAS', - 'MO_ANREXCH', - 'MO_ANRADD', - 'MO_ANRSUB', - 'MO_ANRINC', - 'MO_ANRDEC', - 'MO_ANRMAX', - 'MO_ANRMIN', - 'MO_HAND', - 'MO_HOR', - 'MO_HXOR', - 'MO_HCAS', - 'MO_HEXCH', - 'MO_HADD', - 'MO_HSUB', - 'MO_HINC', - 'MO_HDEC', - 'MO_HMAX', - 'MO_HMIN', - 'MO_UNDEF' - ] - class StorageClassType(Enum): vals = [ 'SC_SPILL', 'SC_GLOBAL', @@ -293,20 +202,3 @@ class RegisterType(Enum): vals = [ 'RT_HARDWARE', 'RT_NONE', ] - -class GenericMemoryOrder(Enum): vals = [ - 'MEMORY_ORDER_NONE', - 'MEMORY_ORDER_RELAXED', - 'MEMORY_ORDER_SC_ACQUIRE', - 'MEMORY_ORDER_SC_RELEASE', - 'MEMORY_ORDER_SC_ACQUIRE_RELEASE', - ] - -class GenericMemoryScope(Enum): vals = [ - 'MEMORY_SCOPE_NONE', - 'MEMORY_SCOPE_WORKITEM', - 'MEMORY_SCOPE_WAVEFRONT', - 'MEMORY_SCOPE_WORKGROUP', - 'MEMORY_SCOPE_DEVICE', - 'MEMORY_SCOPE_SYSTEM', - ] diff --git a/src/gpu-compute/GPUStaticInstFlags.py b/src/gpu-compute/GPUStaticInstFlags.py new file mode 100644 index 000000000..453fdced2 --- /dev/null +++ b/src/gpu-compute/GPUStaticInstFlags.py @@ -0,0 +1,111 @@ +# Copyright (c) 2016 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Authors: Anthony Gutierrez + +from m5.params import * + +class GPUStaticInstFlags(Enum): + wrapper_name = 'GPUStaticInstFlags' + wrapper_is_struct = True + enum_name = 'Flags' + + vals = [ + # Op types + 'ALU', # ALU op + 'Branch', # Branch instruction + 'Nop', # No-op (no effect at all) + 'Return', # Return instruction + 'UnconditionalJump', # + 'SpecialOp', # Special op + 'Waitcnt', # Is a waitcnt instruction + + # Memory ops + 'MemBarrier', # Barrier instruction + 'MemFence', # Memory fence instruction + 'MemoryRef', # References memory (load, store, or atomic) + 'Flat', # Flat memory op + 'Load', # Reads from memory + 'Store', # Writes to memory + + # Atomic ops + 'AtomicReturn', # Atomic instruction that returns data + 'AtomicNoReturn', # Atomic instruction that doesn't return data + + # Instruction attributes + 'Scalar', # A scalar (not vector) operation + 'ReadsSCC', # The instruction reads SCC + 'WritesSCC', # The instruction writes SCC + 'ReadsVCC', # The instruction reads VCC + 'WritesVCC', # The instruction writes VCC + + # Atomic OP types + 'AtomicAnd', + 'AtomicOr', + 'AtomicXor', + 'AtomicCAS', + 'AtomicExch', + 'AtomicAdd', + 'AtomicSub', + 'AtomicInc', + 'AtomicDec', + 'AtomicMax', + 'AtomicMin', + + # Memory order flags + 'RelaxedOrder', + 'Acquire', # Has acquire semantics + 'Release', # Has release semantics + 'AcquireRelease', # Has acquire and release semantics + 'NoOrder', # Has no ordering restrictions + + # Segment access flags + 'ArgSegment', # Accesses the arg segment + 'GlobalSegment', # Accesses global memory + 'GroupSegment', # Accesses local memory (LDS), aka shared memory + 'KernArgSegment', # Accesses the kernel argument segment + 'PrivateSegment', # Accesses the private segment + 'ReadOnlySegment', # Accesses read only memory + 'SpillSegment', # Accesses the spill segment + 'NoSegment', # Does not have an associated segment + + # Scope flags + 'WorkitemScope', + 'WavefrontScope', + 'WorkgroupScope', + 'DeviceScope', + 
'SystemScope', + 'NoScope', # Does not have an associated scope + + # Coherence flags + 'GloballyCoherent', # Coherent with other workitems on same device + 'SystemCoherent' # Coherent with a different device, or the host + ] diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript index 88c1cf036..8cf1ed8cf 100644 --- a/src/gpu-compute/SConscript +++ b/src/gpu-compute/SConscript @@ -41,6 +41,7 @@ if not env['BUILD_GPU']: Return() SimObject('GPU.py') +SimObject('GPUStaticInstFlags.py') SimObject('LdsState.py') SimObject('X86GPUTLB.py') diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh deleted file mode 100644 index 6cd9bfe26..000000000 --- a/src/gpu-compute/code_enums.hh +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2015 Advanced Micro Devices, Inc. - * All rights reserved. - * - * For use for simulation and test purposes only - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * Author: Anthony Gutierrez - */ - -#ifndef __CODE_ENUMS_HH__ -#define __CODE_ENUMS_HH__ - -#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \ - && (a)<=Enums::OT_GLOBAL_LDAS) -#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \ - && (a)<=Enums::OT_SHARED_LDAS) -#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \ - && (a)<=Enums::OT_PRIVATE_LDAS) -#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \ - && (a)<=Enums::OT_SPILL_LDAS) -#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \ - && (a)<=Enums::OT_READONLY_LDAS) -#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS) - -#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \ - ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \ - ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS) - -#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \ - ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ) - -#define IS_OT_READ_GM(a) \ - ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \ - ||(a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ) - -#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ) - -#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ) - -#define IS_OT_WRITE(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \ - 
||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE) - -#define IS_OT_WRITE_GM(a) \ - ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \ - ||(a)==Enums::OT_READONLY_WRITE) - -#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE) - -#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE) - -#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SHARED_ATOMIC \ - ||(a)==Enums::OT_PRIVATE_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_BOTH_MEMFENCE \ - ||(a)==Enums::OT_FLAT_ATOMIC) - -#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \ - ||(a)==Enums::OT_SPILL_ATOMIC \ - ||(a)==Enums::OT_READONLY_ATOMIC \ - ||(a)==Enums::OT_GLOBAL_MEMFENCE \ - ||(a)==Enums::OT_BOTH_MEMFENCE) - -#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \ - ||(a)==Enums::OT_SHARED_MEMFENCE) - -#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC) - -#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SHARED_HIST \ - ||(a)==Enums::OT_PRIVATE_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST \ - ||(a)==Enums::OT_FLAT_HIST) - -#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \ - ||(a)==Enums::OT_SPILL_HIST \ - ||(a)==Enums::OT_READONLY_HIST) - -#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST) - -#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST) - -#endif // __CODE_ENUMS_HH__ diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 97e018713..abf8ff2c5 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -75,7 +75,8 @@ ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p), req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()), resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()), _masterId(p->system->getMasterId(name() + ".ComputeUnit")), - lds(*p->localDataStore), 
globalSeqNum(0), wavefrontSize(p->wfSize) + lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize), + kernelLaunchInst(new KernelLaunchStaticInst()) { /** * This check is necessary because std::bitset only provides conversion @@ -316,13 +317,11 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) // Send L1 cache acquire // isKernel + isAcquire = Kernel Begin if (shader->impl_kern_boundary_sync) { - GPUDynInstPtr gpuDynInst = std::make_shared(this, - nullptr, - nullptr, 0); + GPUDynInstPtr gpuDynInst = + std::make_shared(this, nullptr, kernelLaunchInst, + getAndIncSeqNum()); gpuDynInst->useContinuation = false; - gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE; - gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM; injectGlobalMemFence(gpuDynInst, true); } @@ -647,7 +646,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) gpuDynInst->wfSlotId, w->barrierCnt); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -658,7 +657,7 @@ ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt) return true; } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -942,6 +941,8 @@ void ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, Request* req) { + assert(gpuDynInst->isGlobalSeg()); + if (!req) { req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId); } @@ -950,8 +951,6 @@ ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch, req->setFlags(Request::KERNEL); } - gpuDynInst->s_type = SEG_GLOBAL; - // for non-kernel MemFence operations, memorder flags are set depending // on which type of request is currently being sent, so this // should be 
set by the caller (e.g. if an inst has acq-rel @@ -1033,8 +1032,7 @@ ComputeUnit::DataPort::MemRespEvent::process() if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST) gpuDynInst->statusVector.clear(); - if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op) - || MO_ANR(gpuDynInst->m_op)) { + if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) { assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy()); compute_unit->globalMemoryPipe.getGMLdRespFIFO() @@ -1055,7 +1053,7 @@ ComputeUnit::DataPort::MemRespEvent::process() // the continuation may generate more work for // this memory request if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } @@ -1065,7 +1063,7 @@ ComputeUnit::DataPort::MemRespEvent::process() gpuDynInst->statusBitVector = VectorMask(0); if (gpuDynInst->useContinuation) { - assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE); + assert(!gpuDynInst->isNoScope()); gpuDynInst->execContinuation(gpuDynInst->staticInstruction(), gpuDynInst); } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index a3547402a..938658fd1 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -744,6 +744,7 @@ class ComputeUnit : public MemObject private: uint64_t globalSeqNum; int wavefrontSize; + GPUStaticInst *kernelLaunchInst; }; #endif // __COMPUTE_UNIT_HH__ diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc index 102905ec8..ab3e8c47e 100644 --- a/src/gpu-compute/global_memory_pipeline.cc +++ b/src/gpu-compute/global_memory_pipeline.cc @@ -67,7 +67,7 @@ GlobalMemPipeline::exec() bool accessVrf = true; // check the VRF to see if the operands of a load (or load component // of an atomic) are accessible - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && (m->isLoad() || m->isAtomicRet())) 
{ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -127,10 +127,7 @@ GlobalMemPipeline::exec() // memory packets to DTLB if (!gmIssuedRequests.empty()) { GPUDynInstPtr mp = gmIssuedRequests.front(); - if (mp->m_op == Enums::MO_LD || - (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) || - (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) { - + if (mp->isLoad() || mp->isAtomic()) { if (inflightLoads >= gmQueueSize) { return; } else { @@ -139,7 +136,7 @@ GlobalMemPipeline::exec() } else { if (inflightStores >= gmQueueSize) { return; - } else if (mp->m_op == Enums::MO_ST) { + } else if (mp->isStore()) { ++inflightStores; } } @@ -147,9 +144,8 @@ GlobalMemPipeline::exec() mp->initiateAcc(mp); gmIssuedRequests.pop(); - DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n", - computeUnit->cu_id, mp->simdId, mp->wfSlotId, - Enums::MemOpTypeStrings[mp->m_op]); + DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n", + computeUnit->cu_id, mp->simdId, mp->wfSlotId); } } @@ -160,12 +156,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { gmReturnedLoads.pop(); assert(inflightLoads > 0); --inflightLoads; - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector regVec; // iterate over number of destination register operands since // this is a load or atomic operation @@ -214,13 +210,12 @@ GlobalMemPipeline::doGmReturn(GPUDynInstPtr m) // Decrement outstanding register count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) || - MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time, -1); } - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) 
|| MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time, -1); } diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc index 1806e79e4..ec6340360 100644 --- a/src/gpu-compute/gpu_dyn_inst.cc +++ b/src/gpu-compute/gpu_dyn_inst.cc @@ -41,11 +41,10 @@ #include "gpu-compute/wavefront.hh" GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, - GPUStaticInst *_staticInst, uint64_t instSeqNum) + GPUStaticInst *static_inst, uint64_t instSeqNum) : GPUExecContext(_cu, _wf), addr(computeUnit()->wfSize(), (Addr)0), - m_op(Enums::MO_UNDEF), - memoryOrder(Enums::MEMORY_ORDER_NONE), n_reg(0), useContinuation(false), - statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum) + n_reg(0), useContinuation(false), + statusBitVector(0), _staticInst(static_inst), _seqNum(instSeqNum) { tlbHitLevel.assign(computeUnit()->wfSize(), -1); d_data = new uint8_t[computeUnit()->wfSize() * 16]; @@ -68,77 +67,69 @@ GPUDynInst::~GPUDynInst() } void -GPUDynInst::execute() +GPUDynInst::execute(GPUDynInstPtr gpuDynInst) { - GPUDynInstPtr gpuDynInst = std::make_shared(cu, wf, staticInst, - _seqNum); - staticInst->execute(gpuDynInst); + _staticInst->execute(gpuDynInst); } int GPUDynInst::numSrcRegOperands() { - return staticInst->numSrcRegOperands(); + return _staticInst->numSrcRegOperands(); } int GPUDynInst::numDstRegOperands() { - return staticInst->numDstRegOperands(); + return _staticInst->numDstRegOperands(); } int GPUDynInst::getNumOperands() { - return staticInst->getNumOperands(); + return _staticInst->getNumOperands(); } bool GPUDynInst::isVectorRegister(int operandIdx) { - return staticInst->isVectorRegister(operandIdx); + return _staticInst->isVectorRegister(operandIdx); } bool GPUDynInst::isScalarRegister(int operandIdx) { - return staticInst->isScalarRegister(operandIdx); + return _staticInst->isScalarRegister(operandIdx); } int GPUDynInst::getRegisterIndex(int operandIdx) { - return 
staticInst->getRegisterIndex(operandIdx); + return _staticInst->getRegisterIndex(operandIdx); } int GPUDynInst::getOperandSize(int operandIdx) { - return staticInst->getOperandSize(operandIdx); + return _staticInst->getOperandSize(operandIdx); } bool GPUDynInst::isDstOperand(int operandIdx) { - return staticInst->isDstOperand(operandIdx); + return _staticInst->isDstOperand(operandIdx); } bool GPUDynInst::isSrcOperand(int operandIdx) { - return staticInst->isSrcOperand(operandIdx); -} - -bool -GPUDynInst::isArgLoad() -{ - return staticInst->isArgLoad(); + return _staticInst->isSrcOperand(operandIdx); } const std::string& GPUDynInst::disassemble() const { - return staticInst->disassemble(); + return _staticInst->disassemble(); } uint64_t @@ -147,16 +138,10 @@ GPUDynInst::seqNum() const return _seqNum; } -Enums::OpType -GPUDynInst::opType() -{ - return staticInst->o_type; -} - Enums::StorageClassType GPUDynInst::executedAs() { - return staticInst->executed_as; + return _staticInst->executed_as; } // Process a memory instruction and (if necessary) submit timing request @@ -166,20 +151,347 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst) DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n", cu->cu_id, simdId, wfSlotId, exec_mask); - staticInst->initiateAcc(gpuDynInst); + _staticInst->initiateAcc(gpuDynInst); time = 0; } +/** + * accessor methods for the attributes of + * the underlying GPU static instruction + */ +bool +GPUDynInst::isALU() const +{ + return _staticInst->isALU(); +} + +bool +GPUDynInst::isBranch() const +{ + return _staticInst->isBranch(); +} + +bool +GPUDynInst::isNop() const +{ + return _staticInst->isNop(); +} + +bool +GPUDynInst::isReturn() const +{ + return _staticInst->isReturn(); +} + +bool +GPUDynInst::isUnconditionalJump() const +{ + return _staticInst->isUnconditionalJump(); +} + +bool +GPUDynInst::isSpecialOp() const +{ + return _staticInst->isSpecialOp(); +} + +bool +GPUDynInst::isWaitcnt() const +{ + return 
_staticInst->isWaitcnt(); +} + +bool +GPUDynInst::isBarrier() const +{ + return _staticInst->isBarrier(); +} + +bool +GPUDynInst::isMemFence() const +{ + return _staticInst->isMemFence(); +} + +bool +GPUDynInst::isMemRef() const +{ + return _staticInst->isMemRef(); +} + +bool +GPUDynInst::isFlat() const +{ + return _staticInst->isFlat(); +} + +bool +GPUDynInst::isLoad() const +{ + return _staticInst->isLoad(); +} + +bool +GPUDynInst::isStore() const +{ + return _staticInst->isStore(); +} + +bool +GPUDynInst::isAtomic() const +{ + return _staticInst->isAtomic(); +} + +bool +GPUDynInst::isAtomicNoRet() const +{ + return _staticInst->isAtomicNoRet(); +} + +bool +GPUDynInst::isAtomicRet() const +{ + return _staticInst->isAtomicRet(); +} + +bool +GPUDynInst::isScalar() const +{ + return _staticInst->isScalar(); +} + +bool +GPUDynInst::readsSCC() const +{ + return _staticInst->readsSCC(); +} + +bool +GPUDynInst::writesSCC() const +{ + return _staticInst->writesSCC(); +} + +bool +GPUDynInst::readsVCC() const +{ + return _staticInst->readsVCC(); +} + +bool +GPUDynInst::writesVCC() const +{ + return _staticInst->writesVCC(); +} + +bool +GPUDynInst::isAtomicAnd() const +{ + return _staticInst->isAtomicAnd(); +} + +bool +GPUDynInst::isAtomicOr() const +{ + return _staticInst->isAtomicOr(); +} + +bool +GPUDynInst::isAtomicXor() const +{ + return _staticInst->isAtomicXor(); +} + +bool +GPUDynInst::isAtomicCAS() const +{ + return _staticInst->isAtomicCAS(); +} + +bool GPUDynInst::isAtomicExch() const +{ + return _staticInst->isAtomicExch(); +} + +bool +GPUDynInst::isAtomicAdd() const +{ + return _staticInst->isAtomicAdd(); +} + +bool +GPUDynInst::isAtomicSub() const +{ + return _staticInst->isAtomicSub(); +} + +bool +GPUDynInst::isAtomicInc() const +{ + return _staticInst->isAtomicInc(); +} + +bool +GPUDynInst::isAtomicDec() const +{ + return _staticInst->isAtomicDec(); +} + +bool +GPUDynInst::isAtomicMax() const +{ + return _staticInst->isAtomicMax(); +} + +bool 
+GPUDynInst::isAtomicMin() const +{ + return _staticInst->isAtomicMin(); +} + +bool +GPUDynInst::isArgLoad() const +{ + return _staticInst->isArgLoad(); +} + +bool +GPUDynInst::isGlobalMem() const +{ + return _staticInst->isGlobalMem(); +} + +bool +GPUDynInst::isLocalMem() const +{ + return _staticInst->isLocalMem(); +} + +bool +GPUDynInst::isArgSeg() const +{ + return _staticInst->isArgSeg(); +} + +bool +GPUDynInst::isGlobalSeg() const +{ + return _staticInst->isGlobalSeg(); +} + +bool +GPUDynInst::isGroupSeg() const +{ + return _staticInst->isGroupSeg(); +} + +bool +GPUDynInst::isKernArgSeg() const +{ + return _staticInst->isKernArgSeg(); +} + +bool +GPUDynInst::isPrivateSeg() const +{ + return _staticInst->isPrivateSeg(); +} + +bool +GPUDynInst::isReadOnlySeg() const +{ + return _staticInst->isReadOnlySeg(); +} + +bool +GPUDynInst::isSpillSeg() const +{ + return _staticInst->isSpillSeg(); +} + +bool +GPUDynInst::isWorkitemScope() const +{ + return _staticInst->isWorkitemScope(); +} + +bool +GPUDynInst::isWavefrontScope() const +{ + return _staticInst->isWavefrontScope(); +} + +bool +GPUDynInst::isWorkgroupScope() const +{ + return _staticInst->isWorkgroupScope(); +} + +bool +GPUDynInst::isDeviceScope() const +{ + return _staticInst->isDeviceScope(); +} + +bool +GPUDynInst::isSystemScope() const +{ + return _staticInst->isSystemScope(); +} + +bool +GPUDynInst::isNoScope() const +{ + return _staticInst->isNoScope(); +} + +bool +GPUDynInst::isRelaxedOrder() const +{ + return _staticInst->isRelaxedOrder(); +} + +bool +GPUDynInst::isAcquire() const +{ + return _staticInst->isAcquire(); +} + +bool +GPUDynInst::isRelease() const +{ + return _staticInst->isRelease(); +} + +bool +GPUDynInst::isAcquireRelease() const +{ + return _staticInst->isAcquireRelease(); +} + +bool +GPUDynInst::isNoOrder() const +{ + return _staticInst->isNoOrder(); +} + +bool +GPUDynInst::isGloballyCoherent() const +{ + return _staticInst->isGloballyCoherent(); +} + bool -GPUDynInst::scalarOp() 
const +GPUDynInst::isSystemCoherent() const { - return staticInst->scalarOp(); + return _staticInst->isSystemCoherent(); } void GPUDynInst::updateStats() { - if (staticInst->isLocalMem()) { + if (_staticInst->isLocalMem()) { // access to LDS (shared) memory cu->dynamicLMemInstrCnt++; } else { diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh index 46774d867..c07d85d78 100644 --- a/src/gpu-compute/gpu_dyn_inst.hh +++ b/src/gpu-compute/gpu_dyn_inst.hh @@ -39,11 +39,7 @@ #include #include -#include "enums/GenericMemoryOrder.hh" -#include "enums/GenericMemoryScope.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" -#include "enums/OpType.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_exec_context.hh" @@ -180,33 +176,19 @@ class AtomicOpMin : public TypedAtomicOpFunctor } }; -#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN) -#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN) -#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN) - typedef enum { VT_32, VT_64, } vgpr_type; -typedef enum -{ - SEG_PRIVATE, - SEG_SPILL, - SEG_GLOBAL, - SEG_SHARED, - SEG_READONLY, - SEG_FLAT -} seg_type; - class GPUDynInst : public GPUExecContext { public: - GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst, + GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst, uint64_t instSeqNum); ~GPUDynInst(); - void execute(); + void execute(GPUDynInstPtr gpuDynInst); int numSrcRegOperands(); int numDstRegOperands(); int getNumOperands(); @@ -216,13 +198,11 @@ class GPUDynInst : public GPUExecContext int getOperandSize(int operandIdx); bool isDstOperand(int operandIdx); bool isSrcOperand(int operandIdx); - bool isArgLoad(); const std::string &disassemble() const; uint64_t seqNum() const; - Enums::OpType opType(); Enums::StorageClassType executedAs(); // The address of the memory operation @@ -240,14 +220,7 @@ class GPUDynInst : 
public GPUExecContext // The memory type (M_U32, M_S32, ...) Enums::MemType m_type; - // The memory operation (MO_LD, MO_ST, ...) - Enums::MemOpType m_op; - Enums::GenericMemoryOrder memoryOrder; - - // Scope of the request - Enums::GenericMemoryScope scope; - // The memory segment (SEG_SHARED, SEG_GLOBAL, ...) - seg_type s_type; + // The equivalency class int equiv; // The return VGPR type (VT_32 or VT_64) @@ -288,10 +261,72 @@ class GPUDynInst : public GPUExecContext void updateStats(); - GPUStaticInst* staticInstruction() { return staticInst; } - - // Is the instruction a scalar or vector op? - bool scalarOp() const; + GPUStaticInst* staticInstruction() { return _staticInst; } + + bool isALU() const; + bool isBranch() const; + bool isNop() const; + bool isReturn() const; + bool isUnconditionalJump() const; + bool isSpecialOp() const; + bool isWaitcnt() const; + + bool isBarrier() const; + bool isMemFence() const; + bool isMemRef() const; + bool isFlat() const; + bool isLoad() const; + bool isStore() const; + + bool isAtomic() const; + bool isAtomicNoRet() const; + bool isAtomicRet() const; + + bool isScalar() const; + bool readsSCC() const; + bool writesSCC() const; + bool readsVCC() const; + bool writesVCC() const; + + bool isAtomicAnd() const; + bool isAtomicOr() const; + bool isAtomicXor() const; + bool isAtomicCAS() const; + bool isAtomicExch() const; + bool isAtomicAdd() const; + bool isAtomicSub() const; + bool isAtomicInc() const; + bool isAtomicDec() const; + bool isAtomicMax() const; + bool isAtomicMin() const; + + bool isArgLoad() const; + bool isGlobalMem() const; + bool isLocalMem() const; + + bool isArgSeg() const; + bool isGlobalSeg() const; + bool isGroupSeg() const; + bool isKernArgSeg() const; + bool isPrivateSeg() const; + bool isReadOnlySeg() const; + bool isSpillSeg() const; + + bool isWorkitemScope() const; + bool isWavefrontScope() const; + bool isWorkgroupScope() const; + bool isDeviceScope() const; + bool isSystemScope() const; + bool 
isNoScope() const; + + bool isRelaxedOrder() const; + bool isAcquire() const; + bool isRelease() const; + bool isAcquireRelease() const; + bool isNoOrder() const; + + bool isGloballyCoherent() const; + bool isSystemCoherent() const; /* * Loads/stores/atomics may have acquire/release semantics associated @@ -312,46 +347,32 @@ class GPUDynInst : public GPUExecContext bool useContinuation; template AtomicOpFunctor* - makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op) + makeAtomicOpFunctor(c0 *reg0, c0 *reg1) { - using namespace Enums; - - switch(op) { - case MO_AAND: - case MO_ANRAND: + if (isAtomicAnd()) { return new AtomicOpAnd(*reg0); - case MO_AOR: - case MO_ANROR: + } else if (isAtomicOr()) { return new AtomicOpOr(*reg0); - case MO_AXOR: - case MO_ANRXOR: + } else if (isAtomicXor()) { return new AtomicOpXor(*reg0); - case MO_ACAS: - case MO_ANRCAS: + } else if (isAtomicCAS()) { return new AtomicOpCAS(*reg0, *reg1, cu); - case MO_AEXCH: - case MO_ANREXCH: + } else if (isAtomicExch()) { return new AtomicOpExch(*reg0); - case MO_AADD: - case MO_ANRADD: + } else if (isAtomicAdd()) { return new AtomicOpAdd(*reg0); - case MO_ASUB: - case MO_ANRSUB: + } else if (isAtomicSub()) { return new AtomicOpSub(*reg0); - case MO_AINC: - case MO_ANRINC: + } else if (isAtomicInc()) { return new AtomicOpInc(); - case MO_ADEC: - case MO_ANRDEC: + } else if (isAtomicDec()) { return new AtomicOpDec(); - case MO_AMAX: - case MO_ANRMAX: + } else if (isAtomicMax()) { return new AtomicOpMax(*reg0); - case MO_AMIN: - case MO_ANRMIN: + } else if (isAtomicMin()) { return new AtomicOpMin(*reg0); - default: - panic("Unrecognized atomic operation"); + } else { + fatal("Unrecognized atomic operation"); } } @@ -359,88 +380,58 @@ class GPUDynInst : public GPUExecContext setRequestFlags(Request *req, bool setMemOrder=true) { // currently these are the easy scopes to deduce - switch (s_type) { - case SEG_PRIVATE: + if (isPrivateSeg()) { req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT); 
- break; - case SEG_SPILL: + } else if (isSpillSeg()) { req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT); - break; - case SEG_GLOBAL: + } else if (isGlobalSeg()) { req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT); - break; - case SEG_READONLY: + } else if (isReadOnlySeg()) { req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT); - break; - case SEG_SHARED: + } else if (isGroupSeg()) { req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT); - break; - case SEG_FLAT: + } else if (isFlat()) { // TODO: translate to correct scope assert(false); - default: - panic("Bad segment type"); - break; + } else { + fatal("%s has bad segment type\n", disassemble()); } - switch (scope) { - case Enums::MEMORY_SCOPE_NONE: - case Enums::MEMORY_SCOPE_WORKITEM: - break; - case Enums::MEMORY_SCOPE_WAVEFRONT: + if (isWavefrontScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WAVEFRONT_SCOPE); - break; - case Enums::MEMORY_SCOPE_WORKGROUP: + } else if (isWorkgroupScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::WORKGROUP_SCOPE); - break; - case Enums::MEMORY_SCOPE_DEVICE: + } else if (isDeviceScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::DEVICE_SCOPE); - break; - case Enums::MEMORY_SCOPE_SYSTEM: + } else if (isSystemScope()) { req->setMemSpaceConfigFlags(Request::SCOPE_VALID | Request::SYSTEM_SCOPE); - break; - default: - panic("Bad scope type"); - break; + } else if (!isNoScope() && !isWorkitemScope()) { + fatal("%s has bad scope type\n", disassemble()); } if (setMemOrder) { // set acquire and release flags - switch (memoryOrder){ - case Enums::MEMORY_ORDER_SC_ACQUIRE: + if (isAcquire()) { req->setFlags(Request::ACQUIRE); - break; - case Enums::MEMORY_ORDER_SC_RELEASE: + } else if (isRelease()) { req->setFlags(Request::RELEASE); - break; - case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE: + } else if (isAcquireRelease()) { req->setFlags(Request::ACQUIRE | Request::RELEASE); - break; - default: - break; + } else if 
(!isNoOrder()) { + fatal("%s has bad memory order\n", disassemble()); } } // set atomic type // currently, the instruction genenerator only produces atomic return // but a magic instruction can produce atomic no return - if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB || - m_op == Enums::MO_AAND || m_op == Enums::MO_AOR || - m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX || - m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC || - m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH || - m_op == Enums::MO_ACAS) { + if (isAtomicRet()) { req->setFlags(Request::ATOMIC_RETURN_OP); - } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB || - m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR || - m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX || - m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC || - m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH || - m_op == Enums::MO_ANRCAS) { + } else if (isAtomicNoRet()) { req->setFlags(Request::ATOMIC_NO_RETURN_OP); } } @@ -457,7 +448,7 @@ class GPUDynInst : public GPUExecContext std::vector tlbHitLevel; private: - GPUStaticInst *staticInst; + GPUStaticInst *_staticInst; uint64_t _seqNum; }; diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc index 83b429e62..0f74bd532 100644 --- a/src/gpu-compute/gpu_static_inst.cc +++ b/src/gpu-compute/gpu_static_inst.cc @@ -36,10 +36,12 @@ #include "gpu-compute/gpu_static_inst.hh" GPUStaticInst::GPUStaticInst(const std::string &opcode) - : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode), - _instNum(0), _scalarOp(false) + : executed_as(Enums::SC_NONE), opcode(opcode), + _instNum(0) { + setFlag(NoOrder); } + const std::string& GPUStaticInst::disassemble() { diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh index 911e4f308..a73ec12e3 100644 --- a/src/gpu-compute/gpu_static_inst.hh +++ b/src/gpu-compute/gpu_static_inst.hh @@ -48,7 +48,7 @@ #include #include -#include "enums/OpType.hh" 
+#include "enums/GPUStaticInstFlags.hh" #include "enums/StorageClassType.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/misc.hh" @@ -57,7 +57,7 @@ class BaseOperand; class BaseRegOperand; class Wavefront; -class GPUStaticInst +class GPUStaticInst : public GPUStaticInstFlags { public: GPUStaticInst(const std::string &opcode); @@ -86,22 +86,110 @@ class GPUStaticInst virtual bool isValid() const = 0; - /* - * Most instructions (including all HSAIL instructions) - * are vector ops, so _scalarOp will be false by default. - * Derived instruction objects that are scalar ops must - * set _scalarOp to true in their constructors. - */ - bool scalarOp() const { return _scalarOp; } + bool isALU() const { return _flags[ALU]; } + bool isBranch() const { return _flags[Branch]; } + bool isNop() const { return _flags[Nop]; } + bool isReturn() const { return _flags[Return]; } + + bool + isUnconditionalJump() const + { + return _flags[UnconditionalJump]; + } + + bool isSpecialOp() const { return _flags[SpecialOp]; } + bool isWaitcnt() const { return _flags[Waitcnt]; } + + bool isBarrier() const { return _flags[MemBarrier]; } + bool isMemFence() const { return _flags[MemFence]; } + bool isMemRef() const { return _flags[MemoryRef]; } + bool isFlat() const { return _flags[Flat]; } + bool isLoad() const { return _flags[Load]; } + bool isStore() const { return _flags[Store]; } + + bool + isAtomic() const + { + return _flags[AtomicReturn] || _flags[AtomicNoReturn]; + } + + bool isAtomicNoRet() const { return _flags[AtomicNoReturn]; } + bool isAtomicRet() const { return _flags[AtomicReturn]; } + + bool isScalar() const { return _flags[Scalar]; } + bool readsSCC() const { return _flags[ReadsSCC]; } + bool writesSCC() const { return _flags[WritesSCC]; } + bool readsVCC() const { return _flags[ReadsVCC]; } + bool writesVCC() const { return _flags[WritesVCC]; } - virtual bool isLocalMem() const + bool isAtomicAnd() const { return _flags[AtomicAnd]; } + bool isAtomicOr() const 
{ return _flags[AtomicOr]; } + bool isAtomicXor() const { return _flags[AtomicXor]; } + bool isAtomicCAS() const { return _flags[AtomicCAS]; } + bool isAtomicExch() const { return _flags[AtomicExch]; } + bool isAtomicAdd() const { return _flags[AtomicAdd]; } + bool isAtomicSub() const { return _flags[AtomicSub]; } + bool isAtomicInc() const { return _flags[AtomicInc]; } + bool isAtomicDec() const { return _flags[AtomicDec]; } + bool isAtomicMax() const { return _flags[AtomicMax]; } + bool isAtomicMin() const { return _flags[AtomicMin]; } + + bool + isArgLoad() const + { + return (_flags[KernArgSegment] || _flags[ArgSegment]) && _flags[Load]; + } + + bool + isGlobalMem() const { - fatal("calling isLocalMem() on non-memory instruction.\n"); + return _flags[MemoryRef] && (_flags[GlobalSegment] || + _flags[PrivateSegment] || _flags[ReadOnlySegment] || + _flags[SpillSegment]); + } - return false; + bool + isLocalMem() const + { + return _flags[MemoryRef] && _flags[GroupSegment]; } - bool isArgLoad() { return false; } + bool isArgSeg() const { return _flags[ArgSegment]; } + bool isGlobalSeg() const { return _flags[GlobalSegment]; } + bool isGroupSeg() const { return _flags[GroupSegment]; } + bool isKernArgSeg() const { return _flags[KernArgSegment]; } + bool isPrivateSeg() const { return _flags[PrivateSegment]; } + bool isReadOnlySeg() const { return _flags[ReadOnlySegment]; } + bool isSpillSeg() const { return _flags[SpillSegment]; } + + bool isWorkitemScope() const { return _flags[WorkitemScope]; } + bool isWavefrontScope() const { return _flags[WavefrontScope]; } + bool isWorkgroupScope() const { return _flags[WorkgroupScope]; } + bool isDeviceScope() const { return _flags[DeviceScope]; } + bool isSystemScope() const { return _flags[SystemScope]; } + bool isNoScope() const { return _flags[NoScope]; } + + bool isRelaxedOrder() const { return _flags[RelaxedOrder]; } + bool isAcquire() const { return _flags[Acquire]; } + bool isRelease() const { return _flags[Release]; } 
+ bool isAcquireRelease() const { return _flags[AcquireRelease]; } + bool isNoOrder() const { return _flags[NoOrder]; } + + /** + * Coherence domain of a memory instruction. Only valid for + * machine ISA. The coherence domain specifies where it is + * possible to perform memory synchronization, e.g., acquire + * or release, from the shader kernel. + * + * isGloballyCoherent(): returns true if kernel is sharing memory + * with other work-items on the same device (GPU) + * + * isSystemCoherent(): returns true if kernel is sharing memory + * with other work-items on a different device (GPU) or the host (CPU) + */ + bool isGloballyCoherent() const { return _flags[GloballyCoherent]; } + bool isSystemCoherent() const { return _flags[SystemCoherent]; } + virtual uint32_t instSize() = 0; // only used for memory instructions @@ -120,22 +208,13 @@ class GPUStaticInst virtual uint32_t getTargetPc() { return 0; } - /** - * Query whether the instruction is an unconditional jump i.e., the jump - * is always executed because there is no condition to be evaluated. - * - * If the instruction is not of branch type, the result is always false. - * - * @return True if the instruction is an unconditional jump. 
- */ - virtual bool unconditionalJumpInstruction() { return false; } - static uint64_t dynamic_id_count; - Enums::OpType o_type; // For flat memory accesses Enums::StorageClassType executed_as; + void setFlag(Flags flag) { _flags[flag] = true; } + protected: virtual void execLdAcq(GPUDynInstPtr gpuDynInst) @@ -169,7 +248,45 @@ class GPUStaticInst */ int _ipdInstNum; - bool _scalarOp; + std::bitset _flags; +}; + +class KernelLaunchStaticInst : public GPUStaticInst +{ + public: + KernelLaunchStaticInst() : GPUStaticInst("kernel_launch") + { + setFlag(Nop); + setFlag(Scalar); + setFlag(Acquire); + setFlag(SystemScope); + setFlag(GlobalSegment); + } + + void + execute(GPUDynInstPtr gpuDynInst) + { + fatal("kernel launch instruction should not be executed\n"); + } + + void + generateDisassembly() + { + disassembly = opcode; + } + + int getNumOperands() { return 0; } + bool isCondRegister(int operandIndex) { return false; } + bool isScalarRegister(int operandIndex) { return false; } + bool isVectorRegister(int operandIndex) { return false; } + bool isSrcOperand(int operandIndex) { return false; } + bool isDstOperand(int operandIndex) { return false; } + int getOperandSize(int operandIndex) { return 0; } + int getRegisterIndex(int operandIndex) { return 0; } + int numDstRegOperands() { return 0; } + int numSrcRegOperands() { return 0; } + bool isValid() const { return true; } + uint32_t instSize() { return 0; } }; #endif // __GPU_STATIC_INST_HH__ diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc index 10ded11b7..ac6a81b16 100644 --- a/src/gpu-compute/kernel_cfg.cc +++ b/src/gpu-compute/kernel_cfg.cc @@ -104,7 +104,7 @@ ControlFlowInfo::createBasicBlocks() leaders.insert(0); for (int i = 1; i < instructions.size(); i++) { GPUStaticInst* instruction = instructions[i]; - if (instruction->o_type == Enums::OT_BRANCH) { + if (instruction->isBranch()) { const int target_pc = instruction->getTargetPc(); leaders.insert(target_pc); leaders.insert(i + 1); @@ 
-137,18 +137,18 @@ ControlFlowInfo::connectBasicBlocks() break; } GPUStaticInst* last = lastInstruction(bb.get()); - if (last->o_type == Enums::OT_RET) { + if (last->isReturn()) { bb->successorIds.insert(exit_bb->id); continue; } - if (last->o_type == Enums::OT_BRANCH) { + if (last->isBranch()) { const uint32_t target_pc = last->getTargetPc(); BasicBlock* target_bb = basicBlock(target_pc); bb->successorIds.insert(target_bb->id); } // Unconditional jump instructions have a unique successor - if (!last->unconditionalJumpInstruction()) { + if (!last->isUnconditionalJump()) { BasicBlock* next_bb = basicBlock(last->instNum() + 1); bb->successorIds.insert(next_bb->id); } @@ -274,7 +274,7 @@ ControlFlowInfo::printBasicBlocks() const int inst_num = inst->instNum(); std::cout << inst_num << " [" << basicBlock(inst_num)->id << "]: " << inst->disassemble(); - if (inst->o_type == Enums::OT_BRANCH) { + if (inst->isBranch()) { std::cout << ", PC = " << inst->getTargetPc(); } std::cout << std::endl; diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc index d4a27318a..fad98c886 100644 --- a/src/gpu-compute/lds_state.cc +++ b/src/gpu-compute/lds_state.cc @@ -141,8 +141,7 @@ LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, } } - if (gpuDynInst->m_op == Enums::MO_LD || - gpuDynInst->m_op == Enums::MO_ST) { + if (gpuDynInst->isLoad() || gpuDynInst->isStore()) { // mask identical addresses for (int j = 0; j < numBanks; ++j) { for (int j0 = 0; j0 < j; j0++) { @@ -208,8 +207,8 @@ LdsState::processPacket(PacketPtr packet) GPUDynInstPtr dynInst = getDynInstr(packet); // account for the LDS bank conflict overhead - int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : - (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + int busLength = (dynInst->isLoad()) ? parent->loadBusLength() : + (dynInst->isStore()) ? 
parent->storeBusLength() : parent->loadBusLength(); // delay for accessing the LDS Tick processingTime = diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh index 58d109493..5fcbe82c0 100644 --- a/src/gpu-compute/lds_state.hh +++ b/src/gpu-compute/lds_state.hh @@ -43,7 +43,6 @@ #include #include -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/misc.hh" #include "mem/mem_object.hh" diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc index e2238bf45..80dad6fcd 100644 --- a/src/gpu-compute/local_memory_pipeline.cc +++ b/src/gpu-compute/local_memory_pipeline.cc @@ -62,7 +62,7 @@ LocalMemPipeline::exec() lmReturnedRequests.front() : nullptr; bool accessVrf = true; - if ((m) && (m->m_op==Enums::MO_LD || MO_A(m->m_op))) { + if ((m) && (m->isLoad() || m->isAtomicRet())) { Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; accessVrf = @@ -137,7 +137,7 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId]; // Return data to registers - if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) { + if (m->isLoad() || m->isAtomicRet()) { std::vector regVec; for (int k = 0; k < m->n_reg; ++k) { int dst = m->dst_reg+k; @@ -172,13 +172,12 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m) // Decrement outstanding request count computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1); - if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) - || MO_H(m->m_op)) { + if (m->isStore() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrLm, m->time, -1); } - if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) { + if (m->isLoad() || m->isAtomic()) { computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdLm, m->time, -1); } diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh index c1f741d6a..13afab977 100644 --- a/src/gpu-compute/shader.hh +++ b/src/gpu-compute/shader.hh @@ 
-47,7 +47,6 @@ #include "cpu/simple_thread.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" -#include "enums/MemOpType.hh" #include "enums/MemType.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_tlb.hh" diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc index c43d765af..c50c06cc6 100644 --- a/src/gpu-compute/vector_register_file.cc +++ b/src/gpu-compute/vector_register_file.cc @@ -38,7 +38,6 @@ #include #include "base/misc.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -153,8 +152,8 @@ VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const void VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w) { - bool loadInstr = IS_OT_READ(ii->opType()); - bool atomicInstr = IS_OT_ATOMIC(ii->opType()); + bool loadInstr = ii->isLoad(); + bool atomicInstr = ii->isAtomic() || ii->isMemFence(); bool loadNoArgInstr = loadInstr && !ii->isArgLoad(); diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc index c677cbe41..caeed85a7 100644 --- a/src/gpu-compute/wavefront.cc +++ b/src/gpu-compute/wavefront.cc @@ -37,7 +37,6 @@ #include "debug/GPUExec.hh" #include "debug/WavefrontStack.hh" -#include "gpu-compute/code_enums.hh" #include "gpu-compute/compute_unit.hh" #include "gpu-compute/gpu_dyn_inst.hh" #include "gpu-compute/shader.hh" @@ -165,19 +164,8 @@ Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr) bool Wavefront::isGmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType())) { + if (ii->isGlobalMem() || ii->isFlat()) return true; - } - - if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType())) { - return true; - } - - if (IS_OT_FLAT(ii->opType())) { - return true; - } return false; } @@ -185,8 +173,7 @@ 
Wavefront::isGmInstruction(GPUDynInstPtr ii) bool Wavefront::isLmInstruction(GPUDynInstPtr ii) { - if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) || - IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLocalMem()) { return true; } @@ -199,10 +186,9 @@ Wavefront::isOldestInstALU() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ)) { + if (status != S_STOPPED && (ii->isNop() || + ii->isReturn() || ii->isBranch() || + ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) { return true; } @@ -215,7 +201,7 @@ Wavefront::isOldestInstBarrier() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) { + if (status != S_STOPPED && ii->isBarrier()) { return true; } @@ -228,9 +214,7 @@ Wavefront::isOldestInstGMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { - + if (status != S_STOPPED && ii->isGlobalMem()) { return true; } @@ -243,9 +227,7 @@ Wavefront::isOldestInstLMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { - + if (status != S_STOPPED && ii->isLocalMem()) { return true; } @@ -258,9 +240,7 @@ Wavefront::isOldestInstPrivMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - + if (status != S_STOPPED && 
ii->isPrivateSeg()) { return true; } @@ -273,8 +253,7 @@ Wavefront::isOldestInstFlatMem() assert(!instructionBuffer.empty()); GPUDynInstPtr ii = instructionBuffer.front(); - if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) { - + if (status != S_STOPPED && ii->isFlat()) { return true; } @@ -289,7 +268,7 @@ Wavefront::instructionBufferHasBranch() for (auto it : instructionBuffer) { GPUDynInstPtr ii = it; - if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) { + if (ii->isReturn() || ii->isBranch()) { return true; } } @@ -371,23 +350,16 @@ Wavefront::ready(itype_e type) // checking readiness will be fixed eventually. In the meantime, let's // make sure that we do not silently let an instruction type slip // through this logic and always return not ready. - if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP || - ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG || - IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) || - IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) || - IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) { + if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() || + ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() || + ii->isMemFence() || ii->isFlat())) { panic("next instruction: %s is of unknown type\n", ii->disassemble()); } DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n", computeUnit->cu_id, simdId, wfSlotId, ii->disassemble()); - if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) { + if (type == I_ALU && ii->isBarrier()) { // Here for ALU instruction (barrier) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? 
@@ -400,7 +372,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) { + } else if (type == I_ALU && ii->isNop()) { // Here for ALU instruction (nop) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -408,7 +380,7 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && ii->opType() == Enums::OT_RET) { + } else if (type == I_ALU && ii->isReturn()) { // Here for ALU instruction (return) if (!computeUnit->wfWait[simdId].prerdy()) { // Is wave slot free? @@ -421,10 +393,10 @@ Wavefront::ready(itype_e type) } ready_inst = true; - } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH || - ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) || - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG)) { + } else if (type == I_ALU && (ii->isBranch() || + ii->isALU() || + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg())) { // Here for ALU instruction (all others) if (!computeUnit->wfWait[simdId].prerdy()) { // Is alu slot free? @@ -439,18 +411,16 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) || - IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) { + } else if (type == I_GLOBAL && ii->isGlobalMem()) { // Here Global memory instruction - if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory write requests? if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) || - IS_OT_HIST_GM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { // Are there in pipe or outstanding global memory read requests? 
if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) return 0; @@ -480,17 +450,15 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) || - IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) { + } else if (type == I_SHARED && ii->isLocalMem()) { // Here for Shared memory instruction - if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) { + if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) { return 0; } } - if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) || - IS_OT_HIST_LM(ii->opType())) { + if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) { if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) { return 0; } @@ -519,47 +487,7 @@ Wavefront::ready(itype_e type) return 0; } ready_inst = true; - } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) || - IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) { - // Here for Private memory instruction ------------------------ // - if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) { - if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) { - return 0; - } - } - - if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) || - IS_OT_HIST_PM(ii->opType())) { - if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0) { - return 0; - } - } - - if (!glbMemBusRdy) { - // Is there an available VRF->Global memory read bus? - return 0; - } - - if (!glbMemIssueRdy) { - // Is wave slot free? - return 0; - } - - if (!computeUnit->globalMemoryPipe. - isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) { - // Can we insert a new request to the Global Mem Request FIFO? - return 0; - } - // can we schedule source & destination operands on the VRF? 
- if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii, - VrfAccessType::RD_WR)) { - return 0; - } - if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) { - return 0; - } - ready_inst = true; - } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) { + } else if (type == I_FLAT && ii->isFlat()) { if (!glbMemBusRdy) { // Is there an available VRF->Global memory read bus? return 0; @@ -618,23 +546,22 @@ Wavefront::updateResources() assert(ii); computeUnit->vrf[simdId]->updateResources(this, ii); // Single precision ALU or Branch or Return or Special instruction - if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || - ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + if (ii->isALU() || ii->isSpecialOp() || + ii->isBranch() || // FIXME: Kernel argument loads are currently treated as ALU operations // since we don't send memory packets at execution. If we fix that then // we should map them to one of the memory pipelines - ii->opType()==Enums::OT_KERN_READ || - ii->opType()==Enums::OT_ARG || - ii->opType()==Enums::OT_RET) { + (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() || + ii->isReturn()) { computeUnit->aluPipe[simdId].preset(computeUnit->shader-> ticks(computeUnit->spBypassLength())); // this is to enforce a fixed number of cycles per issue slot per SIMD computeUnit->wfWait[simdId].preset(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_BARRIER) { + } else if (ii->isBarrier()) { computeUnit->wfWait[simdId].preset(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_FLAT_READ) { + } else if (ii->isLoad() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); memReqsInPipe++; rdGmReqsInPipe++; @@ -649,7 +576,7 @@ Wavefront::updateResources() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + } else if (ii->isStore() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); memReqsInPipe++; wrGmReqsInPipe++; @@ -664,21 +591,21 @@ Wavefront::updateResources() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (IS_OT_READ_GM(ii->opType())) { + } else if (ii->isLoad() && ii->isGlobalMem()) { memReqsInPipe++; rdGmReqsInPipe++; computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. preset(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_GM(ii->opType())) { + } else if (ii->isStore() && ii->isGlobalMem()) { memReqsInPipe++; wrGmReqsInPipe++; computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_GM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { memReqsInPipe++; wrGmReqsInPipe++; rdGmReqsInPipe++; @@ -686,21 +613,21 @@ Wavefront::updateResources() preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_LM(ii->opType())) { + } else if (ii->isLoad() && ii->isLocalMem()) { memReqsInPipe++; rdLmReqsInPipe++; computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. preset(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_LM(ii->opType())) { + } else if (ii->isStore() && ii->isLocalMem()) { memReqsInPipe++; wrLmReqsInPipe++; computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_LM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { memReqsInPipe++; wrLmReqsInPipe++; rdLmReqsInPipe++; @@ -708,28 +635,6 @@ Wavefront::updateResources() preset(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_PM(ii->opType())) { - memReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(4)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_PM(ii->opType())) { - memReqsInPipe++; - wrGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. - preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_PM(ii->opType())) { - memReqsInPipe++; - wrGmReqsInPipe++; - rdGmReqsInPipe++; - computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. - preset(computeUnit->shader->ticks(8)); - computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
- preset(computeUnit->shader->ticks(computeUnit->issuePeriod)); } } @@ -751,7 +656,7 @@ Wavefront::exec() DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s " "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId, ii->disassemble(), old_pc); - ii->execute(); + ii->execute(ii); // access the VRF computeUnit->vrf[simdId]->exec(ii, this); srcRegOpDist.sample(ii->numSrcRegOperands()); @@ -785,24 +690,24 @@ Wavefront::exec() // ---- Update Vector ALU pipeline and other resources ------------------ // // Single precision ALU or Branch or Return or Special instruction - if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL || - ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) || + if (ii->isALU() || ii->isSpecialOp() || + ii->isBranch() || // FIXME: Kernel argument loads are currently treated as ALU operations // since we don't send memory packets at execution. If we fix that then // we should map them to one of the memory pipelines - ii->opType() == Enums::OT_KERN_READ || - ii->opType() == Enums::OT_ARG || - ii->opType() == Enums::OT_RET) { + (ii->isKernArgSeg() && ii->isLoad()) || + ii->isArgSeg() || + ii->isReturn()) { computeUnit->aluPipe[simdId].set(computeUnit->shader-> ticks(computeUnit->spBypassLength())); // this is to enforce a fixed number of cycles per issue slot per SIMD computeUnit->wfWait[simdId].set(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_BARRIER) { + } else if (ii->isBarrier()) { computeUnit->wfWait[simdId].set(computeUnit->shader-> ticks(computeUnit->issuePeriod)); - } else if (ii->opType() == Enums::OT_FLAT_READ) { + } else if (ii->isLoad() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); if (Enums::SC_SHARED == ii->executedAs()) { @@ -816,7 +721,7 @@ Wavefront::exec() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. 
set(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (ii->opType() == Enums::OT_FLAT_WRITE) { + } else if (ii->isStore() && ii->isFlat()) { assert(Enums::SC_NONE != ii->executedAs()); if (Enums::SC_SHARED == ii->executedAs()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. @@ -829,32 +734,32 @@ Wavefront::exec() computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); } - } else if (IS_OT_READ_GM(ii->opType())) { + } else if (ii->isLoad() && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_GM(ii->opType())) { + } else if (ii->isStore() && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_GM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) { computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->GlbMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_READ_LM(ii->opType())) { + } else if (ii->isLoad() && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(4)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_WRITE_LM(ii->opType())) { + } else if (ii->isStore() && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. 
set(computeUnit->shader->ticks(computeUnit->issuePeriod)); - } else if (IS_OT_ATOMIC_LM(ii->opType())) { + } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) { computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()]. set(computeUnit->shader->ticks(8)); computeUnit->wfWait[computeUnit->ShrMemUnitId()]. -- 2.30.2