X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fnouveau%2Fcodegen%2Fnv50_ir_from_nir.cpp;h=08bbaaed5df84d556232e7abf4484fa93338cfac;hb=70cbddc4a7967c8889f8b10d8a31d7b8bdbde2fb;hp=7a10a408b701e3022f969efdc9e1cfa5a88a35af;hpb=fa361a3c1e7bf7b291de59c2386501fc13ebfed1;p=mesa.git

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
index 7a10a408b70..08bbaaed5df 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_nir.cpp
@@ -30,12 +30,15 @@
 #include "codegen/nv50_ir_from_common.h"
 #include "codegen/nv50_ir_lowering_helper.h"
 #include "codegen/nv50_ir_util.h"
+#include "tgsi/tgsi_from_mesa.h"
 
 #if __cplusplus >= 201103L
 #include <unordered_map>
 #else
 #include <tr1/unordered_map>
 #endif
+#include <cstring>
+#include <list>
 #include <vector>
 
 namespace {
@@ -51,27 +54,42 @@ using std::tr1::unordered_map;
 using namespace nv50_ir;
 
 int
-type_size(const struct glsl_type *type)
+type_size(const struct glsl_type *type, bool bindless)
 {
    return glsl_count_attribute_slots(type, false);
 }
 
+static void
+function_temp_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
+{
+   assert(glsl_type_is_vector_or_scalar(type));
+
+   unsigned comp_size = glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
+   unsigned length = glsl_get_vector_elements(type);
+
+   *size = comp_size * length;
+   *align = 0x10;
+}
+
 class Converter : public ConverterCommon
 {
 public:
-   Converter(Program *, nir_shader *, nv50_ir_prog_info *);
+   Converter(Program *, nir_shader *, nv50_ir_prog_info *, nv50_ir_prog_info_out *);
 
    bool run();
 private:
    typedef std::vector<LValue*> LValues;
    typedef unordered_map<unsigned, LValues> NirDefMap;
+   typedef unordered_map<unsigned, nir_load_const_instr*> ImmediateMap;
    typedef unordered_map<unsigned, BasicBlock*> NirBlockMap;
 
+   CacheMode convert(enum gl_access_qualifier);
    TexTarget convert(glsl_sampler_dim, bool isArray, bool isShadow);
    LValues& convert(nir_alu_dest *);
    BasicBlock* convert(nir_block *);
    LValues& convert(nir_dest *);
    SVSemantic convert(nir_intrinsic_op);
+   Value* convert(nir_load_const_instr*, uint8_t);
    LValues& convert(nir_register *);
    LValues& convert(nir_ssa_def *);
 
@@ -88,7 +106,10 @@ private:
    // If the found value does not have a constant part, the Value is
    // returned through the Value parameter.
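   // Illustrative contract: a fully constant NIR source folds into the
   // returned offset and the Value parameter is set to NULL, while a
   // dynamic source returns 0 and hands its loaded SSA value back through
   // the Value parameter; the intrinsic overload additionally scales a
   // dynamic index by 16 bytes per vec4 slot unless isScalar is set.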
    uint32_t getIndirect(nir_src *, uint8_t, Value *&);
-   uint32_t getIndirect(nir_intrinsic_instr *, uint8_t s, uint8_t c, Value *&);
+   // isScalar indicates that the addressing is scalar; vec4 addressing is
+   // assumed otherwise
+   uint32_t getIndirect(nir_intrinsic_instr *, uint8_t s, uint8_t c, Value *&,
+                        bool isScalar = false);
 
    uint32_t getSlotAddress(nir_intrinsic_instr *, uint8_t idx, uint8_t slot);
 
@@ -111,8 +132,11 @@ private:
    DataType getDType(nir_alu_instr *);
    DataType getDType(nir_intrinsic_instr *);
+   DataType getDType(nir_intrinsic_instr *, bool isSigned);
    DataType getDType(nir_op, uint8_t);
 
+   DataFile getFile(nir_intrinsic_op);
+
    std::vector<DataType> getSTypes(nir_alu_instr *);
    DataType getSType(nir_src &, bool isFloat, bool isSigned);
 
@@ -144,16 +168,20 @@ private:
    // tex stuff
    Value* applyProjection(Value *src, Value *proj);
+   unsigned int getNIRArgCount(TexInstruction::Target&);
 
    nir_shader *nir;
 
    NirDefMap ssaDefs;
    NirDefMap regDefs;
+   ImmediateMap immediates;
    NirBlockMap blocks;
    unsigned int curLoopDepth;
+   unsigned int curIfDepth;
 
    BasicBlock *exit;
    Value *zero;
+   Instruction *immInsertPos;
 
    int clipVertexOutput;
 
@@ -164,10 +192,12 @@ private:
    };
 };
 
-Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info)
-   : ConverterCommon(prog, info),
+Converter::Converter(Program *prog, nir_shader *nir, nv50_ir_prog_info *info,
+                     nv50_ir_prog_info_out *info_out)
+   : ConverterCommon(prog, info, info_out),
      nir(nir),
      curLoopDepth(0),
+     curIfDepth(0),
      clipVertexOutput(-1)
 {
    zero = mkImm((uint32_t)0);
@@ -238,11 +268,30 @@ Converter::getDType(nir_alu_instr *insn)
 
 DataType
 Converter::getDType(nir_intrinsic_instr *insn)
+{
+   bool isSigned;
+   switch (insn->intrinsic) {
+   case nir_intrinsic_shared_atomic_imax:
+   case nir_intrinsic_shared_atomic_imin:
+   case nir_intrinsic_ssbo_atomic_imax:
+   case nir_intrinsic_ssbo_atomic_imin:
+      isSigned = true;
+      break;
+   default:
+      isSigned = false;
+      break;
+   }
+
+   return getDType(insn, isSigned);
+}
+
+DataType
+Converter::getDType(nir_intrinsic_instr *insn, bool isSigned)
 {
    if (insn->dest.is_ssa)
-      return typeOfSize(insn->dest.ssa.bit_size / 8, false, false);
+      return typeOfSize(insn->dest.ssa.bit_size / 8, false, isSigned);
    else
-      return typeOfSize(insn->dest.reg.reg->bit_size / 8, false, false);
+      return typeOfSize(insn->dest.reg.reg->bit_size / 8, false, isSigned);
 }
 
 DataType
@@ -300,6 +349,28 @@ Converter::getSType(nir_src &src, bool isFloat, bool isSigned)
    return ty;
 }
 
+DataFile
+Converter::getFile(nir_intrinsic_op op)
+{
+   switch (op) {
+   case nir_intrinsic_load_global:
+   case nir_intrinsic_store_global:
+      return FILE_MEMORY_GLOBAL;
+   case nir_intrinsic_load_scratch:
+   case nir_intrinsic_store_scratch:
+      return FILE_MEMORY_LOCAL;
+   case nir_intrinsic_load_shared:
+   case nir_intrinsic_store_shared:
+      return FILE_MEMORY_SHARED;
+   case nir_intrinsic_load_kernel_input:
+      return FILE_SHADER_INPUT;
+   default:
+      ERROR("couldn't get DataFile for op %s\n", nir_intrinsic_infos[op].name);
+      assert(false);
+   }
+   return FILE_NULL;
+}
+
 operation
 Converter::getOperation(nir_op op)
 {
@@ -311,7 +382,6 @@ Converter::getOperation(nir_op op)
    case nir_op_fadd:
    case nir_op_iadd:
       return OP_ADD;
-   case nir_op_fand:
    case nir_op_iand:
       return OP_AND;
    case nir_op_ifind_msb:
@@ -380,10 +450,8 @@ Converter::getOperation(nir_op op)
    case nir_op_fneg:
    case nir_op_ineg:
       return OP_NEG;
-   case nir_op_fnot:
    case nir_op_inot:
       return OP_NOT;
-   case nir_op_for:
    case nir_op_ior:
       return OP_OR;
    case nir_op_fpow:
@@ -402,7 +470,7 @@ Converter::getOperation(nir_op op)
    case
nir_op_flt32: case nir_op_ilt32: case nir_op_ult32: - case nir_op_fne32: + case nir_op_fneu32: case nir_op_ine32: return OP_SET; case nir_op_ishl: @@ -414,12 +482,8 @@ Converter::getOperation(nir_op op) return OP_SIN; case nir_op_fsqrt: return OP_SQRT; - case nir_op_fsub: - case nir_op_isub: - return OP_SUB; case nir_op_ftrunc: return OP_TRUNC; - case nir_op_fxor: case nir_op_ixor: return OP_XOR; default: @@ -463,6 +527,46 @@ operation Converter::getOperation(nir_intrinsic_op op) { switch (op) { + case nir_intrinsic_emit_vertex: + return OP_EMIT; + case nir_intrinsic_end_primitive: + return OP_RESTART; + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_image_atomic_inc_wrap: + case nir_intrinsic_bindless_image_atomic_dec_wrap: + case nir_intrinsic_image_atomic_dec_wrap: + return OP_SUREDP; + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_image_load: + return OP_SULDP; + case nir_intrinsic_bindless_image_samples: + case nir_intrinsic_image_samples: + case nir_intrinsic_bindless_image_size: + case nir_intrinsic_image_size: + return OP_SUQ; + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_image_store: + return OP_SUSTP; default: ERROR("couldn't get operation for nir_intrinsic_op %u\n", op); assert(false); @@ -489,6 +593,10 @@ Converter::getSubOp(nir_op op) case nir_op_imul_high: case nir_op_umul_high: return NV50_IR_SUBOP_MUL_HIGH; + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ushr: + return NV50_IR_SUBOP_SHIFT_WRAP; default: return 0; } @@ -498,6 +606,79 @@ int Converter::getSubOp(nir_intrinsic_op op) { switch (op) { + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_image_atomic_add: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_ssbo_atomic_add: + return NV50_IR_SUBOP_ATOM_ADD; + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_image_atomic_and: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_ssbo_atomic_and: + return NV50_IR_SUBOP_ATOM_AND; + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_image_atomic_comp_swap: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_comp_swap: + return NV50_IR_SUBOP_ATOM_CAS; + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_image_atomic_exchange: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_ssbo_atomic_exchange: + return NV50_IR_SUBOP_ATOM_EXCH; + case nir_intrinsic_bindless_image_atomic_or: 
+ case nir_intrinsic_global_atomic_or: + case nir_intrinsic_image_atomic_or: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_ssbo_atomic_or: + return NV50_IR_SUBOP_ATOM_OR; + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_image_atomic_imax: + case nir_intrinsic_image_atomic_umax: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + return NV50_IR_SUBOP_ATOM_MAX; + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_image_atomic_imin: + case nir_intrinsic_image_atomic_umin: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umin: + return NV50_IR_SUBOP_ATOM_MIN; + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_global_atomic_xor: + case nir_intrinsic_image_atomic_xor: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_ssbo_atomic_xor: + return NV50_IR_SUBOP_ATOM_XOR; + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_image_atomic_inc_wrap: + return NV50_IR_SUBOP_ATOM_INC; + case nir_intrinsic_bindless_image_atomic_dec_wrap: + case nir_intrinsic_image_atomic_dec_wrap: + return NV50_IR_SUBOP_ATOM_DEC; + + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + return NV50_IR_SUBOP_MEMBAR(M, GL); + case nir_intrinsic_memory_barrier_shared: + return NV50_IR_SUBOP_MEMBAR(M, CTA); + case nir_intrinsic_vote_all: return NV50_IR_SUBOP_VOTE_ALL; case nir_intrinsic_vote_any: @@ -524,7 +705,7 @@ Converter::getCondCode(nir_op op) case nir_op_ilt32: case nir_op_ult32: return CC_LT; - case nir_op_fne32: + case nir_op_fneu32: return CC_NEU; case nir_op_ine32: return CC_NE; @@ -556,6 +737,8 @@ Converter::convert(nir_dest *dest) Converter::LValues& Converter::convert(nir_register *reg) { + assert(!reg->num_array_elems); + NirDefMap::iterator it = regDefs.find(reg->index); if (it != regDefs.end()) return it->second; @@ -618,6 +801,10 @@ Converter::getSrc(nir_src *src, uint8_t idx, bool indirect) Value* Converter::getSrc(nir_ssa_def *src, uint8_t idx) { + ImmediateMap::iterator iit = immediates.find(src->index); + if (iit != immediates.end()) + return convert((*iit).second, idx); + NirDefMap::iterator it = ssaDefs.find(src->index); if (it == ssaDefs.end()) { ERROR("SSA value %u not found\n", src->index); @@ -634,7 +821,7 @@ Converter::getIndirect(nir_src *src, uint8_t idx, Value *&indirect) if (offset) { indirect = NULL; - return offset->u32[0]; + return offset[0].u32; } indirect = getSrc(src, idx, true); @@ -642,10 +829,10 @@ Converter::getIndirect(nir_src *src, uint8_t idx, Value *&indirect) } uint32_t -Converter::getIndirect(nir_intrinsic_instr *insn, uint8_t s, uint8_t c, Value *&indirect) +Converter::getIndirect(nir_intrinsic_instr *insn, uint8_t s, uint8_t c, Value *&indirect, bool isScalar) { int32_t idx = nir_intrinsic_base(insn) + getIndirect(&insn->src[s], c, indirect); - if (indirect) + if (indirect && !isScalar) indirect = mkOp2v(OP_SHL, TYPE_U32, getSSA(4, FILE_ADDRESS), indirect, loadImm(NULL, 4)); return idx; } @@ -711,256 
+898,6 @@ vert_attrib_to_tgsi_semantic(gl_vert_attrib slot, unsigned *name, unsigned *inde } } -static void -varying_slot_to_tgsi_semantic(gl_varying_slot slot, unsigned *name, unsigned *index) -{ - assert(name && index); - - if (slot >= VARYING_SLOT_TESS_MAX) { - ERROR("invalid varying slot %u\n", slot); - assert(false); - return; - } - - if (slot >= VARYING_SLOT_PATCH0) { - *name = TGSI_SEMANTIC_PATCH; - *index = slot - VARYING_SLOT_PATCH0; - return; - } - - if (slot >= VARYING_SLOT_VAR0) { - *name = TGSI_SEMANTIC_GENERIC; - *index = slot - VARYING_SLOT_VAR0; - return; - } - - if (slot >= VARYING_SLOT_TEX0 && slot <= VARYING_SLOT_TEX7) { - *name = TGSI_SEMANTIC_TEXCOORD; - *index = slot - VARYING_SLOT_TEX0; - return; - } - - switch (slot) { - case VARYING_SLOT_BFC0: - *name = TGSI_SEMANTIC_BCOLOR; - *index = 0; - break; - case VARYING_SLOT_BFC1: - *name = TGSI_SEMANTIC_BCOLOR; - *index = 1; - break; - case VARYING_SLOT_CLIP_DIST0: - *name = TGSI_SEMANTIC_CLIPDIST; - *index = 0; - break; - case VARYING_SLOT_CLIP_DIST1: - *name = TGSI_SEMANTIC_CLIPDIST; - *index = 1; - break; - case VARYING_SLOT_CLIP_VERTEX: - *name = TGSI_SEMANTIC_CLIPVERTEX; - *index = 0; - break; - case VARYING_SLOT_COL0: - *name = TGSI_SEMANTIC_COLOR; - *index = 0; - break; - case VARYING_SLOT_COL1: - *name = TGSI_SEMANTIC_COLOR; - *index = 1; - break; - case VARYING_SLOT_EDGE: - *name = TGSI_SEMANTIC_EDGEFLAG; - *index = 0; - break; - case VARYING_SLOT_FACE: - *name = TGSI_SEMANTIC_FACE; - *index = 0; - break; - case VARYING_SLOT_FOGC: - *name = TGSI_SEMANTIC_FOG; - *index = 0; - break; - case VARYING_SLOT_LAYER: - *name = TGSI_SEMANTIC_LAYER; - *index = 0; - break; - case VARYING_SLOT_PNTC: - *name = TGSI_SEMANTIC_PCOORD; - *index = 0; - break; - case VARYING_SLOT_POS: - *name = TGSI_SEMANTIC_POSITION; - *index = 0; - break; - case VARYING_SLOT_PRIMITIVE_ID: - *name = TGSI_SEMANTIC_PRIMID; - *index = 0; - break; - case VARYING_SLOT_PSIZ: - *name = TGSI_SEMANTIC_PSIZE; - *index = 0; - break; - case VARYING_SLOT_TESS_LEVEL_INNER: - *name = TGSI_SEMANTIC_TESSINNER; - *index = 0; - break; - case VARYING_SLOT_TESS_LEVEL_OUTER: - *name = TGSI_SEMANTIC_TESSOUTER; - *index = 0; - break; - case VARYING_SLOT_VIEWPORT: - *name = TGSI_SEMANTIC_VIEWPORT_INDEX; - *index = 0; - break; - default: - ERROR("unknown varying slot %u\n", slot); - assert(false); - break; - } -} - -static void -frag_result_to_tgsi_semantic(unsigned slot, unsigned *name, unsigned *index) -{ - if (slot >= FRAG_RESULT_DATA0) { - *name = TGSI_SEMANTIC_COLOR; - *index = slot - FRAG_RESULT_COLOR - 2; // intentional - return; - } - - switch (slot) { - case FRAG_RESULT_COLOR: - *name = TGSI_SEMANTIC_COLOR; - *index = 0; - break; - case FRAG_RESULT_DEPTH: - *name = TGSI_SEMANTIC_POSITION; - *index = 0; - break; - case FRAG_RESULT_SAMPLE_MASK: - *name = TGSI_SEMANTIC_SAMPLEMASK; - *index = 0; - break; - default: - ERROR("unknown frag result slot %u\n", slot); - assert(false); - break; - } -} - -// copy of _mesa_sysval_to_semantic -static void -system_val_to_tgsi_semantic(unsigned val, unsigned *name, unsigned *index) -{ - *index = 0; - switch (val) { - // Vertex shader - case SYSTEM_VALUE_VERTEX_ID: - *name = TGSI_SEMANTIC_VERTEXID; - break; - case SYSTEM_VALUE_INSTANCE_ID: - *name = TGSI_SEMANTIC_INSTANCEID; - break; - case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: - *name = TGSI_SEMANTIC_VERTEXID_NOBASE; - break; - case SYSTEM_VALUE_BASE_VERTEX: - *name = TGSI_SEMANTIC_BASEVERTEX; - break; - case SYSTEM_VALUE_BASE_INSTANCE: - *name = TGSI_SEMANTIC_BASEINSTANCE; - break; - 
case SYSTEM_VALUE_DRAW_ID: - *name = TGSI_SEMANTIC_DRAWID; - break; - - // Geometry shader - case SYSTEM_VALUE_INVOCATION_ID: - *name = TGSI_SEMANTIC_INVOCATIONID; - break; - - // Fragment shader - case SYSTEM_VALUE_FRAG_COORD: - *name = TGSI_SEMANTIC_POSITION; - break; - case SYSTEM_VALUE_FRONT_FACE: - *name = TGSI_SEMANTIC_FACE; - break; - case SYSTEM_VALUE_SAMPLE_ID: - *name = TGSI_SEMANTIC_SAMPLEID; - break; - case SYSTEM_VALUE_SAMPLE_POS: - *name = TGSI_SEMANTIC_SAMPLEPOS; - break; - case SYSTEM_VALUE_SAMPLE_MASK_IN: - *name = TGSI_SEMANTIC_SAMPLEMASK; - break; - case SYSTEM_VALUE_HELPER_INVOCATION: - *name = TGSI_SEMANTIC_HELPER_INVOCATION; - break; - - // Tessellation shader - case SYSTEM_VALUE_TESS_COORD: - *name = TGSI_SEMANTIC_TESSCOORD; - break; - case SYSTEM_VALUE_VERTICES_IN: - *name = TGSI_SEMANTIC_VERTICESIN; - break; - case SYSTEM_VALUE_PRIMITIVE_ID: - *name = TGSI_SEMANTIC_PRIMID; - break; - case SYSTEM_VALUE_TESS_LEVEL_OUTER: - *name = TGSI_SEMANTIC_TESSOUTER; - break; - case SYSTEM_VALUE_TESS_LEVEL_INNER: - *name = TGSI_SEMANTIC_TESSINNER; - break; - - // Compute shader - case SYSTEM_VALUE_LOCAL_INVOCATION_ID: - *name = TGSI_SEMANTIC_THREAD_ID; - break; - case SYSTEM_VALUE_WORK_GROUP_ID: - *name = TGSI_SEMANTIC_BLOCK_ID; - break; - case SYSTEM_VALUE_NUM_WORK_GROUPS: - *name = TGSI_SEMANTIC_GRID_SIZE; - break; - case SYSTEM_VALUE_LOCAL_GROUP_SIZE: - *name = TGSI_SEMANTIC_BLOCK_SIZE; - break; - - // ARB_shader_ballot - case SYSTEM_VALUE_SUBGROUP_SIZE: - *name = TGSI_SEMANTIC_SUBGROUP_SIZE; - break; - case SYSTEM_VALUE_SUBGROUP_INVOCATION: - *name = TGSI_SEMANTIC_SUBGROUP_INVOCATION; - break; - case SYSTEM_VALUE_SUBGROUP_EQ_MASK: - *name = TGSI_SEMANTIC_SUBGROUP_EQ_MASK; - break; - case SYSTEM_VALUE_SUBGROUP_GE_MASK: - *name = TGSI_SEMANTIC_SUBGROUP_GE_MASK; - break; - case SYSTEM_VALUE_SUBGROUP_GT_MASK: - *name = TGSI_SEMANTIC_SUBGROUP_GT_MASK; - break; - case SYSTEM_VALUE_SUBGROUP_LE_MASK: - *name = TGSI_SEMANTIC_SUBGROUP_LE_MASK; - break; - case SYSTEM_VALUE_SUBGROUP_LT_MASK: - *name = TGSI_SEMANTIC_SUBGROUP_LT_MASK; - break; - - default: - ERROR("unknown system value %u\n", val); - assert(false); - break; - } -} - void Converter::setInterpolate(nv50_ir_varying *var, uint8_t mode, @@ -996,7 +933,7 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, uint16_t slots; switch (stage) { case Program::TYPE_GEOMETRY: - slots = type->uniform_locations(); + slots = type->count_attribute_slots(false); if (input) slots /= info.gs.vertices_in; break; @@ -1004,9 +941,9 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, case Program::TYPE_TESSELLATION_EVAL: // remove first dimension if (var->data.patch || (!input && stage == Program::TYPE_TESSELLATION_EVAL)) - slots = type->uniform_locations(); + slots = type->count_attribute_slots(false); else - slots = type->fields.array->uniform_locations(); + slots = type->fields.array->count_attribute_slots(false); break; default: slots = type->count_attribute_slots(false); @@ -1016,61 +953,97 @@ calcSlots(const glsl_type *type, Program::Type stage, const shader_info &info, return slots; } +static uint8_t +getMaskForType(const glsl_type *type, uint8_t slot) { + uint16_t comp = type->without_array()->components(); + comp = comp ? 
comp : 4; + + if (glsl_base_type_is_64bit(type->without_array()->base_type)) { + comp *= 2; + if (comp > 4) { + if (slot % 2) + comp -= 4; + else + comp = 4; + } + } + + return (1 << comp) - 1; +} + bool Converter::assignSlots() { unsigned name; unsigned index; info->io.viewportId = -1; - info->numInputs = 0; + info_out->numInputs = 0; + info_out->numOutputs = 0; + info_out->numSysVals = 0; - // we have to fixup the uniform locations for arrays - unsigned numImages = 0; - nir_foreach_variable(var, &nir->uniforms) { - const glsl_type *type = var->type; - if (!type->without_array()->is_image()) + for (uint8_t i = 0; i < SYSTEM_VALUE_MAX; ++i) { + if (!(nir->info.system_values_read & 1ull << i)) continue; - var->data.driver_location = numImages; - numImages += type->is_array() ? type->arrays_of_arrays_size() : 1; + + info_out->sv[info_out->numSysVals].sn = tgsi_get_sysval_semantic(i); + info_out->sv[info_out->numSysVals].si = 0; + info_out->sv[info_out->numSysVals].input = 0; // TODO inferSysValDirection(sn); + + switch (i) { + case SYSTEM_VALUE_INSTANCE_ID: + info_out->io.instanceId = info_out->numSysVals; + break; + case SYSTEM_VALUE_TESS_LEVEL_INNER: + case SYSTEM_VALUE_TESS_LEVEL_OUTER: + info_out->sv[info_out->numSysVals].patch = 1; + break; + case SYSTEM_VALUE_VERTEX_ID: + info_out->io.vertexId = info_out->numSysVals; + break; + default: + break; + } + + info_out->numSysVals += 1; } - nir_foreach_variable(var, &nir->inputs) { + if (prog->getType() == Program::TYPE_COMPUTE) + return true; + + nir_foreach_shader_in_variable(var, nir) { const glsl_type *type = var->type; int slot = var->data.location; uint16_t slots = calcSlots(type, prog->getType(), nir->info, true, var); - uint32_t comp = type->is_array() ? type->without_array()->component_slots() - : type->component_slots(); - uint32_t frac = var->data.location_frac; uint32_t vary = var->data.driver_location; - if (glsl_base_type_is_64bit(type->without_array()->base_type)) { - if (comp > 2) - slots *= 2; - } - assert(vary + slots <= PIPE_MAX_SHADER_INPUTS); switch(prog->getType()) { case Program::TYPE_FRAGMENT: - varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); + tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, + &name, &index); for (uint16_t i = 0; i < slots; ++i) { - setInterpolate(&info->in[vary + i], var->data.interpolation, + setInterpolate(&info_out->in[vary + i], var->data.interpolation, var->data.centroid | var->data.sample, name); } break; case Program::TYPE_GEOMETRY: - varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); + tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, + &name, &index); break; case Program::TYPE_TESSELLATION_CONTROL: case Program::TYPE_TESSELLATION_EVAL: - varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); + tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, + &name, &index); if (var->data.patch && name == TGSI_SEMANTIC_PATCH) - info->numPatchConstants = MAX2(info->numPatchConstants, index + slots); + info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots); break; case Program::TYPE_VERTEX: + if (slot >= VERT_ATTRIB_GENERIC0) + slot = VERT_ATTRIB_GENERIC0 + vary; vert_attrib_to_tgsi_semantic((gl_vert_attrib)slot, &name, &index); switch (name) { case TGSI_SEMANTIC_EDGEFLAG: - info->io.edgeFlagIn = vary; + info_out->io.edgeFlagIn = vary; break; default: break; @@ -1082,56 +1055,44 @@ bool Converter::assignSlots() { } for (uint16_t i = 0u; i < slots; ++i, ++vary) { - info->in[vary].id = vary; - 
info->in[vary].patch = var->data.patch; - info->in[vary].sn = name; - info->in[vary].si = index + i; - if (glsl_base_type_is_64bit(type->without_array()->base_type)) - if (i & 0x1) - info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4); - else - info->in[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf); - else - info->in[vary].mask |= ((1 << comp) - 1) << frac; + nv50_ir_varying *v = &info_out->in[vary]; + + v->patch = var->data.patch; + v->sn = name; + v->si = index + i; + v->mask |= getMaskForType(type, i) << var->data.location_frac; } - info->numInputs = std::max(info->numInputs, vary); + info_out->numInputs = std::max(info_out->numInputs, vary); } - info->numOutputs = 0; - nir_foreach_variable(var, &nir->outputs) { + nir_foreach_shader_out_variable(var, nir) { const glsl_type *type = var->type; int slot = var->data.location; uint16_t slots = calcSlots(type, prog->getType(), nir->info, false, var); - uint32_t comp = type->is_array() ? type->without_array()->component_slots() - : type->component_slots(); - uint32_t frac = var->data.location_frac; uint32_t vary = var->data.driver_location; - if (glsl_base_type_is_64bit(type->without_array()->base_type)) { - if (comp > 2) - slots *= 2; - } - assert(vary < PIPE_MAX_SHADER_OUTPUTS); switch(prog->getType()) { case Program::TYPE_FRAGMENT: - frag_result_to_tgsi_semantic((gl_frag_result)slot, &name, &index); + tgsi_get_gl_frag_result_semantic((gl_frag_result)slot, &name, &index); switch (name) { case TGSI_SEMANTIC_COLOR: if (!var->data.fb_fetch_output) - info->prop.fp.numColourResults++; - info->prop.fp.separateFragData = true; + info_out->prop.fp.numColourResults++; + if (var->data.location == FRAG_RESULT_COLOR && + nir->info.outputs_written & BITFIELD64_BIT(var->data.location)) + info_out->prop.fp.separateFragData = true; // sometimes we get FRAG_RESULT_DATAX with data.index 0 // sometimes we get FRAG_RESULT_DATA0 with data.index X index = index == 0 ? 
var->data.index : index; break; case TGSI_SEMANTIC_POSITION: - info->io.fragDepth = vary; - info->prop.fp.writesDepth = true; + info_out->io.fragDepth = vary; + info_out->prop.fp.writesDepth = true; break; case TGSI_SEMANTIC_SAMPLEMASK: - info->io.sampleMask = vary; + info_out->io.sampleMask = vary; break; default: break; @@ -1141,21 +1102,22 @@ bool Converter::assignSlots() { case Program::TYPE_TESSELLATION_CONTROL: case Program::TYPE_TESSELLATION_EVAL: case Program::TYPE_VERTEX: - varying_slot_to_tgsi_semantic((gl_varying_slot)slot, &name, &index); + tgsi_get_gl_varying_semantic((gl_varying_slot)slot, true, + &name, &index); if (var->data.patch && name != TGSI_SEMANTIC_TESSINNER && name != TGSI_SEMANTIC_TESSOUTER) - info->numPatchConstants = MAX2(info->numPatchConstants, index + slots); + info_out->numPatchConstants = MAX2(info_out->numPatchConstants, index + slots); switch (name) { case TGSI_SEMANTIC_CLIPDIST: - info->io.genUserClip = -1; + info_out->io.genUserClip = -1; break; case TGSI_SEMANTIC_CLIPVERTEX: clipVertexOutput = vary; break; case TGSI_SEMANTIC_EDGEFLAG: - info->io.edgeFlagOut = vary; + info_out->io.edgeFlagOut = vary; break; case TGSI_SEMANTIC_POSITION: if (clipVertexOutput < 0) @@ -1171,67 +1133,33 @@ bool Converter::assignSlots() { } for (uint16_t i = 0u; i < slots; ++i, ++vary) { - info->out[vary].id = vary; - info->out[vary].patch = var->data.patch; - info->out[vary].sn = name; - info->out[vary].si = index + i; - if (glsl_base_type_is_64bit(type->without_array()->base_type)) - if (i & 0x1) - info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) >> 0x4); - else - info->out[vary].mask |= (((1 << (comp * 2)) - 1) << (frac * 2) & 0xf); - else - info->out[vary].mask |= ((1 << comp) - 1) << frac; - - if (nir->info.outputs_read & 1ll << slot) - info->out[vary].oread = 1; + nv50_ir_varying *v = &info_out->out[vary]; + v->patch = var->data.patch; + v->sn = name; + v->si = index + i; + v->mask |= getMaskForType(type, i) << var->data.location_frac; + + if (nir->info.outputs_read & 1ull << slot) + v->oread = 1; } - info->numOutputs = std::max(info->numOutputs, vary); + info_out->numOutputs = std::max(info_out->numOutputs, vary); } - info->numSysVals = 0; - for (uint8_t i = 0; i < 64; ++i) { - if (!(nir->info.system_values_read & 1ll << i)) - continue; - - system_val_to_tgsi_semantic(i, &name, &index); - info->sv[info->numSysVals].sn = name; - info->sv[info->numSysVals].si = index; - info->sv[info->numSysVals].input = 0; // TODO inferSysValDirection(sn); + if (info_out->io.genUserClip > 0) { + info_out->io.clipDistances = info_out->io.genUserClip; - switch (i) { - case SYSTEM_VALUE_INSTANCE_ID: - info->io.instanceId = info->numSysVals; - break; - case SYSTEM_VALUE_TESS_LEVEL_INNER: - case SYSTEM_VALUE_TESS_LEVEL_OUTER: - info->sv[info->numSysVals].patch = 1; - break; - case SYSTEM_VALUE_VERTEX_ID: - info->io.vertexId = info->numSysVals; - break; - default: - break; - } - - info->numSysVals += 1; - } - - if (info->io.genUserClip > 0) { - info->io.clipDistances = info->io.genUserClip; - - const unsigned int nOut = (info->io.genUserClip + 3) / 4; + const unsigned int nOut = (info_out->io.genUserClip + 3) / 4; for (unsigned int n = 0; n < nOut; ++n) { - unsigned int i = info->numOutputs++; - info->out[i].id = i; - info->out[i].sn = TGSI_SEMANTIC_CLIPDIST; - info->out[i].si = n; - info->out[i].mask = ((1 << info->io.clipDistances) - 1) >> (n * 4); + unsigned int i = info_out->numOutputs++; + info_out->out[i].id = i; + info_out->out[i].sn = TGSI_SEMANTIC_CLIPDIST; + 
info_out->out[i].si = n; + info_out->out[i].mask = ((1 << info_out->io.clipDistances) - 1) >> (n * 4); } } - return info->assignSlots(info) == 0; + return info->assignSlots(info_out) == 0; } uint32_t @@ -1281,7 +1209,7 @@ Converter::getSlotAddress(nir_intrinsic_instr *insn, uint8_t idx, uint8_t slot) assert(!input || idx < PIPE_MAX_SHADER_INPUTS); assert(input || idx < PIPE_MAX_SHADER_OUTPUTS); - const nv50_ir_varying *vary = input ? info->in : info->out; + const nv50_ir_varying *vary = input ? info_out->in : info_out->out; return vary[idx].slot[slot] * 4; } @@ -1339,62 +1267,63 @@ Converter::storeTo(nir_intrinsic_instr *insn, DataFile file, operation op, } mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address), indirect0, - split[0])->perPatch = info->out[idx].patch; + split[0])->perPatch = info_out->out[idx].patch; mkStore(op, TYPE_U32, mkSymbol(file, 0, TYPE_U32, address + 4), indirect0, - split[1])->perPatch = info->out[idx].patch; + split[1])->perPatch = info_out->out[idx].patch; } else { if (op == OP_EXPORT) src = mkMov(getSSA(size), src, ty)->getDef(0); mkStore(op, ty, mkSymbol(file, 0, ty, address), indirect0, - src)->perPatch = info->out[idx].patch; + src)->perPatch = info_out->out[idx].patch; } } bool Converter::parseNIR() { - info->io.clipDistances = nir->info.clip_distance_array_size; - info->io.cullDistances = nir->info.cull_distance_array_size; + info_out->bin.tlsSpace = nir->scratch_size; + info_out->io.clipDistances = nir->info.clip_distance_array_size; + info_out->io.cullDistances = nir->info.cull_distance_array_size; + info_out->io.layer_viewport_relative = nir->info.layer_viewport_relative; switch(prog->getType()) { case Program::TYPE_COMPUTE: info->prop.cp.numThreads[0] = nir->info.cs.local_size[0]; info->prop.cp.numThreads[1] = nir->info.cs.local_size[1]; info->prop.cp.numThreads[2] = nir->info.cs.local_size[2]; - info->bin.smemSize = nir->info.cs.shared_size; + info_out->bin.smemSize += nir->info.cs.shared_size; break; case Program::TYPE_FRAGMENT: - info->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests; - info->prop.fp.persampleInvocation = + info_out->prop.fp.earlyFragTests = nir->info.fs.early_fragment_tests; + prog->persampleInvocation = (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_ID) || (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS); - info->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage; - info->prop.fp.readsSampleLocations = + info_out->prop.fp.postDepthCoverage = nir->info.fs.post_depth_coverage; + info_out->prop.fp.readsSampleLocations = (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS); - info->prop.fp.usesDiscard = nir->info.fs.uses_discard; - info->prop.fp.usesSampleMaskIn = + info_out->prop.fp.usesDiscard = nir->info.fs.uses_discard || nir->info.fs.uses_demote; + info_out->prop.fp.usesSampleMaskIn = !!(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN); break; case Program::TYPE_GEOMETRY: - info->prop.gp.inputPrim = nir->info.gs.input_primitive; - info->prop.gp.instanceCount = nir->info.gs.invocations; - info->prop.gp.maxVertices = nir->info.gs.vertices_out; - info->prop.gp.outputPrim = nir->info.gs.output_primitive; + info_out->prop.gp.instanceCount = nir->info.gs.invocations; + info_out->prop.gp.maxVertices = nir->info.gs.vertices_out; + info_out->prop.gp.outputPrim = nir->info.gs.output_primitive; break; case Program::TYPE_TESSELLATION_CONTROL: case Program::TYPE_TESSELLATION_EVAL: if (nir->info.tess.primitive_mode == GL_ISOLINES) - info->prop.tp.domain = GL_LINES; + info_out->prop.tp.domain = 
GL_LINES;
       else
-         info->prop.tp.domain = nir->info.tess.primitive_mode;
-      info->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out;
-      info->prop.tp.outputPrim =
+         info_out->prop.tp.domain = nir->info.tess.primitive_mode;
+      info_out->prop.tp.outputPatchSize = nir->info.tess.tcs_vertices_out;
+      info_out->prop.tp.outputPrim =
          nir->info.tess.point_mode ? PIPE_PRIM_POINTS : PIPE_PRIM_TRIANGLES;
-      info->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3;
-      info->prop.tp.winding = !nir->info.tess.ccw;
+      info_out->prop.tp.partitioning = (nir->info.tess.spacing + 1) % 3;
+      info_out->prop.tp.winding = !nir->info.tess.ccw;
       break;
    case Program::TYPE_VERTEX:
-      info->prop.vp.usesDrawParameters =
+      info_out->prop.vp.usesDrawParameters =
          (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX)) ||
         (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE)) ||
         (nir->info.system_values_read & BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID));
@@ -1409,8 +1338,6 @@ Converter::parseNIR()
 bool
 Converter::visit(nir_function *function)
 {
-   // we only support emiting the main function for now
-   assert(!strcmp(function->name, "main"));
    assert(function->impl);
 
    // usually the blocks will set everything up, but main is special
@@ -1422,7 +1349,7 @@ Converter::visit(nir_function *function)
 
    setPosition(entry, true);
 
-   if (info->io.genUserClip > 0) {
+   if (info_out->io.genUserClip > 0) {
       for (int c = 0; c < 4; ++c)
          clipVtx[c] = getScratch();
    }
@@ -1453,7 +1380,9 @@ Converter::visit(nir_function *function)
    bb->cfg.attach(&exit->cfg, Graph::Edge::TREE);
    setPosition(exit, true);
 
-   if (info->io.genUserClip > 0)
+   if ((prog->getType() == Program::TYPE_VERTEX ||
+        prog->getType() == Program::TYPE_TESSELLATION_EVAL)
+       && info_out->io.genUserClip > 0)
       handleUserClipPlanes();
 
    // TODO: for non-main functions this needs to be an OP_RETURN
@@ -1496,64 +1425,69 @@ Converter::visit(nir_block *block)
 bool
 Converter::visit(nir_if *nif)
 {
+   curIfDepth++;
+
    DataType sType = getSType(nif->condition, false, false);
    Value *src = getSrc(&nif->condition, 0);
 
    nir_block *lastThen = nir_if_last_then_block(nif);
    nir_block *lastElse = nir_if_last_else_block(nif);
 
-   assert(!lastThen->successors[1]);
-   assert(!lastElse->successors[1]);
-
+   BasicBlock *headBB = bb;
    BasicBlock *ifBB = convert(nir_if_first_then_block(nif));
    BasicBlock *elseBB = convert(nir_if_first_else_block(nif));
 
    bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE);
    bb->cfg.attach(&elseBB->cfg, Graph::Edge::TREE);
 
-   // we only insert joinats, if both nodes end up at the end of the if again.
-   // the reason for this to not happens are breaks/continues/ret/... which
-   // have their own handling
-   if (lastThen->successors[0] == lastElse->successors[0])
-      bb->joinAt = mkFlow(OP_JOINAT, convert(lastThen->successors[0]),
-                          CC_ALWAYS, NULL);
-
+   bool insertJoins = lastThen->successors[0] == lastElse->successors[0];
    mkFlow(OP_BRA, elseBB, CC_EQ, src)->setType(sType);
 
    foreach_list_typed(nir_cf_node, node, node, &nif->then_list) {
       if (!visit(node))
         return false;
    }
+
    setPosition(convert(lastThen), true);
-   if (!bb->getExit() ||
-       !bb->getExit()->asFlow() ||
-       bb->getExit()->asFlow()->op == OP_JOIN) {
+   if (!bb->isTerminated()) {
       BasicBlock *tailBB = convert(lastThen->successors[0]);
       mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
       bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
+   } else {
+      insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
    }
 
    foreach_list_typed(nir_cf_node, node, node, &nif->else_list) {
       if (!visit(node))
         return false;
    }
+
    setPosition(convert(lastElse), true);
-   if (!bb->getExit() ||
-       !bb->getExit()->asFlow() ||
-       bb->getExit()->asFlow()->op == OP_JOIN) {
+   if (!bb->isTerminated()) {
       BasicBlock *tailBB = convert(lastElse->successors[0]);
       mkFlow(OP_BRA, tailBB, CC_ALWAYS, NULL);
       bb->cfg.attach(&tailBB->cfg, Graph::Edge::FORWARD);
+   } else {
+      insertJoins = insertJoins && bb->getExit()->op == OP_BRA;
    }
 
-   if (lastThen->successors[0] == lastElse->successors[0]) {
-      setPosition(convert(lastThen->successors[0]), true);
+   /* only insert joins for the outermost if */
+   if (--curIfDepth)
+      insertJoins = false;
+
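+   /* Illustrative sketch of the bracketing this emits for a plain if/else
+    * in which both sides fall through to the same successor:
+    *
+    *    head: ... ; joinat conv ; bra (!cond) else
+    *    then: ... ; bra conv
+    *    else: ...
+    *    conv: join ; ...
+    *
+    * OP_JOINAT marks the reconvergence point up front and the fixed
+    * OP_JOIN emitted at conv is where the diverged threads actually
+    * reconverge; a break/continue/return on either side (the exit is not
+    * a plain OP_BRA) or a surrounding if suppresses the pair.
+    */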
+   /* we made sure that all threads would converge at the same block */
+   if (insertJoins) {
+      BasicBlock *conv = convert(lastThen->successors[0]);
+      setPosition(headBB->getExit(), false);
+      headBB->joinAt = mkFlow(OP_JOINAT, conv, CC_ALWAYS, NULL);
+      setPosition(conv, false);
       mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
    }
 
    return true;
 }
 
+// TODO: add convergence
 bool
 Converter::visit(nir_loop *loop)
 {
@@ -1561,8 +1495,8 @@ Converter::visit(nir_loop *loop)
    func->loopNestingBound = std::max(func->loopNestingBound, curLoopDepth);
 
    BasicBlock *loopBB = convert(nir_loop_first_block(loop));
-   BasicBlock *tailBB =
-      convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
+   BasicBlock *tailBB = convert(nir_cf_node_as_block(nir_cf_node_next(&loop->cf_node)));
+
    bb->cfg.attach(&loopBB->cfg, Graph::Edge::TREE);
 
    mkFlow(OP_PREBREAK, tailBB, CC_ALWAYS, NULL);
@@ -1573,19 +1507,15 @@ Converter::visit(nir_loop *loop)
       if (!visit(node))
          return false;
    }
-   Instruction *insn = bb->getExit();
-   if (bb->cfg.incidentCount() != 0) {
-      if (!insn || !insn->asFlow()) {
-         mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
-         bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
-      } else if (insn && insn->op == OP_BRA && !insn->getPredicate() &&
-                 tailBB->cfg.incidentCount() == 0) {
-         // RA doesn't like having blocks around with no incident edge,
-         // so we create a fake one to make it happy
-         bb->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
-      }
+
+   if (!bb->isTerminated()) {
+      mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
+      bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
    }
+
+   if (tailBB->cfg.incidentCount() == 0)
+      loopBB->cfg.attach(&tailBB->cfg, Graph::Edge::TREE);
+
    curLoopDepth -= 1;
 
    return true;
@@ -1594,6 +1524,8 @@
 bool
 Converter::visit(nir_instr *insn)
 {
+   // we need an insertion point for on-the-fly generated immediate loads
+   immInsertPos = bb->getExit();
    switch (insn->type) {
    case nir_instr_type_alu:
       return visit(nir_instr_as_alu(insn));
@@ -1626,6 +1558,7 @@
       return SV_DRAWID;
    case
nir_intrinsic_load_front_face: return SV_FACE; + case nir_intrinsic_is_helper_invocation: case nir_intrinsic_load_helper_invocation: return SV_THREAD_KILL; case nir_intrinsic_load_instance_id: @@ -1670,6 +1603,8 @@ Converter::convert(nir_intrinsic_op intr) return SV_VERTEX_ID; case nir_intrinsic_load_work_group_id: return SV_CTAID; + case nir_intrinsic_load_work_dim: + return SV_WORK_DIM; default: ERROR("unknown SVSemantic for nir_intrinsic_op %s\n", nir_intrinsic_infos[intr].name); @@ -1682,6 +1617,8 @@ bool Converter::visit(nir_intrinsic_instr *insn) { nir_intrinsic_op op = insn->intrinsic; + const nir_intrinsic_info &opInfo = nir_intrinsic_infos[op]; + unsigned dest_components = nir_intrinsic_dest_components(insn); switch (op) { case nir_intrinsic_load_uniform: { @@ -1689,7 +1626,7 @@ Converter::visit(nir_intrinsic_instr *insn) const DataType dType = getDType(insn); Value *indirect; uint32_t coffset = getIndirect(insn, 0, 0, indirect); - for (uint8_t i = 0; i < insn->num_components; ++i) { + for (uint8_t i = 0; i < dest_components; ++i) { loadFrom(FILE_MEMORY_CONST, 0, dType, newDefs[i], 16 * coffset, i, indirect); } break; @@ -1700,7 +1637,7 @@ Converter::visit(nir_intrinsic_instr *insn) DataType dType = getSType(insn->src[0], false, false); uint32_t idx = getIndirect(insn, op == nir_intrinsic_store_output ? 1 : 2, 0, indirect); - for (uint8_t i = 0u; i < insn->num_components; ++i) { + for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { if (!((1u << i) & nir_intrinsic_write_mask(insn))) continue; @@ -1708,7 +1645,7 @@ Converter::visit(nir_intrinsic_instr *insn) Value *src = getSrc(&insn->src[0], i); switch (prog->getType()) { case Program::TYPE_FRAGMENT: { - if (info->out[idx].sn == TGSI_SEMANTIC_POSITION) { + if (info_out->out[idx].sn == TGSI_SEMANTIC_POSITION) { // TGSI uses a different interface than NIR, TGSI stores that // value in the z component, NIR in X offset += 2; @@ -1716,8 +1653,10 @@ Converter::visit(nir_intrinsic_instr *insn) } break; } + case Program::TYPE_GEOMETRY: + case Program::TYPE_TESSELLATION_EVAL: case Program::TYPE_VERTEX: { - if (info->io.genUserClip > 0 && idx == clipVertexOutput) { + if (info_out->io.genUserClip > 0 && idx == (uint32_t)clipVertexOutput) { mkMov(clipVtx[i], src); src = clipVtx[i]; } @@ -1752,7 +1691,7 @@ Converter::visit(nir_intrinsic_instr *insn) srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_LAYER, 0))); srcs.push_back(mkOp1v(OP_RDSV, TYPE_U32, getSSA(), mkSysVal(SV_SAMPLE_INDEX, 0))); - for (uint8_t i = 0u; i < insn->num_components; ++i) { + for (uint8_t i = 0u; i < dest_components; ++i) { defs.push_back(newDefs[i]); mask |= 1 << i; } @@ -1764,7 +1703,7 @@ Converter::visit(nir_intrinsic_instr *insn) texi->tex.r = 0xffff; texi->tex.s = 0xffff; - info->prop.fp.readsFramebuffer = true; + info_out->prop.fp.readsFramebuffer = true; break; } @@ -1775,19 +1714,29 @@ Converter::visit(nir_intrinsic_instr *insn) uint32_t mode = 0; uint32_t idx = getIndirect(insn, op == nir_intrinsic_load_interpolated_input ? 1 : 0, 0, indirect); - nv50_ir_varying& vary = input ? info->in[idx] : info->out[idx]; + nv50_ir_varying& vary = input ? 
info_out->in[idx] : info_out->out[idx]; // see load_barycentric_* handling if (prog->getType() == Program::TYPE_FRAGMENT) { - mode = translateInterpMode(&vary, nvirOp); if (op == nir_intrinsic_load_interpolated_input) { ImmediateValue immMode; if (getSrc(&insn->src[0], 1)->getUniqueInsn()->src(0).getImmediate(immMode)) - mode |= immMode.reg.data.u32; + mode = immMode.reg.data.u32; + } + if (mode == NV50_IR_INTERP_DEFAULT) + mode |= translateInterpMode(&vary, nvirOp); + else { + if (vary.linear) { + nvirOp = OP_LINTERP; + mode |= NV50_IR_INTERP_LINEAR; + } else { + nvirOp = OP_PINTERP; + mode |= NV50_IR_INTERP_PERSPECTIVE; + } } } - for (uint8_t i = 0u; i < insn->num_components; ++i) { + for (uint8_t i = 0u; i < dest_components; ++i) { uint32_t address = getSlotAddress(insn, idx, i); Symbol *sym = mkSymbol(input ? FILE_SHADER_INPUT : FILE_SHADER_OUTPUT, 0, dType, address); if (prog->getType() == Program::TYPE_FRAGMENT) { @@ -1856,7 +1805,7 @@ Converter::visit(nir_intrinsic_instr *insn) } else if (op == nir_intrinsic_load_barycentric_pixel) { mode = NV50_IR_INTERP_DEFAULT; } else if (op == nir_intrinsic_load_barycentric_at_sample) { - info->prop.fp.readsSampleLocations = true; + info_out->prop.fp.readsSampleLocations = true; mkOp1(OP_PIXLD, TYPE_U32, newDefs[0], getSrc(&insn->src[0], 0))->subOp = NV50_IR_SUBOP_PIXLD_OFFSET; mode = NV50_IR_INTERP_OFFSET; } else { @@ -1866,9 +1815,11 @@ Converter::visit(nir_intrinsic_instr *insn) loadImm(newDefs[1], mode); break; } + case nir_intrinsic_demote: case nir_intrinsic_discard: mkOp(OP_DISCARD, TYPE_NONE, NULL); break; + case nir_intrinsic_demote_if: case nir_intrinsic_discard_if: { Value *pred = getSSA(1, FILE_PREDICATE); if (insn->num_components > 1) { @@ -1884,6 +1835,7 @@ Converter::visit(nir_intrinsic_instr *insn) case nir_intrinsic_load_base_instance: case nir_intrinsic_load_draw_id: case nir_intrinsic_load_front_face: + case nir_intrinsic_is_helper_invocation: case nir_intrinsic_load_helper_invocation: case nir_intrinsic_load_instance_id: case nir_intrinsic_load_invocation_id: @@ -1905,12 +1857,13 @@ Converter::visit(nir_intrinsic_instr *insn) case nir_intrinsic_load_tess_level_inner: case nir_intrinsic_load_tess_level_outer: case nir_intrinsic_load_vertex_id: - case nir_intrinsic_load_work_group_id: { + case nir_intrinsic_load_work_group_id: + case nir_intrinsic_load_work_dim: { const DataType dType = getDType(insn); SVSemantic sv = convert(op); LValues &newDefs = convert(&insn->dest); - for (uint8_t i = 0u; i < insn->num_components; ++i) { + for (uint8_t i = 0u; i < nir_intrinsic_dest_components(insn); ++i) { Value *def; if (typeSizeof(dType) == 8) def = getSSA(); @@ -1962,17 +1915,455 @@ Converter::visit(nir_intrinsic_instr *insn) if (op == nir_intrinsic_read_first_invocation) { mkOp1(OP_VOTE, TYPE_U32, tmp, mkImm(1))->subOp = NV50_IR_SUBOP_VOTE_ANY; - mkOp2(OP_EXTBF, TYPE_U32, tmp, tmp, mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV; + mkOp1(OP_BREV, TYPE_U32, tmp, tmp); mkOp1(OP_BFIND, TYPE_U32, tmp, tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT; } else tmp = getSrc(&insn->src[1], 0); - for (uint8_t i = 0; i < insn->num_components; ++i) { + for (uint8_t i = 0; i < dest_components; ++i) { mkOp3(OP_SHFL, dType, newDefs[i], getSrc(&insn->src[0], i), tmp, mkImm(0x1f)) ->subOp = NV50_IR_SUBOP_SHFL_IDX; } break; } + case nir_intrinsic_load_per_vertex_input: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *indirectVertex; + Value *indirectOffset; + uint32_t baseVertex = getIndirect(&insn->src[0], 0, 
indirectVertex); + uint32_t idx = getIndirect(insn, 1, 0, indirectOffset); + + Value *vtxBase = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(4, FILE_ADDRESS), + mkImm(baseVertex), indirectVertex); + for (uint8_t i = 0u; i < dest_components; ++i) { + uint32_t address = getSlotAddress(insn, idx, i); + loadFrom(FILE_SHADER_INPUT, 0, dType, newDefs[i], address, 0, + indirectOffset, vtxBase, info_out->in[idx].patch); + } + break; + } + case nir_intrinsic_load_per_vertex_output: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *indirectVertex; + Value *indirectOffset; + uint32_t baseVertex = getIndirect(&insn->src[0], 0, indirectVertex); + uint32_t idx = getIndirect(insn, 1, 0, indirectOffset); + Value *vtxBase = NULL; + + if (indirectVertex) + vtxBase = indirectVertex; + else + vtxBase = loadImm(NULL, baseVertex); + + vtxBase = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, FILE_ADDRESS), outBase, vtxBase); + + for (uint8_t i = 0u; i < dest_components; ++i) { + uint32_t address = getSlotAddress(insn, idx, i); + loadFrom(FILE_SHADER_OUTPUT, 0, dType, newDefs[i], address, 0, + indirectOffset, vtxBase, info_out->in[idx].patch); + } + break; + } + case nir_intrinsic_emit_vertex: { + if (info_out->io.genUserClip > 0) + handleUserClipPlanes(); + uint32_t idx = nir_intrinsic_stream_id(insn); + mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1; + break; + } + case nir_intrinsic_end_primitive: { + uint32_t idx = nir_intrinsic_stream_id(insn); + if (idx) + break; + mkOp1(getOperation(op), TYPE_U32, NULL, mkImm(idx))->fixed = 1; + break; + } + case nir_intrinsic_load_ubo: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *indirectIndex; + Value *indirectOffset; + uint32_t index = getIndirect(&insn->src[0], 0, indirectIndex) + 1; + uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); + + for (uint8_t i = 0u; i < dest_components; ++i) { + loadFrom(FILE_MEMORY_CONST, index, dType, newDefs[i], offset, i, + indirectOffset, indirectIndex); + } + break; + } + case nir_intrinsic_get_buffer_size: { + LValues &newDefs = convert(&insn->dest); + const DataType dType = getDType(insn); + Value *indirectBuffer; + uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer); + + Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, dType, 0); + mkOp1(OP_BUFQ, dType, newDefs[0], sym)->setIndirect(0, 0, indirectBuffer); + break; + } + case nir_intrinsic_store_ssbo: { + DataType sType = getSType(insn->src[0], false, false); + Value *indirectBuffer; + Value *indirectOffset; + uint32_t buffer = getIndirect(&insn->src[1], 0, indirectBuffer); + uint32_t offset = getIndirect(&insn->src[2], 0, indirectOffset); + + for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) { + if (!((1u << i) & nir_intrinsic_write_mask(insn))) + continue; + Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, sType, + offset + i * typeSizeof(sType)); + mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i)) + ->setIndirect(0, 1, indirectBuffer); + } + info_out->io.globalAccess |= 0x2; + break; + } + case nir_intrinsic_load_ssbo: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *indirectBuffer; + Value *indirectOffset; + uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer); + uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); + + for (uint8_t i = 0u; i < dest_components; ++i) + loadFrom(FILE_MEMORY_BUFFER, buffer, dType, newDefs[i], offset, i, + indirectOffset, 
indirectBuffer); + + info_out->io.globalAccess |= 0x1; + break; + } + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_xor: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *indirectOffset; + uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset); + Symbol *sym = mkSymbol(FILE_MEMORY_SHARED, 0, dType, offset); + Instruction *atom = mkOp2(OP_ATOM, dType, newDefs[0], sym, getSrc(&insn->src[1], 0)); + if (op == nir_intrinsic_shared_atomic_comp_swap) + atom->setSrc(2, getSrc(&insn->src[2], 0)); + atom->setIndirect(0, 0, indirectOffset); + atom->subOp = getSubOp(op); + break; + } + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_imin: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_xor: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *indirectBuffer; + Value *indirectOffset; + uint32_t buffer = getIndirect(&insn->src[0], 0, indirectBuffer); + uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset); + + Symbol *sym = mkSymbol(FILE_MEMORY_BUFFER, buffer, dType, offset); + Instruction *atom = mkOp2(OP_ATOM, dType, newDefs[0], sym, + getSrc(&insn->src[2], 0)); + if (op == nir_intrinsic_ssbo_atomic_comp_swap) + atom->setSrc(2, getSrc(&insn->src[3], 0)); + atom->setIndirect(0, 0, indirectOffset); + atom->setIndirect(0, 1, indirectBuffer); + atom->subOp = getSubOp(op); + + info_out->io.globalAccess |= 0x2; + break; + } + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_xor: { + const DataType dType = getDType(insn); + LValues &newDefs = convert(&insn->dest); + Value *address; + uint32_t offset = getIndirect(&insn->src[0], 0, address); + + Symbol *sym = mkSymbol(FILE_MEMORY_GLOBAL, 0, dType, offset); + Instruction *atom = + mkOp2(OP_ATOM, dType, newDefs[0], sym, getSrc(&insn->src[1], 0)); + if (op == nir_intrinsic_global_atomic_comp_swap) + atom->setSrc(2, getSrc(&insn->src[2], 0)); + atom->setIndirect(0, 0, address); + atom->subOp = getSubOp(op); + + info_out->io.globalAccess |= 0x2; + break; + } + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case 
nir_intrinsic_bindless_image_atomic_inc_wrap:
+   case nir_intrinsic_bindless_image_atomic_dec_wrap:
+   case nir_intrinsic_bindless_image_load:
+   case nir_intrinsic_bindless_image_samples:
+   case nir_intrinsic_bindless_image_size:
+   case nir_intrinsic_bindless_image_store:
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_image_atomic_comp_swap:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_imax:
+   case nir_intrinsic_image_atomic_umax:
+   case nir_intrinsic_image_atomic_imin:
+   case nir_intrinsic_image_atomic_umin:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_image_atomic_inc_wrap:
+   case nir_intrinsic_image_atomic_dec_wrap:
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_image_samples:
+   case nir_intrinsic_image_size:
+   case nir_intrinsic_image_store: {
+      std::vector<Value*> srcs, defs;
+      Value *indirect;
+      DataType ty;
+
+      uint32_t mask = 0;
+      TexInstruction::Target target =
+         convert(nir_intrinsic_image_dim(insn), !!nir_intrinsic_image_array(insn), false);
+      unsigned int argCount = getNIRArgCount(target);
+      uint16_t location = 0;
+
+      if (opInfo.has_dest) {
+         LValues &newDefs = convert(&insn->dest);
+         for (uint8_t i = 0u; i < newDefs.size(); ++i) {
+            defs.push_back(newDefs[i]);
+            mask |= 1 << i;
+         }
+      }
+
+      int lod_src = -1;
+      bool bindless = false;
+      switch (op) {
+      case nir_intrinsic_bindless_image_atomic_add:
+      case nir_intrinsic_bindless_image_atomic_and:
+      case nir_intrinsic_bindless_image_atomic_comp_swap:
+      case nir_intrinsic_bindless_image_atomic_exchange:
+      case nir_intrinsic_bindless_image_atomic_imax:
+      case nir_intrinsic_bindless_image_atomic_umax:
+      case nir_intrinsic_bindless_image_atomic_imin:
+      case nir_intrinsic_bindless_image_atomic_umin:
+      case nir_intrinsic_bindless_image_atomic_or:
+      case nir_intrinsic_bindless_image_atomic_xor:
+      case nir_intrinsic_bindless_image_atomic_inc_wrap:
+      case nir_intrinsic_bindless_image_atomic_dec_wrap:
+         ty = getDType(insn);
+         bindless = true;
+         info_out->io.globalAccess |= 0x2;
+         mask = 0x1;
+         break;
+      case nir_intrinsic_image_atomic_add:
+      case nir_intrinsic_image_atomic_and:
+      case nir_intrinsic_image_atomic_comp_swap:
+      case nir_intrinsic_image_atomic_exchange:
+      case nir_intrinsic_image_atomic_imax:
+      case nir_intrinsic_image_atomic_umax:
+      case nir_intrinsic_image_atomic_imin:
+      case nir_intrinsic_image_atomic_umin:
+      case nir_intrinsic_image_atomic_or:
+      case nir_intrinsic_image_atomic_xor:
+      case nir_intrinsic_image_atomic_inc_wrap:
+      case nir_intrinsic_image_atomic_dec_wrap:
+         ty = getDType(insn);
+         bindless = false;
+         info_out->io.globalAccess |= 0x2;
+         mask = 0x1;
+         break;
+      case nir_intrinsic_bindless_image_load:
+      case nir_intrinsic_image_load:
+         ty = TYPE_U32;
+         bindless = op == nir_intrinsic_bindless_image_load;
+         info_out->io.globalAccess |= 0x1;
+         lod_src = 4;
+         break;
+      case nir_intrinsic_bindless_image_store:
+      case nir_intrinsic_image_store:
+         ty = TYPE_U32;
+         mask = 0xf;
+         bindless = op == nir_intrinsic_bindless_image_store;
+         info_out->io.globalAccess |= 0x2;
+         lod_src = 5;
+         mask = 0xf;
+         break;
+      case nir_intrinsic_bindless_image_samples:
+         mask = 0x8;
+      case nir_intrinsic_image_samples:
+         ty = TYPE_U32;
+         bindless = op == nir_intrinsic_bindless_image_samples;
+         mask = 0x8;
+         break;
+      case nir_intrinsic_bindless_image_size:
+      case nir_intrinsic_image_size:
+         assert(nir_src_as_uint(insn->src[1]) == 0);
+         ty = TYPE_U32;
+         bindless = op == nir_intrinsic_bindless_image_size;
+         break;
+      default:
+         unreachable("unhandled image opcode");
+         break;
+      }
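+
+      // Illustrative summary of the dispatch above: a non-MS 2D
+      // image_store, e.g., leaves ty = TYPE_U32, mask = 0xf and
+      // lod_src = 5, so the gathering below pushes two coordinates from
+      // src[1], skips the sample index (no MS) and the LOD in src[4],
+      // and appends the four data components from src[3].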
+
+      if (bindless)
+         indirect = getSrc(&insn->src[0], 0);
+      else
+         location = getIndirect(&insn->src[0], 0, indirect);
+
+      // coords
+      if (opInfo.num_srcs >= 2)
+         for (unsigned int i = 0u; i < argCount; ++i)
+            srcs.push_back(getSrc(&insn->src[1], i));
+
+      // the sampler is just another src added after coords
+      if (opInfo.num_srcs >= 3 && target.isMS())
+         srcs.push_back(getSrc(&insn->src[2], 0));
+
+      if (opInfo.num_srcs >= 4 && lod_src != 4) {
+         unsigned components = opInfo.src_components[3] ? opInfo.src_components[3] : insn->num_components;
+         for (uint8_t i = 0u; i < components; ++i)
+            srcs.push_back(getSrc(&insn->src[3], i));
+      }
+
+      if (opInfo.num_srcs >= 5 && lod_src != 5)
+         // 1 for atomic swap
+         for (uint8_t i = 0u; i < opInfo.src_components[4]; ++i)
+            srcs.push_back(getSrc(&insn->src[4], i));
+
+      TexInstruction *texi = mkTex(getOperation(op), target.getEnum(), location, 0, defs, srcs);
+      texi->tex.bindless = bindless;
+      texi->tex.format = nv50_ir::TexInstruction::translateImgFormat(nir_intrinsic_format(insn));
+      texi->tex.mask = mask;
+      texi->cache = convert(nir_intrinsic_access(insn));
+      texi->setType(ty);
+      texi->subOp = getSubOp(op);
+
+      if (indirect)
+         texi->setIndirectR(indirect);
+
+      break;
+   }
+   case nir_intrinsic_store_scratch:
+   case nir_intrinsic_store_shared: {
+      DataType sType = getSType(insn->src[0], false, false);
+      Value *indirectOffset;
+      uint32_t offset = getIndirect(&insn->src[1], 0, indirectOffset);
+
+      for (uint8_t i = 0u; i < nir_intrinsic_src_components(insn, 0); ++i) {
+         if (!((1u << i) & nir_intrinsic_write_mask(insn)))
+            continue;
+         Symbol *sym = mkSymbol(getFile(op), 0, sType, offset + i * typeSizeof(sType));
+         mkStore(OP_STORE, sType, sym, indirectOffset, getSrc(&insn->src[0], i));
+      }
+      break;
+   }
+   case nir_intrinsic_load_kernel_input:
+   case nir_intrinsic_load_scratch:
+   case nir_intrinsic_load_shared: {
+      const DataType dType = getDType(insn);
+      LValues &newDefs = convert(&insn->dest);
+      Value *indirectOffset;
+      uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
+
+      for (uint8_t i = 0u; i < dest_components; ++i)
+         loadFrom(getFile(op), 0, dType, newDefs[i], offset, i, indirectOffset);
+
+      break;
+   }
+   case nir_intrinsic_control_barrier: {
+      // TODO: add flag to shader_info
+      info_out->numBarriers = 1;
+      Instruction *bar = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0));
+      bar->fixed = 1;
+      bar->subOp = NV50_IR_SUBOP_BAR_SYNC;
+      break;
+   }
+   case nir_intrinsic_group_memory_barrier:
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_memory_barrier_buffer:
+   case nir_intrinsic_memory_barrier_image:
+   case nir_intrinsic_memory_barrier_shared: {
+      Instruction *bar = mkOp(OP_MEMBAR, TYPE_NONE, NULL);
+      bar->fixed = 1;
+      bar->subOp = getSubOp(op);
+      break;
+   }
+   case nir_intrinsic_memory_barrier_tcs_patch:
+      break;
+   case nir_intrinsic_shader_clock: {
+      const DataType dType = getDType(insn);
+      LValues &newDefs = convert(&insn->dest);
+
+      loadImm(newDefs[0], 0u);
+      mkOp1(OP_RDSV, dType, newDefs[1], mkSysVal(SV_CLOCK, 0))->fixed = 1;
+      break;
+   }
+   case nir_intrinsic_load_global: {
+      const DataType dType = getDType(insn);
+      LValues &newDefs = convert(&insn->dest);
+      Value *indirectOffset;
+      uint32_t offset = getIndirect(&insn->src[0], 0, indirectOffset);
+
+      for (auto i = 0u; i < dest_components; ++i)
+         loadFrom(FILE_MEMORY_GLOBAL, 0, dType, newDefs[i], offset, i, indirectOffset);
+
+      info_out->io.globalAccess |= 0x1;
+      break;
+   }
+   case nir_intrinsic_store_global: {
+      DataType 
@@ -1994,7 +2385,6 @@ Converter::visit(nir_jump_instr *insn)
   case nir_jump_continue: {
      bool isBreak = insn->type == nir_jump_break;
      nir_block *block = insn->instr.block;
-      assert(!block->successors[1]);
      BasicBlock *target = convert(block->successors[0]);
      mkFlow(isBreak ? OP_BREAK : OP_CONT, target, CC_ALWAYS, NULL);
      bb->cfg.attach(&target->cfg, isBreak ? Graph::Edge::CROSS : Graph::Edge::BACK);
@@ -2008,28 +2398,41 @@ Converter::visit(nir_jump_instr *insn)
   return true;
 }
 
+Value*
+Converter::convert(nir_load_const_instr *insn, uint8_t idx)
+{
+   Value *val;
+
+   if (immInsertPos)
+      setPosition(immInsertPos, true);
+   else
+      setPosition(bb, false);
+
+   switch (insn->def.bit_size) {
+   case 64:
+      val = loadImm(getSSA(8), insn->value[idx].u64);
+      break;
+   case 32:
+      val = loadImm(getSSA(4), insn->value[idx].u32);
+      break;
+   case 16:
+      val = loadImm(getSSA(2), insn->value[idx].u16);
+      break;
+   case 8:
+      val = loadImm(getSSA(1), insn->value[idx].u8);
+      break;
+   default:
+      unreachable("unhandled bit size!\n");
+   }
+   setPosition(bb, true);
+   return val;
+}
+
 bool
 Converter::visit(nir_load_const_instr *insn)
 {
    assert(insn->def.bit_size <= 64);
-
-   LValues &newDefs = convert(&insn->def);
-   for (int i = 0; i < insn->def.num_components; i++) {
-      switch (insn->def.bit_size) {
-      case 64:
-         loadImm(newDefs[i], insn->value.u64[i]);
-         break;
-      case 32:
-         loadImm(newDefs[i], insn->value.u32[i]);
-         break;
-      case 16:
-         loadImm(newDefs[i], insn->value.u16[i]);
-         break;
-      case 8:
-         loadImm(newDefs[i], insn->value.u8[i]);
-         break;
-      }
-   }
+   immediates[insn->def.index] = insn;
    return true;
 }
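With this change, visit(nir_load_const_instr) emits nothing: it only records the instruction in the `immediates` map, and convert(nir_load_const_instr*, uint8_t) materializes a single component later, at immInsertPos when one is set. A standalone sketch of the same defer-then-materialize pattern; ConstInstr, Value, and the container are illustrations, not the codegen API:

#include <cstdint>
#include <deque>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for the real IR types; illustration only.
struct Value { uint64_t bits; };
struct ConstInstr { unsigned index; std::vector<uint64_t> comps; };

static std::deque<Value> emitted;  // deque keeps addresses stable across pushes
static std::unordered_map<unsigned, const ConstInstr *> immediates;

// visit(): defer -- remember the constant, emit no code yet.
static void visitLoadConst(const ConstInstr *insn)
{
   immediates[insn->index] = insn;
}

// convert(): materialize one component at its point of use.
static Value *materialize(unsigned ssaIndex, uint8_t comp)
{
   emitted.push_back(Value{immediates.at(ssaIndex)->comps[comp]});
   return &emitted.back();
}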
@@ -2057,7 +2460,6 @@ Converter::visit(nir_alu_instr *insn)
   case nir_op_iabs:
   case nir_op_fadd:
   case nir_op_iadd:
-   case nir_op_fand:
   case nir_op_iand:
   case nir_op_fceil:
   case nir_op_fcos:
@@ -2089,9 +2491,7 @@ Converter::visit(nir_alu_instr *insn)
   case nir_op_umul_high:
   case nir_op_fneg:
   case nir_op_ineg:
-   case nir_op_fnot:
   case nir_op_inot:
-   case nir_op_for:
   case nir_op_ior:
   case nir_op_pack_64_2x32_split:
   case nir_op_fpow:
@@ -2104,11 +2504,8 @@ Converter::visit(nir_alu_instr *insn)
   case nir_op_ushr:
   case nir_op_fsin:
   case nir_op_fsqrt:
-   case nir_op_fsub:
-   case nir_op_isub:
   case nir_op_ftrunc:
   case nir_op_ishl:
-   case nir_op_fxor:
   case nir_op_ixor: {
      DEFAULT_CHECKS;
      LValues &newDefs = convert(&insn->dest);
@@ -2178,7 +2575,7 @@ Converter::visit(nir_alu_instr *insn)
   case nir_op_flt32:
   case nir_op_ilt32:
   case nir_op_ult32:
-   case nir_op_fne32:
+   case nir_op_fneu32:
   case nir_op_ine32: {
      DEFAULT_CHECKS;
      LValues &newDefs = convert(&insn->dest);
@@ -2194,14 +2591,12 @@ Converter::visit(nir_alu_instr *insn)
      i->sType = sTypes[0];
      break;
   }
-   // those are weird ALU ops and need special handling, because
-   // 1. they are always componend based
-   // 2. they basically just merge multiple values into one data type
-   case nir_op_imov:
-   case nir_op_fmov:
+   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
-   case nir_op_vec4: {
+   case nir_op_vec4:
+   case nir_op_vec8:
+   case nir_op_vec16: {
      LValues &newDefs = convert(&insn->dest);
      for (LValues::size_type c = 0u; c < newDefs.size(); ++c) {
         mkMov(newDefs[c], getSrc(&insn->src[c]), dType);
@@ -2297,7 +2692,7 @@ Converter::visit(nir_alu_instr *insn)
   case nir_op_bfm: {
      DEFAULT_CHECKS;
      LValues &newDefs = convert(&insn->dest);
-      mkOp3(OP_INSBF, dType, newDefs[0], getSrc(&insn->src[0]), loadImm(NULL, 0x808), getSrc(&insn->src[1]));
+      mkOp2(OP_BMSK, dType, newDefs[0], getSrc(&insn->src[1]), getSrc(&insn->src[0]))->subOp = NV50_IR_SUBOP_BMSK_W;
      break;
   }
   case nir_op_bitfield_insert: {
@@ -2317,17 +2712,69 @@ Converter::visit(nir_alu_instr *insn)
   case nir_op_bitfield_reverse: {
      DEFAULT_CHECKS;
      LValues &newDefs = convert(&insn->dest);
-      mkOp2(OP_EXTBF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
+      mkOp1(OP_BREV, TYPE_U32, newDefs[0], getSrc(&insn->src[0]));
      break;
   }
   case nir_op_find_lsb: {
      DEFAULT_CHECKS;
      LValues &newDefs = convert(&insn->dest);
      Value *tmp = getSSA();
-      mkOp2(OP_EXTBF, TYPE_U32, tmp, getSrc(&insn->src[0]), mkImm(0x2000))->subOp = NV50_IR_SUBOP_EXTBF_REV;
+      mkOp1(OP_BREV, TYPE_U32, tmp, getSrc(&insn->src[0]));
      mkOp1(OP_BFIND, TYPE_U32, newDefs[0], tmp)->subOp = NV50_IR_SUBOP_BFIND_SAMT;
      break;
   }
+   case nir_op_extract_u8: {
+      DEFAULT_CHECKS;
+      LValues &newDefs = convert(&insn->dest);
+      Value *prmt = getSSA();
+      mkOp2(OP_OR, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x4440));
+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
+      break;
+   }
+   case nir_op_extract_i8: {
+      DEFAULT_CHECKS;
+      LValues &newDefs = convert(&insn->dest);
+      Value *prmt = getSSA();
+      mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x1111), loadImm(NULL, 0x8880));
+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
+      break;
+   }
+   case nir_op_extract_u16: {
+      DEFAULT_CHECKS;
+      LValues &newDefs = convert(&insn->dest);
+      Value *prmt = getSSA();
+      mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x22), loadImm(NULL, 0x4410));
+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
+      break;
+   }
+   case nir_op_extract_i16: {
+      DEFAULT_CHECKS;
+      LValues &newDefs = convert(&insn->dest);
+      Value *prmt = getSSA();
+      mkOp3(OP_MAD, TYPE_U32, prmt, getSrc(&insn->src[1]), loadImm(NULL, 0x2222), loadImm(NULL, 0x9910));
+      mkOp3(OP_PERMT, TYPE_U32, newDefs[0], getSrc(&insn->src[0]), prmt, loadImm(NULL, 0));
+      break;
+   }
+   case nir_op_urol: {
+      DEFAULT_CHECKS;
+      LValues &newDefs = convert(&insn->dest);
+      mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
+            getSrc(&insn->src[1]), getSrc(&insn->src[0]))
+         ->subOp = NV50_IR_SUBOP_SHF_L |
+                   NV50_IR_SUBOP_SHF_W |
+                   NV50_IR_SUBOP_SHF_HI;
+      break;
+   }
+   case nir_op_uror: {
+      DEFAULT_CHECKS;
+      LValues &newDefs = convert(&insn->dest);
+      mkOp3(OP_SHF, TYPE_U32, newDefs[0], getSrc(&insn->src[0]),
+            getSrc(&insn->src[1]), getSrc(&insn->src[0]))
+         ->subOp = NV50_IR_SUBOP_SHF_R |
                   NV50_IR_SUBOP_SHF_W |
                   NV50_IR_SUBOP_SHF_LO;
+      break;
+   }
   // boolean conversions
   case nir_op_b2f32: {
      DEFAULT_CHECKS;
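The extract_* cases above build a PRMT selector arithmetically: 0x4440 | i zero-extends byte i (nibble 4 selects byte 0 of the zero second operand), while i * 0x1111 + 0x8880 sets bit 3 in the upper three nibbles so the hardware replicates the selected byte's sign bit. A software model of that selector semantics (PTX-style prmt default mode; the operand order of this helper is part of the sketch, not necessarily the IR's):

#include <cassert>
#include <cstdint>

// Result byte k is chosen by selector nibble k: bits 0-2 index the eight
// bytes of {a, b}, bit 3 replicates the selected byte's sign bit.
static uint32_t prmt(uint32_t a, uint32_t b, uint32_t sel)
{
   uint64_t pool = ((uint64_t)b << 32) | a;
   uint32_t result = 0;
   for (unsigned k = 0; k < 4; k++) {
      uint32_t nib = (sel >> (4 * k)) & 0xf;
      uint8_t byte = (uint8_t)(pool >> (8 * (nib & 0x7)));
      if (nib & 0x8)
         byte = (byte & 0x80) ? 0xff : 0x00; // replicate the sign bit
      result |= (uint32_t)byte << (8 * k);
   }
   return result;
}

int main()
{
   uint32_t x = 0xdeadbeef;
   assert(prmt(x, 0, 0x4440 | 2) == 0x000000ad);          // extract_u8(x, 2)
   assert(prmt(x, 0, 2 * 0x1111 + 0x8880) == 0xffffffad); // extract_i8(x, 2)
   assert(prmt(x, 0, 1 * 0x22 + 0x4410) == 0x0000dead);   // extract_u16(x, 1)
   assert(prmt(x, 0, 0 * 0x2222 + 0x9910) == 0xffffbeef); // extract_i16(x, 0)
   return 0;
}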
ERROR("unknown nir_op %s\n", info.name); + assert(false); return false; } @@ -2452,6 +2900,27 @@ Converter::applyProjection(Value *src, Value *proj) return mkOp2v(OP_MUL, TYPE_F32, getScratch(), src, proj); } +unsigned int +Converter::getNIRArgCount(TexInstruction::Target& target) +{ + unsigned int result = target.getArgCount(); + if (target.isCube() && target.isArray()) + result--; + if (target.isMS()) + result--; + return result; +} + +CacheMode +Converter::convert(enum gl_access_qualifier access) +{ + if (access & ACCESS_VOLATILE) + return CACHE_CV; + if (access & ACCESS_COHERENT) + return CACHE_CG; + return CACHE_CA; +} + bool Converter::visit(nir_tex_instr *insn) { @@ -2489,6 +2958,11 @@ Converter::visit(nir_tex_instr *insn) int projIdx = nir_tex_instr_src_index(insn, nir_tex_src_projector); int sampOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_offset); int texOffIdx = nir_tex_instr_src_index(insn, nir_tex_src_texture_offset); + int sampHandleIdx = nir_tex_instr_src_index(insn, nir_tex_src_sampler_handle); + int texHandleIdx = nir_tex_instr_src_index(insn, nir_tex_src_texture_handle); + + bool bindless = sampHandleIdx != -1 || texHandleIdx != -1; + assert((sampHandleIdx != -1) == (texHandleIdx != -1)); if (projIdx != -1) proj = mkOp1v(OP_RCP, TYPE_F32, getScratch(), getSrc(&insn->src[projIdx].src, 0)); @@ -2532,9 +3006,19 @@ Converter::visit(nir_tex_instr *insn) srcs.push_back(getSrc(&insn->src[sampOffIdx].src, 0)); sampOffIdx = srcs.size() - 1; } + if (bindless) { + // currently we use the lower bits + Value *split[2]; + Value *handle = getSrc(&insn->src[sampHandleIdx].src, 0); - r = insn->texture_index; - s = insn->sampler_index; + mkSplit(split, 4, handle); + + srcs.push_back(split[0]); + texOffIdx = srcs.size() - 1; + } + + r = bindless ? 0xff : insn->texture_index; + s = bindless ? 
 
   defs.resize(newDefs.size());
   for (uint8_t d = 0u; d < newDefs.size(); ++d) {
@@ -2547,6 +3031,7 @@ Converter::visit(nir_tex_instr *insn)
   TexInstruction *texi = mkTex(op, target.getEnum(), r, s, defs, srcs);
   texi->tex.levelZero = lz;
   texi->tex.mask = mask;
+   texi->tex.bindless = bindless;
 
   if (texOffIdx != -1)
      texi->tex.rIndirectSrc = texOffIdx;
@@ -2584,6 +3069,20 @@ Converter::visit(nir_tex_instr *insn)
      }
   }
 
+   if (op == OP_TXG && offsetIdx == -1) {
+      if (nir_tex_instr_has_explicit_tg4_offsets(insn)) {
+         texi->tex.useOffsets = 4;
+         setPosition(texi, false);
+         for (uint8_t i = 0; i < 4; ++i) {
+            for (uint8_t j = 0; j < 2; ++j) {
+               texi->offset[i][j].set(loadImm(NULL, insn->tg4_offsets[i][j]));
+               texi->offset[i][j].setInsn(texi);
+            }
+         }
+         setPosition(texi, true);
+      }
+   }
+
   if (ddxIdx != -1 && ddyIdx != -1) {
      for (uint8_t c = 0u; c < target.getDim() + target.isCube(); ++c) {
         texi->dPdx[c].set(getSrc(&insn->src[ddxIdx].src, c));
@@ -2613,14 +3112,30 @@ Converter::run()
      .ballot_bit_size = 32,
   };
 
-   NIR_PASS_V(nir, nir_lower_io, nir_var_all, type_size, (nir_lower_io_options)0);
-   NIR_PASS_V(nir, nir_lower_subgroups, &subgroup_options);
+   /* prepare for IO lowering */
+   NIR_PASS_V(nir, nir_opt_deref);
   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
-   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-   NIR_PASS_V(nir, nir_lower_alu_to_scalar);
+
+   /* codegen assumes vec4 alignment for memory */
+   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, function_temp_type_info);
+   NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset);
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
+
+   NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
+              type_size, (nir_lower_io_options)0);
+
+   NIR_PASS_V(nir, nir_lower_subgroups, &subgroup_options);
+
+   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
   NIR_PASS_V(nir, nir_lower_phis_to_scalar);
 
+   /* TODO: improve this lowering/optimisation loop so that we can use
+    * nir_opt_idiv_const effectively before this.
+    */
+   NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_precise);
+
   do {
      progress = false;
      NIR_PASS(progress, nir, nir_copy_prop);
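The added passes feed the pre-existing fixed-point loop whose head appears in the context lines above. The idiom is standard NIR: NIR_PASS ors a progress flag, and the loop repeats until no pass changes the shader. A sketch of the shape with an illustrative pass list (not this file's exact one), assuming the usual NIR headers are in scope:

// Standard NIR fixed-point optimization loop: repeat until stable.
bool progress;
do {
   progress = false;
   NIR_PASS(progress, nir, nir_copy_prop);      // forward-propagate copies
   NIR_PASS(progress, nir, nir_opt_dce);        // drop dead instructions
   NIR_PASS(progress, nir, nir_opt_cse);        // merge identical expressions
   NIR_PASS(progress, nir, nir_opt_algebraic);  // pattern-based simplification
} while (progress);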
+ */ + NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_precise); + do { progress = false; NIR_PASS(progress, nir, nir_copy_prop); @@ -2635,8 +3150,6 @@ Converter::run() } while (progress); NIR_PASS_V(nir, nir_lower_bool_to_int32); - NIR_PASS_V(nir, nir_lower_locals_to_regs); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp); NIR_PASS_V(nir, nir_convert_from_ssa, true); // Garbage collect dead instructions @@ -2668,17 +3181,140 @@ Converter::run() namespace nv50_ir { bool -Program::makeFromNIR(struct nv50_ir_prog_info *info) +Program::makeFromNIR(struct nv50_ir_prog_info *info, + struct nv50_ir_prog_info_out *info_out) { nir_shader *nir = (nir_shader*)info->bin.source; - Converter converter(this, nir, info); + Converter converter(this, nir, info, info_out); bool result = converter.run(); if (!result) return result; LoweringHelper lowering; lowering.run(this); - tlsSize = info->bin.tlsSpace; + tlsSize = info_out->bin.tlsSpace; return result; } } // namespace nv50_ir + +static nir_shader_compiler_options +nvir_nir_shader_compiler_options(int chipset) +{ + nir_shader_compiler_options op = {}; + op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET); + op.lower_ffma = false; + op.fuse_ffma = false; /* nir doesn't track mad vs fma */ + op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET); + op.lower_flrp32 = true; + op.lower_flrp64 = true; + op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it + op.lower_fsat = false; + op.lower_fsqrt = false; // TODO: only before gm200 + op.lower_sincos = false; + op.lower_fmod = true; + op.lower_bitfield_extract = false; + op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET); + op.lower_bitfield_insert = false; + op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET); + op.lower_bitfield_insert_to_bitfield_select = false; + op.lower_bitfield_reverse = false; + op.lower_bit_count = false; + op.lower_ifind_msb = false; + op.lower_find_lsb = false; + op.lower_uadd_carry = true; // TODO + op.lower_usub_borrow = true; // TODO + op.lower_mul_high = false; + op.lower_negate = false; + op.lower_sub = true; + op.lower_scmp = true; // TODO: not implemented yet + op.lower_vector_cmp = false; + op.lower_idiv = true; + op.lower_bitops = false; + op.lower_isign = (chipset >= NVISA_GV100_CHIPSET); + op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET); + op.lower_fdph = false; + op.lower_fdot = false; + op.fdot_replicates = false; // TODO + op.lower_ffloor = false; // TODO + op.lower_ffract = true; + op.lower_fceil = false; // TODO + op.lower_ftrunc = false; + op.lower_ldexp = true; + op.lower_pack_half_2x16 = true; + op.lower_pack_unorm_2x16 = true; + op.lower_pack_snorm_2x16 = true; + op.lower_pack_unorm_4x8 = true; + op.lower_pack_snorm_4x8 = true; + op.lower_unpack_half_2x16 = true; + op.lower_unpack_unorm_2x16 = true; + op.lower_unpack_snorm_2x16 = true; + op.lower_unpack_unorm_4x8 = true; + op.lower_unpack_snorm_4x8 = true; + op.lower_pack_split = false; + op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET); + op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET); + op.lower_all_io_to_temps = false; + op.lower_all_io_to_elements = false; + op.vertex_id_zero_based = false; + op.lower_base_vertex = false; + op.lower_helper_invocation = false; + op.optimize_sample_mask_in = false; + op.lower_cs_local_index_from_id = true; + op.lower_cs_local_id_from_index = false; + op.lower_device_index_to_zero = false; // TODO + op.lower_wpos_pntc = false; // TODO + op.lower_hadd = true; // TODO + 
+static nir_shader_compiler_options
+nvir_nir_shader_compiler_options(int chipset)
+{
+   nir_shader_compiler_options op = {};
+   op.lower_fdiv = (chipset >= NVISA_GV100_CHIPSET);
+   op.lower_ffma = false;
+   op.fuse_ffma = false; /* nir doesn't track mad vs fma */
+   op.lower_flrp16 = (chipset >= NVISA_GV100_CHIPSET);
+   op.lower_flrp32 = true;
+   op.lower_flrp64 = true;
+   op.lower_fpow = false; // TODO: nir's lowering is broken, or we could use it
+   op.lower_fsat = false;
+   op.lower_fsqrt = false; // TODO: only before gm200
+   op.lower_sincos = false;
+   op.lower_fmod = true;
+   op.lower_bitfield_extract = false;
+   op.lower_bitfield_extract_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
+   op.lower_bitfield_insert = false;
+   op.lower_bitfield_insert_to_shifts = (chipset >= NVISA_GV100_CHIPSET);
+   op.lower_bitfield_insert_to_bitfield_select = false;
+   op.lower_bitfield_reverse = false;
+   op.lower_bit_count = false;
+   op.lower_ifind_msb = false;
+   op.lower_find_lsb = false;
+   op.lower_uadd_carry = true; // TODO
+   op.lower_usub_borrow = true; // TODO
+   op.lower_mul_high = false;
+   op.lower_negate = false;
+   op.lower_sub = true;
+   op.lower_scmp = true; // TODO: not implemented yet
+   op.lower_vector_cmp = false;
+   op.lower_idiv = true;
+   op.lower_bitops = false;
+   op.lower_isign = (chipset >= NVISA_GV100_CHIPSET);
+   op.lower_fsign = (chipset >= NVISA_GV100_CHIPSET);
+   op.lower_fdph = false;
+   op.lower_fdot = false;
+   op.fdot_replicates = false; // TODO
+   op.lower_ffloor = false; // TODO
+   op.lower_ffract = true;
+   op.lower_fceil = false; // TODO
+   op.lower_ftrunc = false;
+   op.lower_ldexp = true;
+   op.lower_pack_half_2x16 = true;
+   op.lower_pack_unorm_2x16 = true;
+   op.lower_pack_snorm_2x16 = true;
+   op.lower_pack_unorm_4x8 = true;
+   op.lower_pack_snorm_4x8 = true;
+   op.lower_unpack_half_2x16 = true;
+   op.lower_unpack_unorm_2x16 = true;
+   op.lower_unpack_snorm_2x16 = true;
+   op.lower_unpack_unorm_4x8 = true;
+   op.lower_unpack_snorm_4x8 = true;
+   op.lower_pack_split = false;
+   op.lower_extract_byte = (chipset < NVISA_GM107_CHIPSET);
+   op.lower_extract_word = (chipset < NVISA_GM107_CHIPSET);
+   op.lower_all_io_to_temps = false;
+   op.lower_all_io_to_elements = false;
+   op.vertex_id_zero_based = false;
+   op.lower_base_vertex = false;
+   op.lower_helper_invocation = false;
+   op.optimize_sample_mask_in = false;
+   op.lower_cs_local_index_from_id = true;
+   op.lower_cs_local_id_from_index = false;
+   op.lower_device_index_to_zero = false; // TODO
+   op.lower_wpos_pntc = false; // TODO
+   op.lower_hadd = true; // TODO
+   op.lower_add_sat = true; // TODO
+   op.vectorize_io = false;
+   op.lower_to_scalar = false;
+   op.unify_interfaces = false;
+   op.use_interpolated_input_intrinsics = true;
+   op.lower_mul_2x32_64 = true; // TODO
+   op.lower_rotate = (chipset < NVISA_GV100_CHIPSET);
+   op.has_imul24 = false;
+   op.intel_vec4 = false;
+   op.max_unroll_iterations = 32;
+   op.lower_int64_options = (nir_lower_int64_options) (
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_isign64 : 0) |
+      nir_lower_divmod64 |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_high64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_mov64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_icmp64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_iabs64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ineg64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_logic64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_minmax64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_shift64 : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_imul_2x32_64 : 0) |
+      ((chipset >= NVISA_GM107_CHIPSET) ? nir_lower_extract64 : 0) |
+      nir_lower_ufind_msb64
+   );
+   op.lower_doubles_options = (nir_lower_doubles_options) (
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drcp : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsqrt : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_drsq : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dfract : 0) |
+      nir_lower_dmod |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_dsub : 0) |
+      ((chipset >= NVISA_GV100_CHIPSET) ? nir_lower_ddiv : 0)
+   );
+   return op;
+}
+
+static const nir_shader_compiler_options gf100_nir_shader_compiler_options =
+nvir_nir_shader_compiler_options(NVISA_GF100_CHIPSET);
+static const nir_shader_compiler_options gm107_nir_shader_compiler_options =
+nvir_nir_shader_compiler_options(NVISA_GM107_CHIPSET);
+static const nir_shader_compiler_options gv100_nir_shader_compiler_options =
+nvir_nir_shader_compiler_options(NVISA_GV100_CHIPSET);
+
+const nir_shader_compiler_options *
+nv50_ir_nir_shader_compiler_options(int chipset)
+{
+   if (chipset >= NVISA_GV100_CHIPSET)
+      return &gv100_nir_shader_compiler_options;
+   if (chipset >= NVISA_GM107_CHIPSET)
+      return &gm107_nir_shader_compiler_options;
+   return &gf100_nir_shader_compiler_options;
+}
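As a worked reading of the int64 gating above: on a pre-Volta part every GV100-conditional term evaluates to 0, so only the divmod, extract (GM107 and newer) and ufind_msb lowerings remain. A hedged sanity check, usable only inside this translation unit since the builder is static:

static void check_gm107_int64_lowering()
{
   nir_shader_compiler_options op =
      nvir_nir_shader_compiler_options(NVISA_GM107_CHIPSET);
   // Every (chipset >= NVISA_GV100_CHIPSET) term is 0 for GM107.
   assert(op.lower_int64_options == (nir_lower_divmod64 |
                                     nir_lower_extract64 |
                                     nir_lower_ufind_msb64));
}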