X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fr600%2Fr600_shader.c;h=7d1452add343bb88d473c9ebe0b5ba7c891eaa54;hb=76b02d2fe1df5351f67f53d07b37952043f0a84c;hp=af7622e9b3417249c2cf16d7ebb757216a92050c;hpb=a62edcce4eb4c800d972817a20ee874bf2a2c3ef;p=mesa.git diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index af7622e9b34..7d1452add34 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -21,7 +21,6 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "r600_sq.h" -#include "r600_llvm.h" #include "r600_formats.h" #include "r600_opcodes.h" #include "r600_shader.h" @@ -34,6 +33,7 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_scan.h" #include "tgsi/tgsi_dump.h" +#include "util/u_bitcast.h" #include "util/u_memory.h" #include "util/u_math.h" #include @@ -47,7 +47,7 @@ MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT These 8xx t-slot only opcodes become vector ops, with all four slots expecting the arguments on sources a and b. Result is broadcast to all channels. -MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT +MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64 These 8xx t-slot only opcodes become vector ops in the z, y, and x slots. EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64 @@ -60,10 +60,32 @@ issued in the w slot as well. The compiler must issue the source argument to slots z, y, and x */ +/* Contents of r0 on entry to various shaders + + VS - .x = VertexID + .y = RelVertexID (??) + .w = InstanceID + + GS - r0.xyw, r1.xyz = per-vertex offsets + r0.z = PrimitiveID + + TCS - .x = PatchID + .y = RelPatchID (??) + .z = InvocationID + .w = tess factor base. + + TES - .x = TessCoord.x + - .y = TessCoord.y + - .z = RelPatchID (??) 
+ - .w = PrimitiveID + + PS - face_gpr.z = SampleMask + face_gpr.w = SampleID +*/ +#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16) static int r600_shader_from_tgsi(struct r600_context *rctx, struct r600_pipe_shader *pipeshader, - struct r600_shader_key key); - + union r600_shader_key key); static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr, int size, unsigned comp_mask) { @@ -93,8 +115,10 @@ static void r600_dump_streamout(struct pipe_stream_output_info *so) for (i = 0; i < so->num_outputs; i++) { unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; - fprintf(stderr, " %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n", - i, so->output[i].output_buffer, + fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n", + i, + so->output[i].stream, + so->output[i].output_buffer, so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, so->output[i].register_index, mask & 1 ? 
"x" : "", @@ -113,7 +137,7 @@ static int store_shader(struct pipe_context *ctx, if (shader->bo == NULL) { shader->bo = (struct r600_resource*) - pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); + pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4); if (shader->bo == NULL) { return -ENOMEM; } @@ -125,7 +149,7 @@ static int store_shader(struct pipe_context *ctx, } else { memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr)); } - rctx->b.ws->buffer_unmap(shader->bo->cs_buf); + rctx->b.ws->buffer_unmap(shader->bo->buf); } return 0; @@ -133,15 +157,16 @@ static int store_shader(struct pipe_context *ctx, int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader, - struct r600_shader_key key) + union r600_shader_key key) { struct r600_context *rctx = (struct r600_context *)ctx; struct r600_pipe_shader_selector *sel = shader->selector; int r; - bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens); + bool dump = r600_can_dump_shader(&rctx->screen->b, + tgsi_get_processor_type(sel->tokens)); unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB); unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM); - unsigned export_shader = key.vs_as_es; + unsigned export_shader; shader->shader.bc.isa = rctx->isa; @@ -158,18 +183,18 @@ int r600_pipe_shader_create(struct pipe_context *ctx, R600_ERR("translation from TGSI failed !\n"); goto error; } + if (shader->shader.processor_type == PIPE_SHADER_VERTEX) { + /* only disable for vertex shaders in tess paths */ + if (key.vs.as_ls) + use_sb = 0; + } + use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL); + use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL); - /* disable SB for geom shaders on R6xx/R7xx due to some mysterious gs piglit regressions with it enabled. 
*/ - if (rctx->b.chip_class <= R700) { - use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY); - } - /* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */ - use_sb &= !shader->shader.uses_index_registers; + /* disable SB for shaders using doubles */ + use_sb &= !shader->shader.uses_doubles; - /* Check if the bytecode has already been built. When using the llvm - * backend, r600_shader_from_tgsi() will take care of building the - * bytecode. - */ + /* Check if the bytecode has already been built. */ if (!shader->shader.bc.bytecode) { r = r600_bytecode_build(&shader->shader.bc); if (r) { @@ -210,7 +235,16 @@ int r600_pipe_shader_create(struct pipe_context *ctx, /* Build state. */ switch (shader->shader.processor_type) { - case TGSI_PROCESSOR_GEOMETRY: + case PIPE_SHADER_TESS_CTRL: + evergreen_update_hs_state(ctx, shader); + break; + case PIPE_SHADER_TESS_EVAL: + if (key.tes.as_es) + evergreen_update_es_state(ctx, shader); + else + evergreen_update_vs_state(ctx, shader); + break; + case PIPE_SHADER_GEOMETRY: if (rctx->b.chip_class >= EVERGREEN) { evergreen_update_gs_state(ctx, shader); evergreen_update_vs_state(ctx, shader->gs_copy_shader); @@ -219,9 +253,12 @@ int r600_pipe_shader_create(struct pipe_context *ctx, r600_update_vs_state(ctx, shader->gs_copy_shader); } break; - case TGSI_PROCESSOR_VERTEX: + case PIPE_SHADER_VERTEX: + export_shader = key.vs.as_es; if (rctx->b.chip_class >= EVERGREEN) { - if (export_shader) + if (key.vs.as_ls) + evergreen_update_ls_state(ctx, shader); + else if (key.vs.as_es) evergreen_update_es_state(ctx, shader); else evergreen_update_vs_state(ctx, shader); @@ -232,7 +269,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx, r600_update_vs_state(ctx, shader); } break; - case TGSI_PROCESSOR_FRAGMENT: + case PIPE_SHADER_FRAGMENT: if (rctx->b.chip_class >= EVERGREEN) { evergreen_update_ps_state(ctx, shader); } else { @@ -252,7 +289,7 @@ error: void 
r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader) { - pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL); + r600_resource_reference(&shader->bo, NULL); r600_bytecode_clear(&shader->shader.bc); r600_release_command_buffer(&shader->command_buffer); } @@ -292,7 +329,6 @@ struct r600_shader_ctx { uint32_t *literals; uint32_t nliterals; uint32_t max_driver_temp_used; - boolean use_llvm; /* needed for evergreen interpolation */ struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid /* evergreen/cayman also store sample mask in face register */ @@ -309,7 +345,11 @@ struct r600_shader_ctx { int gs_out_ring_offset; int gs_next_vertex; struct r600_shader *gs_for_vs; - int gs_export_gpr_treg; + int gs_export_gpr_tregs[4]; + const struct pipe_stream_output_info *gs_stream_output_info; + unsigned enabled_stream_buffers_mask; + unsigned tess_input_info; /* temp with tess input offsets */ + unsigned tess_output_info; /* temp with tess input offsets */ }; struct r600_shader_tgsi_instruction { @@ -317,7 +357,7 @@ struct r600_shader_tgsi_instruction { int (*process)(struct r600_shader_ctx *ctx); }; -static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind); +static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind); static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[]; static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx); static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason); @@ -333,13 +373,27 @@ static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, const struct r600_shader_src *shader_src, unsigned chan); +static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, + unsigned dst_reg); + +static int 
tgsi_last_instruction(unsigned writemask) +{ + int i, lasti = 0; + + for (i = 0; i < 4; i++) { + if (writemask & (1 << i)) { + lasti = i; + } + } + return lasti; +} static int tgsi_is_supported(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction; - int j; + unsigned j; - if (i->Instruction.NumDstRegs > 1) { + if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) { R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs); return -EINVAL; } @@ -359,10 +413,16 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx) case TGSI_FILE_CONSTANT: break; case TGSI_FILE_INPUT: - if (ctx->type == TGSI_PROCESSOR_GEOMETRY) + if (ctx->type == PIPE_SHADER_GEOMETRY || + ctx->type == PIPE_SHADER_TESS_CTRL || + ctx->type == PIPE_SHADER_TESS_EVAL) + break; + case TGSI_FILE_OUTPUT: + if (ctx->type == PIPE_SHADER_TESS_CTRL) break; default: - R600_ERR("unsupported src %d (dimension %d)\n", j, + R600_ERR("unsupported src %d (file %d, dimension %d)\n", j, + i->Src[j].Register.File, i->Src[j].Register.Dimension); return -EINVAL; } @@ -370,6 +430,8 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx) } for (j = 0; j < i->Instruction.NumDstRegs; j++) { if (i->Dst[j].Register.Dimension) { + if (ctx->type == PIPE_SHADER_TESS_CTRL) + continue; R600_ERR("unsupported dst (dimension)\n"); return -EINVAL; } @@ -548,6 +610,44 @@ static int r600_spi_sid(struct r600_shader_io * io) return index; }; +/* we need this to get a common lds index for vs/tcs/tes input/outputs */ +int r600_get_lds_unique_index(unsigned semantic_name, unsigned index) +{ + switch (semantic_name) { + case TGSI_SEMANTIC_POSITION: + return 0; + case TGSI_SEMANTIC_PSIZE: + return 1; + case TGSI_SEMANTIC_CLIPDIST: + assert(index <= 1); + return 2 + index; + case TGSI_SEMANTIC_GENERIC: + if (index <= 63-4) + return 4 + index - 9; + else + /* same explanation as in the default statement, + * the only user hitting this is st/nine. 
+ */ + return 0; + + /* patch indices are completely separate and thus start from 0 */ + case TGSI_SEMANTIC_TESSOUTER: + return 0; + case TGSI_SEMANTIC_TESSINNER: + return 1; + case TGSI_SEMANTIC_PATCH: + return 2 + index; + + default: + /* Don't fail here. The result of this function is only used + * for LS, TCS, TES, and GS, where legacy GL semantics can't + * occur, but this function is called for all vertex shaders + * before it's known whether LS will be compiled or not. + */ + return 0; + } +} + /* turn input into interpolate on EG */ static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index) { @@ -557,11 +657,9 @@ static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index) ctx->shader->input[index].lds_pos = ctx->shader->nlds++; if (ctx->shader->input[index].interpolate > 0) { evergreen_interp_assign_ij_index(ctx, index); - if (!ctx->use_llvm) - r = evergreen_interp_alu(ctx, index); + r = evergreen_interp_alu(ctx, index); } else { - if (!ctx->use_llvm) - r = evergreen_interp_flat(ctx, index); + r = evergreen_interp_flat(ctx, index); } } return r; @@ -596,6 +694,133 @@ static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back return 0; } +/* execute a single slot ALU calculation */ +static int single_alu_op2(struct r600_shader_ctx *ctx, int op, + int dst_sel, int dst_chan, + int src0_sel, unsigned src0_chan_val, + int src1_sel, unsigned src1_chan_val) +{ + struct r600_bytecode_alu alu; + int r, i; + + if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) { + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.src[0].sel = src0_sel; + if (src0_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[0].value = src0_chan_val; + else + alu.src[0].chan = src0_chan_val; + alu.src[1].sel = src1_sel; + if (src1_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[1].value = src1_chan_val; + else + alu.src[1].chan = src1_chan_val; + alu.dst.sel = dst_sel; + alu.dst.chan = i; + alu.dst.write = 
i == dst_chan; + alu.last = (i == 3); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; + } + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.src[0].sel = src0_sel; + if (src0_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[0].value = src0_chan_val; + else + alu.src[0].chan = src0_chan_val; + alu.src[1].sel = src1_sel; + if (src1_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[1].value = src1_chan_val; + else + alu.src[1].chan = src1_chan_val; + alu.dst.sel = dst_sel; + alu.dst.chan = dst_chan; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +/* execute a single slot ALU calculation */ +static int single_alu_op3(struct r600_shader_ctx *ctx, int op, + int dst_sel, int dst_chan, + int src0_sel, unsigned src0_chan_val, + int src1_sel, unsigned src1_chan_val, + int src2_sel, unsigned src2_chan_val) +{ + struct r600_bytecode_alu alu; + int r; + + /* validate this for other ops */ + assert(op == ALU_OP3_MULADD_UINT24); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = op; + alu.src[0].sel = src0_sel; + if (src0_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[0].value = src0_chan_val; + else + alu.src[0].chan = src0_chan_val; + alu.src[1].sel = src1_sel; + if (src1_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[1].value = src1_chan_val; + else + alu.src[1].chan = src1_chan_val; + alu.src[2].sel = src2_sel; + if (src2_sel == V_SQ_ALU_SRC_LITERAL) + alu.src[2].value = src2_chan_val; + else + alu.src[2].chan = src2_chan_val; + alu.dst.sel = dst_sel; + alu.dst.chan = dst_chan; + alu.is_op3 = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +/* put it in temp_reg.x */ +static int get_lds_offset0(struct r600_shader_ctx *ctx, + int rel_patch_chan, + int temp_reg, bool is_patch_var) +{ + int r; + + /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */ + /* ADD + Dimension - patch0_offset (input_vals.z), + 
Non-dim - patch0_data_offset (input_vals.w) + */ + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + temp_reg, 0, + ctx->tess_output_info, 0, + 0, rel_patch_chan, + ctx->tess_output_info, is_patch_var ? 3 : 2); + if (r) + return r; + return 0; +} + +static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index) +{ + return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg; +} + +static int r600_get_temp(struct r600_shader_ctx *ctx) +{ + return ctx->temp_reg + ctx->max_driver_temp_used++; +} + static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid) { int i; @@ -610,6 +835,21 @@ static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid) return 0; } +static int tgsi_barrier(struct r600_shader_ctx *ctx) +{ + struct r600_bytecode_alu alu; + int r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ctx->inst_info->op; + alu.last = 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + static int tgsi_declaration(struct r600_shader_ctx *ctx) { struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration; @@ -619,13 +859,13 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) case TGSI_FILE_INPUT: for (j = 0; j < count; j++) { i = ctx->shader->ninput + j; - assert(i < Elements(ctx->shader->input)); + assert(i < ARRAY_SIZE(ctx->shader->input)); ctx->shader->input[i].name = d->Semantic.Name; ctx->shader->input[i].sid = d->Semantic.Index + j; ctx->shader->input[i].interpolate = d->Interp.Interpolate; ctx->shader->input[i].interpolate_location = d->Interp.Location; ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j; - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + if (ctx->type == PIPE_SHADER_FRAGMENT) { ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]); switch (ctx->shader->input[i].name) { case TGSI_SEMANTIC_FACE: @@ -650,7 +890,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) if ((r 
= evergreen_interp_input(ctx, i))) return r; } - } else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) { + } else if (ctx->type == PIPE_SHADER_GEOMETRY) { /* FIXME probably skip inputs if they aren't passed in the ring */ ctx->shader->input[i].ring_offset = ctx->next_ring_offset; ctx->next_ring_offset += 16; @@ -663,14 +903,15 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) case TGSI_FILE_OUTPUT: for (j = 0; j < count; j++) { i = ctx->shader->noutput + j; - assert(i < Elements(ctx->shader->output)); + assert(i < ARRAY_SIZE(ctx->shader->output)); ctx->shader->output[i].name = d->Semantic.Name; ctx->shader->output[i].sid = d->Semantic.Index + j; ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j; ctx->shader->output[i].interpolate = d->Interp.Interpolate; ctx->shader->output[i].write_mask = d->Declaration.UsageMask; - if (ctx->type == TGSI_PROCESSOR_VERTEX || - ctx->type == TGSI_PROCESSOR_GEOMETRY) { + if (ctx->type == PIPE_SHADER_VERTEX || + ctx->type == PIPE_SHADER_GEOMETRY || + ctx->type == PIPE_SHADER_TESS_EVAL) { ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]); switch (d->Semantic.Name) { case TGSI_SEMANTIC_CLIPDIST: @@ -699,10 +940,10 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) ctx->cv_output = i; break; } - if (ctx->type == TGSI_PROCESSOR_GEOMETRY) { + if (ctx->type == PIPE_SHADER_GEOMETRY) { ctx->gs_out_ring_offset += 16; } - } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { + } else if (ctx->type == PIPE_SHADER_FRAGMENT) { switch (d->Semantic.Name) { case TGSI_SEMANTIC_COLOR: ctx->shader->nr_ps_max_color_exports++; @@ -756,6 +997,73 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) break; else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID) break; + else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER || + d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) { + int param = r600_get_lds_unique_index(d->Semantic.Name, 0); + int dreg = d->Semantic.Name == 
TGSI_SEMANTIC_TESSINNER ? 3 : 2; + unsigned temp_reg = r600_get_temp(ctx); + + r = get_lds_offset0(ctx, 2, temp_reg, true); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + temp_reg, 0, + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, param * 16); + if (r) + return r; + + do_lds_fetch_values(ctx, temp_reg, dreg); + } + else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) { + /* MOV r1.x, r0.x; + MOV r1.y, r0.y; + */ + for (i = 0; i < 2; i++) { + struct r600_bytecode_alu alu; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = 0; + alu.src[0].chan = 0 + i; + alu.dst.sel = 1; + alu.dst.chan = 0 + i; + alu.dst.write = 1; + alu.last = (i == 1) ? 1 : 0; + if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) + return r; + } + /* ADD r1.z, 1.0f, -r0.x */ + struct r600_bytecode_alu alu; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD; + alu.src[0].sel = V_SQ_ALU_SRC_1; + alu.src[1].sel = 1; + alu.src[1].chan = 0; + alu.src[1].neg = 1; + alu.dst.sel = 1; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) + return r; + + /* ADD r1.z, r1.z, -r1.y */ + alu.op = ALU_OP2_ADD; + alu.src[0].sel = 1; + alu.src[0].chan = 2; + alu.src[1].sel = 1; + alu.src[1].chan = 1; + alu.src[1].neg = 1; + alu.dst.sel = 1; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx->bc, &alu))) + return r; + break; + } + break; default: R600_ERR("unsupported file %d declaration\n", d->Declaration.File); return -EINVAL; @@ -763,11 +1071,6 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) return 0; } -static int r600_get_temp(struct r600_shader_ctx *ctx) -{ - return ctx->temp_reg + ctx->max_driver_temp_used++; -} - static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset) { struct tgsi_parse_context parse; @@ -815,7 +1118,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int 
gpr_off } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) { struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration; if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) { - for (k = 0; k < Elements(inputs); k++) { + for (k = 0; k < ARRAY_SIZE(inputs); k++) { if (d->Semantic.Name == inputs[k].name || d->Semantic.Name == inputs[k].alternate_name) { inputs[k].enabled = true; @@ -827,7 +1130,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off tgsi_parse_free(&parse); - for (i = 0; i < Elements(inputs); i++) { + for (i = 0; i < ARRAY_SIZE(inputs); i++) { boolean enabled = inputs[i].enabled; int *reg = inputs[i].reg; unsigned name = inputs[i].name; @@ -858,7 +1161,7 @@ static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_off */ static int evergreen_gpr_count(struct r600_shader_ctx *ctx) { - int i; + unsigned i; int num_baryc; struct tgsi_parse_context parse; @@ -915,7 +1218,7 @@ static int evergreen_gpr_count(struct r600_shader_ctx *ctx) /* assign gpr to each interpolator according to priority */ num_baryc = 0; - for (i = 0; i < Elements(ctx->eg_interpolators); i++) { + for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) { if (ctx->eg_interpolators[i].enabled) { ctx->eg_interpolators[i].ij_index = num_baryc; num_baryc ++; @@ -940,7 +1243,7 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_ memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); vtx.op = FETCH_OP_VFETCH; - vtx.buffer_id = R600_SAMPLE_POSITIONS_CONST_BUFFER; + vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER; vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; if (sample_id == NULL) { vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w; @@ -1002,7 +1305,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx, (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) { index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX; - 
r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg); + r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs); if (r600_src->sel != V_SQ_ALU_SRC_LITERAL) return; } @@ -1040,12 +1343,50 @@ static void tgsi_src(struct r600_shader_ctx *ctx, r600_src->swizzle[2] = 0; r600_src->swizzle[3] = 0; r600_src->sel = 0; - } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) { + } else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) { r600_src->swizzle[0] = 3; r600_src->swizzle[1] = 3; r600_src->swizzle[2] = 3; r600_src->swizzle[3] = 3; r600_src->sel = 1; + } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) { + r600_src->swizzle[0] = 2; + r600_src->swizzle[1] = 2; + r600_src->swizzle[2] = 2; + r600_src->swizzle[3] = 2; + r600_src->sel = 0; + } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) { + r600_src->sel = 1; + } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) { + r600_src->sel = 3; + } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) { + r600_src->sel = 2; + } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) { + if (ctx->type == PIPE_SHADER_TESS_CTRL) { + r600_src->sel = ctx->tess_input_info; + r600_src->swizzle[0] = 2; + r600_src->swizzle[1] = 2; + r600_src->swizzle[2] = 2; + r600_src->swizzle[3] = 2; + } else { + r600_src->sel = ctx->tess_input_info; + r600_src->swizzle[0] = 3; + r600_src->swizzle[1] = 3; + r600_src->swizzle[2] = 3; + r600_src->swizzle[3] = 3; + } + } else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) 
{ + r600_src->sel = 0; + r600_src->swizzle[0] = 0; + r600_src->swizzle[1] = 0; + r600_src->swizzle[2] = 0; + r600_src->swizzle[3] = 0; + } else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) { + r600_src->sel = 0; + r600_src->swizzle[0] = 3; + r600_src->swizzle[1] = 3; + r600_src->swizzle[2] = 3; + r600_src->swizzle[3] = 3; } } else { if (tgsi_src->Register.Indirect) @@ -1127,6 +1468,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi unsigned vtx_id = src->Dimension.Index; int offset_reg = vtx_id / 3; int offset_chan = vtx_id % 3; + int t2 = 0; /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y, * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */ @@ -1134,13 +1476,24 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi if (offset_reg == 0 && offset_chan == 2) offset_chan = 3; + if (src->Dimension.Indirect || src->Register.Indirect) + t2 = r600_get_temp(ctx); + if (src->Dimension.Indirect) { int treg[3]; - int t2; struct r600_bytecode_alu alu; int r, i; - - /* you have got to be shitting me - + unsigned addr_reg; + addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index); + if (src->DimIndirect.Index > 0) { + r = single_alu_op2(ctx, ALU_OP1_MOV, + ctx->bc->ar_reg, 0, + addr_reg, 0, + 0, 0); + if (r) + return r; + } + /* we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt. at least this is what fglrx seems to do. 
*/ for (i = 0; i < 3; i++) { @@ -1148,7 +1501,6 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi } r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F); - t2 = r600_get_temp(ctx); for (i = 0; i < 3; i++) { memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; @@ -1173,8 +1525,33 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi if (r) return r; offset_reg = t2; + offset_chan = 0; } + if (src->Register.Indirect) { + int addr_reg; + unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID]; + + addr_reg = get_address_file_reg(ctx, src->Indirect.Index); + + /* pull the value from index_reg */ + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + t2, 1, + addr_reg, 0, + V_SQ_ALU_SRC_LITERAL, first); + if (r) + return r; + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + t2, 0, + t2, 1, + V_SQ_ALU_SRC_LITERAL, 4, + offset_reg, offset_chan); + if (r) + return r; + offset_reg = t2; + offset_chan = 0; + index = src->Register.Index - first; + } memset(&vtx, 0, sizeof(vtx)); vtx.buffer_id = R600_GS_RING_CONST_BUFFER; @@ -1203,7 +1580,7 @@ static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_regi static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; - int i; + unsigned i; for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { struct tgsi_full_src_register *src = &inst->Src[i]; @@ -1220,64 +1597,335 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx) fetch_gs_input(ctx, src, treg); ctx->src[i].sel = treg; + ctx->src[i].rel = 0; } } return 0; } -static int tgsi_split_constant(struct r600_shader_ctx *ctx) -{ - struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; - struct r600_bytecode_alu alu; - int i, j, k, nconst, r; - for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) { - if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) { - nconst++; 
+/* Tessellation shaders pass outputs to the next shader using LDS. + * + * LS outputs = TCS(HS) inputs + * TCS(HS) outputs = TES(DS) inputs + * + * The LDS layout is: + * - TCS inputs for patch 0 + * - TCS inputs for patch 1 + * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2) + * - ... + * - TCS outputs for patch 0 = get_tcs_out_patch0_offset + * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset + * - TCS outputs for patch 1 + * - Per-patch TCS outputs for patch 1 + * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2) + * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2) + * - ... + * + * All three shaders VS(LS), TCS, TES share the same LDS space. + */ +/* this will return with the dw address in temp_reg.x */ +static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg, + const struct tgsi_full_dst_register *dst, + const struct tgsi_full_src_register *src, + int stride_bytes_reg, int stride_bytes_chan) +{ + struct tgsi_full_dst_register reg; + ubyte *name, *index, *array_first; + int r; + int param; + struct tgsi_shader_info *info = &ctx->info; + /* Set the register description. The address computation is the same + * for sources and destinations. */ + if (src) { + reg.Register.File = src->Register.File; + reg.Register.Index = src->Register.Index; + reg.Register.Indirect = src->Register.Indirect; + reg.Register.Dimension = src->Register.Dimension; + reg.Indirect = src->Indirect; + reg.Dimension = src->Dimension; + reg.DimIndirect = src->DimIndirect; + } else + reg = *dst; + + /* If the register is 2-dimensional (e.g. an array of vertices + * in a primitive), calculate the base address of the vertex. 
*/ + if (reg.Register.Dimension) { + int sel, chan; + if (reg.Dimension.Indirect) { + unsigned addr_reg; + assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS); + + addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index); + /* pull the value from index_reg */ + sel = addr_reg; + chan = 0; + } else { + sel = V_SQ_ALU_SRC_LITERAL; + chan = reg.Dimension.Index; } - tgsi_src(ctx, &inst->Src[i], &ctx->src[i]); + + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + temp_reg, 0, + stride_bytes_reg, stride_bytes_chan, + sel, chan, + temp_reg, 0); + if (r) + return r; } - for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) { - if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) { - continue; - } - if (ctx->src[i].kc_rel) - ctx->shader->uses_index_registers = true; + if (reg.Register.File == TGSI_FILE_INPUT) { + name = info->input_semantic_name; + index = info->input_semantic_index; + array_first = info->input_array_first; + } else if (reg.Register.File == TGSI_FILE_OUTPUT) { + name = info->output_semantic_name; + index = info->output_semantic_index; + array_first = info->output_array_first; + } else { + assert(0); + return -1; + } + if (reg.Register.Indirect) { + int addr_reg; + int first; + /* Add the relative address of the element. 
*/ + if (reg.Indirect.ArrayID) + first = array_first[reg.Indirect.ArrayID]; + else + first = reg.Register.Index; - if (ctx->src[i].rel) { - int chan = inst->Src[i].Indirect.Swizzle; - int treg = r600_get_temp(ctx); - if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg))) - return r; + addr_reg = get_address_file_reg(ctx, reg.Indirect.Index); - ctx->src[i].kc_bank = 0; - ctx->src[i].kc_rel = 0; - ctx->src[i].sel = treg; - ctx->src[i].rel = 0; - j--; - } else if (j > 0) { - int treg = r600_get_temp(ctx); - for (k = 0; k < 4; k++) { - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_MOV; - alu.src[0].sel = ctx->src[i].sel; - alu.src[0].chan = k; - alu.src[0].rel = ctx->src[i].rel; - alu.src[0].kc_bank = ctx->src[i].kc_bank; - alu.src[0].kc_rel = ctx->src[i].kc_rel; - alu.dst.sel = treg; - alu.dst.chan = k; - alu.dst.write = 1; - if (k == 3) - alu.last = 1; - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; - } - ctx->src[i].sel = treg; - ctx->src[i].rel =0; - j--; + /* pull the value from index_reg */ + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, 16, + addr_reg, 0, + temp_reg, 0); + if (r) + return r; + + param = r600_get_lds_unique_index(name[first], + index[first]); + + } else { + param = r600_get_lds_unique_index(name[reg.Register.Index], + index[reg.Register.Index]); + } + + /* add to base_addr - passed in temp_reg.x */ + if (param) { + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + temp_reg, 0, + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, param * 16); + if (r) + return r; + + } + return 0; +} + +static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, + unsigned dst_reg) +{ + struct r600_bytecode_alu alu; + int r, i; + + if ((ctx->bc->cf_last->ndw>>1) >= 0x60) + ctx->bc->force_add_cf = 1; + for (i = 1; i < 4; i++) { + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + temp_reg, i, + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, 4 * i); 
+ if (r) + return r; + } + for (i = 0; i < 4; i++) { + /* emit an LDS_READ_RET */ + memset(&alu, 0, sizeof(alu)); + alu.op = LDS_OP1_LDS_READ_RET; + alu.src[0].sel = temp_reg; + alu.src[0].chan = i; + alu.src[1].sel = V_SQ_ALU_SRC_0; + alu.src[2].sel = V_SQ_ALU_SRC_0; + alu.dst.chan = 0; + alu.is_lds_idx_op = true; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + for (i = 0; i < 4; i++) { + /* then read from LDS_OQ_A_POP */ + memset(&alu, 0, sizeof(alu)); + + alu.op = ALU_OP1_MOV; + alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP; + alu.src[0].chan = 0; + alu.dst.sel = dst_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; +} + +static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) +{ + int r; + unsigned temp_reg = r600_get_temp(ctx); + + r = get_lds_offset0(ctx, 2, temp_reg, + src->Register.Dimension ? 
false : true); + if (r) + return r; + + /* the base address is now in temp.x */ + r = r600_get_byte_address(ctx, temp_reg, + NULL, src, ctx->tess_output_info, 1); + if (r) + return r; + + r = do_lds_fetch_values(ctx, temp_reg, dst_reg); + if (r) + return r; + return 0; +} + +static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) +{ + int r; + unsigned temp_reg = r600_get_temp(ctx); + + /* t.x = ips * r0.y */ + r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, + temp_reg, 0, + ctx->tess_input_info, 0, + 0, 1); + + if (r) + return r; + + /* the base address is now in temp.x */ + r = r600_get_byte_address(ctx, temp_reg, + NULL, src, ctx->tess_input_info, 1); + if (r) + return r; + + r = do_lds_fetch_values(ctx, temp_reg, dst_reg); + if (r) + return r; + return 0; +} + +static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg) +{ + int r; + unsigned temp_reg = r600_get_temp(ctx); + + r = get_lds_offset0(ctx, 1, temp_reg, + src->Register.Dimension ? 
false : true); + if (r) + return r; + /* the base address is now in temp.x */ + r = r600_get_byte_address(ctx, temp_reg, + NULL, src, + ctx->tess_output_info, 1); + if (r) + return r; + + r = do_lds_fetch_values(ctx, temp_reg, dst_reg); + if (r) + return r; + return 0; +} + +static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + unsigned i; + + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + struct tgsi_full_src_register *src = &inst->Src[i]; + + if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) { + int treg = r600_get_temp(ctx); + fetch_tes_input(ctx, src, treg); + ctx->src[i].sel = treg; + ctx->src[i].rel = 0; + } + if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) { + int treg = r600_get_temp(ctx); + fetch_tcs_input(ctx, src, treg); + ctx->src[i].sel = treg; + ctx->src[i].rel = 0; + } + if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) { + int treg = r600_get_temp(ctx); + fetch_tcs_output(ctx, src, treg); + ctx->src[i].sel = treg; + ctx->src[i].rel = 0; + } + } + return 0; +} + +static int tgsi_split_constant(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int i, j, k, nconst, r; + + for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) { + if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) { + nconst++; + } + tgsi_src(ctx, &inst->Src[i], &ctx->src[i]); + } + for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) { + if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) { + continue; + } + + if (ctx->src[i].rel) { + int chan = inst->Src[i].Indirect.Swizzle; + int treg = r600_get_temp(ctx); + if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg))) + return r; + + ctx->src[i].kc_bank = 0; + 
ctx->src[i].kc_rel = 0; + ctx->src[i].sel = treg; + ctx->src[i].rel = 0; + j--; + } else if (j > 0) { + int treg = r600_get_temp(ctx); + for (k = 0; k < 4; k++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = ctx->src[i].sel; + alu.src[0].chan = k; + alu.src[0].rel = ctx->src[i].rel; + alu.src[0].kc_bank = ctx->src[i].kc_bank; + alu.src[0].kc_rel = ctx->src[i].kc_rel; + alu.dst.sel = treg; + alu.dst.chan = k; + alu.dst.write = 1; + if (k == 3) + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + ctx->src[i].sel = treg; + ctx->src[i].rel =0; + j--; } } return 0; @@ -1334,9 +1982,11 @@ static int process_twoside_color_inputs(struct r600_shader_ctx *ctx) return 0; } -static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so) +static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so, + int stream, unsigned *stream_item_size) { unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS]; + unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS]; int i, j, r; /* Sanity checking. */ @@ -1356,8 +2006,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output /* Initialize locations where the outputs are stored. */ for (i = 0; i < so->num_outputs; i++) { - so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr; + so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr; + start_comp[i] = so->output[i].start_component; /* Lower outputs with dst_offset < start_component. * * We can only output 4D vectors with a write mask, e.g. 
we can @@ -1383,7 +2034,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output if (r) return r; } - so->output[i].start_component = 0; + start_comp[i] = 0; so_gpr[i] = tmp; } } @@ -1392,16 +2043,22 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output for (i = 0; i < so->num_outputs; i++) { struct r600_bytecode_output output; + if (stream != -1 && stream != so->output[i].output_buffer) + continue; + memset(&output, 0, sizeof(struct r600_bytecode_output)); output.gpr = so_gpr[i]; - output.elem_size = so->output[i].num_components; - output.array_base = so->output[i].dst_offset - so->output[i].start_component; + output.elem_size = so->output[i].num_components - 1; + if (output.elem_size == 2) + output.elem_size = 3; // 3 not supported, write 4 with junk at end + output.array_base = so->output[i].dst_offset - start_comp[i]; output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; output.burst_count = 1; /* array_size is an upper limit for the burst_count * with MEM_STREAM instructions */ output.array_size = 0xFFF; - output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; + output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i]; + if (ctx->bc->chip_class >= EVERGREEN) { switch (so->output[i].output_buffer) { case 0: @@ -1417,6 +2074,9 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output output.op = CF_OP_MEM_STREAM0_BUF3; break; } + output.op += so->output[i].stream * 4; + assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3); + ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4; } else { switch (so->output[i].output_buffer) { case 0: @@ -1432,6 +2092,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output output.op = CF_OP_MEM_STREAM3; break; } + ctx->enabled_stream_buffers_mask |= 1 << 
so->output[i].output_buffer; } r = r600_bytecode_add_output(ctx->bc, &output); if (r) @@ -1484,8 +2145,9 @@ static int generate_gs_copy_shader(struct r600_context *rctx, struct r600_bytecode_output output; struct r600_bytecode_cf *cf_jump, *cf_pop, *last_exp_pos = NULL, *last_exp_param = NULL; - int i, next_clip_pos = 61, next_param = 0; - + int i, j, next_clip_pos = 61, next_param = 0; + int ring; + bool only_ring_0 = true; cshader = calloc(1, sizeof(struct r600_pipe_shader)); if (!cshader) return 0; @@ -1497,13 +2159,16 @@ static int generate_gs_copy_shader(struct r600_context *rctx, ctx.shader = &cshader->shader; ctx.bc = &ctx.shader->bc; - ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX; + ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX; r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family, rctx->screen->has_compressed_msaa_texturing); ctx.bc->isa = rctx->isa; + cf_jump = NULL; + memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes)); + /* R0.x = R0.x & 0x3fffffff */ memset(&alu, 0, sizeof(alu)); alu.op = ALU_OP2_AND_INT; @@ -1522,22 +2187,10 @@ static int generate_gs_copy_shader(struct r600_context *rctx, alu.last = 1; r600_bytecode_add_alu(ctx.bc, &alu); - /* PRED_SETE_INT __, R0.y, 0 */ - memset(&alu, 0, sizeof(alu)); - alu.op = ALU_OP2_PRED_SETE_INT; - alu.src[0].chan = 1; - alu.src[1].sel = V_SQ_ALU_SRC_0; - alu.execute_mask = 1; - alu.update_pred = 1; - alu.last = 1; - r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE); - - r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP); - cf_jump = ctx.bc->cf_last; - /* fetch vertex data from GSVS ring */ for (i = 0; i < ocnt; ++i) { struct r600_shader_io *out = &ctx.shader->output[i]; + out->gpr = i + 1; out->ring_offset = i * 16; @@ -1545,8 +2198,10 @@ static int generate_gs_copy_shader(struct r600_context *rctx, vtx.op = FETCH_OP_VFETCH; vtx.buffer_id = R600_GS_RING_CONST_BUFFER; vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; + vtx.mega_fetch_count = 16; vtx.offset = 
out->ring_offset; vtx.dst_gpr = out->gpr; + vtx.src_gpr = 0; vtx.dst_sel_x = 0; vtx.dst_sel_y = 1; vtx.dst_sel_z = 2; @@ -1559,18 +2214,80 @@ static int generate_gs_copy_shader(struct r600_context *rctx, r600_bytecode_add_vtx(ctx.bc, &vtx); } + ctx.temp_reg = i + 1; + for (ring = 3; ring >= 0; --ring) { + bool enabled = false; + for (i = 0; i < so->num_outputs; i++) { + if (so->output[i].stream == ring) { + enabled = true; + if (ring > 0) + only_ring_0 = false; + break; + } + } + if (ring != 0 && !enabled) { + cshader->shader.ring_item_sizes[ring] = 0; + continue; + } + + if (cf_jump) { + // Patch up jump label + r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP); + cf_pop = ctx.bc->cf_last; + + cf_jump->cf_addr = cf_pop->id + 2; + cf_jump->pop_count = 1; + cf_pop->cf_addr = cf_pop->id + 2; + cf_pop->pop_count = 1; + } + + /* PRED_SETE_INT __, R0.y, ring */ + memset(&alu, 0, sizeof(alu)); + alu.op = ALU_OP2_PRED_SETE_INT; + alu.src[0].chan = 1; + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = ring; + alu.execute_mask = 1; + alu.update_pred = 1; + alu.last = 1; + r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE); + + r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP); + cf_jump = ctx.bc->cf_last; + + if (enabled) + emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]); + cshader->shader.ring_item_sizes[ring] = ocnt * 16; + } + + /* bc adds nops - copy it */ + if (ctx.bc->chip_class == R600) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP0_NOP; + alu.last = 1; + r600_bytecode_add_alu(ctx.bc, &alu); - /* XXX handle clipvertex, streamout? */ - emit_streamout(&ctx, so); + r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); + } /* export vertex data */ /* XXX factor out common code with r600_shader_from_tgsi ? 
*/ for (i = 0; i < ocnt; ++i) { struct r600_shader_io *out = &ctx.shader->output[i]; - + bool instream0 = true; if (out->name == TGSI_SEMANTIC_CLIPVERTEX) continue; + for (j = 0; j < so->num_outputs; j++) { + if (so->output[j].register_index == i) { + if (so->output[j].stream == 0) + break; + if (so->output[j].stream > 0) + instream0 = false; + } + } + if (!instream0) + continue; memset(&output, 0, sizeof(output)); output.gpr = out->gpr; output.elem_size = 3; @@ -1716,17 +2433,40 @@ static int generate_gs_copy_shader(struct r600_context *rctx, } gs->gs_copy_shader = cshader; + cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; ctx.bc->nstack = 1; - cshader->shader.ring_item_size = ocnt * 16; return r600_bytecode_build(ctx.bc); } -static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind) +static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind) +{ + if (ind) { + struct r600_bytecode_alu alu; + int r; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_ADD_INT; + alu.src[0].sel = ctx->gs_export_gpr_tregs[idx]; + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = ctx->gs_out_ring_offset >> 4; + alu.dst.sel = ctx->gs_export_gpr_tregs[idx]; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; +} + +static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind) { struct r600_bytecode_output output; int i, k, ring_offset; + int effective_stream = stream == -1 ? 
0 : stream; + int idx = 0; for (i = 0; i < ctx->shader->noutput; i++) { if (ctx->gs_for_vs) { @@ -1743,15 +2483,18 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind) if (ring_offset == -1) continue; - } else - ring_offset = i * 16; + } else { + ring_offset = idx * 16; + idx++; + } + if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION) + continue; /* next_ring_offset after parsing input decls contains total size of * single vertex data, gs_next_vertex - current vertex index */ if (!ind) ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex; - /* get a temp and add the ring offset to the next vertex base in the shader */ memset(&output, 0, sizeof(struct r600_bytecode_output)); output.gpr = ctx->shader->output[i].gpr; output.elem_size = 3; @@ -1762,85 +2505,487 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind) output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND; else output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; - output.op = CF_OP_MEM_RING; + switch (stream) { + default: + case 0: + output.op = CF_OP_MEM_RING; break; + case 1: + output.op = CF_OP_MEM_RING1; break; + case 2: + output.op = CF_OP_MEM_RING2; break; + case 3: + output.op = CF_OP_MEM_RING3; break; + } if (ind) { output.array_base = ring_offset >> 2; /* in dwords */ output.array_size = 0xfff; - output.index_gpr = ctx->gs_export_gpr_treg; + output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream]; } else output.array_base = ring_offset >> 2; /* in dwords */ r600_bytecode_add_output(ctx->bc, &output); } - if (ind) { - struct r600_bytecode_alu alu; - int r; - - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP2_ADD_INT; - alu.src[0].sel = ctx->gs_export_gpr_treg; - alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; - alu.src[1].value = ctx->gs_out_ring_offset >> 4; - alu.dst.sel = ctx->gs_export_gpr_treg; - alu.dst.write = 1; - alu.last = 1; - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; - 
} ++ctx->gs_next_vertex; return 0; } -static int r600_shader_from_tgsi(struct r600_context *rctx, - struct r600_pipe_shader *pipeshader, - struct r600_shader_key key) -{ - struct r600_screen *rscreen = rctx->screen; - struct r600_shader *shader = &pipeshader->shader; - struct tgsi_token *tokens = pipeshader->selector->tokens; - struct pipe_stream_output_info so = pipeshader->selector->so; - struct tgsi_full_immediate *immediate; - struct tgsi_full_property *property; - struct r600_shader_ctx ctx; - struct r600_bytecode_output output[32]; - unsigned output_done, noutput; - unsigned opcode; - int i, j, k, r = 0; - int next_param_base = 0, next_clip_base; - int max_color_exports = MAX2(key.nr_cbufs, 1); - /* Declarations used by llvm code */ - bool use_llvm = false; - bool indirect_gprs; - bool ring_outputs = false; - bool pos_emitted = false; -#ifdef R600_USE_LLVM - use_llvm = rscreen->b.debug_flags & DBG_LLVM; -#endif +static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx) +{ + int r; + struct r600_bytecode_vtx vtx; + int temp_val = ctx->temp_reg; + /* need to store the TCS output somewhere */ + r = single_alu_op2(ctx, ALU_OP1_MOV, + temp_val, 0, + V_SQ_ALU_SRC_LITERAL, 0, + 0, 0); + if (r) + return r; + + /* used by VS/TCS */ + if (ctx->tess_input_info) { + /* fetch tcs input values into resv space */ + memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); + vtx.op = FETCH_OP_VFETCH; + vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER; + vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; + vtx.mega_fetch_count = 16; + vtx.data_format = FMT_32_32_32_32; + vtx.num_format_all = 2; + vtx.format_comp_all = 1; + vtx.use_const_fields = 0; + vtx.endian = r600_endian_swap(32); + vtx.srf_mode_all = 1; + vtx.offset = 0; + vtx.dst_gpr = ctx->tess_input_info; + vtx.dst_sel_x = 0; + vtx.dst_sel_y = 1; + vtx.dst_sel_z = 2; + vtx.dst_sel_w = 3; + vtx.src_gpr = temp_val; + vtx.src_sel_x = 0; + + r = r600_bytecode_add_vtx(ctx->bc, &vtx); + if (r) + return r; + } + + /* used by TCS/TES */ 
+ if (ctx->tess_output_info) { + /* fetch tcs output values into resv space */ + memset(&vtx, 0, sizeof(struct r600_bytecode_vtx)); + vtx.op = FETCH_OP_VFETCH; + vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER; + vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET; + vtx.mega_fetch_count = 16; + vtx.data_format = FMT_32_32_32_32; + vtx.num_format_all = 2; + vtx.format_comp_all = 1; + vtx.use_const_fields = 0; + vtx.endian = r600_endian_swap(32); + vtx.srf_mode_all = 1; + vtx.offset = 16; + vtx.dst_gpr = ctx->tess_output_info; + vtx.dst_sel_x = 0; + vtx.dst_sel_y = 1; + vtx.dst_sel_z = 2; + vtx.dst_sel_w = 3; + vtx.src_gpr = temp_val; + vtx.src_sel_x = 0; + + r = r600_bytecode_add_vtx(ctx->bc, &vtx); + if (r) + return r; + } + return 0; +} + +static int emit_lds_vs_writes(struct r600_shader_ctx *ctx) +{ + int i, j, r; + int temp_reg; + + /* fetch tcs input values into input_vals */ + ctx->tess_input_info = r600_get_temp(ctx); + ctx->tess_output_info = 0; + r = r600_fetch_tess_io_info(ctx); + if (r) + return r; + + temp_reg = r600_get_temp(ctx); + /* dst reg contains LDS address stride * idx */ + /* MUL vertexID, vertex_dw_stride */ + r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, + temp_reg, 0, + ctx->tess_input_info, 1, + 0, 1); /* rel id in r0.y? */ + if (r) + return r; + + for (i = 0; i < ctx->shader->noutput; i++) { + struct r600_bytecode_alu alu; + int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid); + + if (param) { + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + temp_reg, 1, + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, param * 16); + if (r) + return r; + } + + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + temp_reg, 2, + temp_reg, param ? 1 : 0, + V_SQ_ALU_SRC_LITERAL, 8); + if (r) + return r; + + + for (j = 0; j < 2; j++) { + int chan = (j == 1) ? 2 : (param ? 
1 : 0); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = LDS_OP3_LDS_WRITE_REL; + alu.src[0].sel = temp_reg; + alu.src[0].chan = chan; + alu.src[1].sel = ctx->shader->output[i].gpr; + alu.src[1].chan = j * 2; + alu.src[2].sel = ctx->shader->output[i].gpr; + alu.src[2].chan = (j * 2) + 1; + alu.last = 1; + alu.dst.chan = 0; + alu.lds_idx = 1; + alu.is_lds_idx_op = true; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + return 0; +} + +static int r600_store_tcs_output(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + const struct tgsi_full_dst_register *dst = &inst->Dst[0]; + int i, r, lasti; + int temp_reg = r600_get_temp(ctx); + struct r600_bytecode_alu alu; + unsigned write_mask = dst->Register.WriteMask; + + if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT) + return 0; + + r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true); + if (r) + return r; + + /* the base address is now in temp.x */ + r = r600_get_byte_address(ctx, temp_reg, + &inst->Dst[0], NULL, ctx->tess_output_info, 1); + if (r) + return r; + + /* LDS write */ + lasti = tgsi_last_instruction(write_mask); + for (i = 1; i <= lasti; i++) { + + if (!(write_mask & (1 << i))) + continue; + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + temp_reg, i, + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, 4 * i); + if (r) + return r; + } + + for (i = 0; i <= lasti; i++) { + if (!(write_mask & (1 << i))) + continue; + + if ((i == 0 && ((write_mask & 3) == 3)) || + (i == 2 && ((write_mask & 0xc) == 0xc))) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = LDS_OP3_LDS_WRITE_REL; + alu.src[0].sel = temp_reg; + alu.src[0].chan = i; + + alu.src[1].sel = dst->Register.Index; + alu.src[1].sel += ctx->file_offset[dst->Register.File]; + alu.src[1].chan = i; + + alu.src[2].sel = dst->Register.Index; + alu.src[2].sel += ctx->file_offset[dst->Register.File]; + alu.src[2].chan = i + 1; + alu.lds_idx = 
1; + alu.dst.chan = 0; + alu.last = 1; + alu.is_lds_idx_op = true; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + i += 1; + continue; + } + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = LDS_OP2_LDS_WRITE; + alu.src[0].sel = temp_reg; + alu.src[0].chan = i; + + alu.src[1].sel = dst->Register.Index; + alu.src[1].sel += ctx->file_offset[dst->Register.File]; + alu.src[1].chan = i; + + alu.src[2].sel = V_SQ_ALU_SRC_0; + alu.dst.chan = 0; + alu.last = 1; + alu.is_lds_idx_op = true; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; +} + +static int r600_tess_factor_read(struct r600_shader_ctx *ctx, + int output_idx) +{ + int param; + unsigned temp_reg = r600_get_temp(ctx); + unsigned name = ctx->shader->output[output_idx].name; + int dreg = ctx->shader->output[output_idx].gpr; + int r; + + param = r600_get_lds_unique_index(name, 0); + r = get_lds_offset0(ctx, 1, temp_reg, true); + if (r) + return r; + + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + temp_reg, 0, + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, param * 16); + if (r) + return r; + + do_lds_fetch_values(ctx, temp_reg, dreg); + return 0; +} + +static int r600_emit_tess_factor(struct r600_shader_ctx *ctx) +{ + unsigned i; + int stride, outer_comps, inner_comps; + int tessinner_idx = -1, tessouter_idx = -1; + int r; + int temp_reg = r600_get_temp(ctx); + int treg[3] = {-1, -1, -1}; + struct r600_bytecode_alu alu; + struct r600_bytecode_cf *cf_jump, *cf_pop; + + /* only execute factor emission for invocation 0 */ + /* PRED_SETE_INT __, R0.x, 0 */ + memset(&alu, 0, sizeof(alu)); + alu.op = ALU_OP2_PRED_SETE_INT; + alu.src[0].chan = 2; + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.execute_mask = 1; + alu.update_pred = 1; + alu.last = 1; + r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE); + + r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP); + cf_jump = ctx->bc->cf_last; + + treg[0] = r600_get_temp(ctx); + switch (ctx->shader->tcs_prim_mode) { + 
case PIPE_PRIM_LINES: + stride = 8; /* 2 dwords, 1 vec2 store */ + outer_comps = 2; + inner_comps = 0; + break; + case PIPE_PRIM_TRIANGLES: + stride = 16; /* 4 dwords, 1 vec4 store */ + outer_comps = 3; + inner_comps = 1; + treg[1] = r600_get_temp(ctx); + break; + case PIPE_PRIM_QUADS: + stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */ + outer_comps = 4; + inner_comps = 2; + treg[1] = r600_get_temp(ctx); + treg[2] = r600_get_temp(ctx); + break; + default: + assert(0); + return -1; + } + + /* R0 is InvocationID, RelPatchID, PatchID, tf_base */ + /* TF_WRITE takes index in R.x, value in R.y */ + for (i = 0; i < ctx->shader->noutput; i++) { + if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSINNER) + tessinner_idx = i; + if (ctx->shader->output[i].name == TGSI_SEMANTIC_TESSOUTER) + tessouter_idx = i; + } + + if (tessouter_idx == -1) + return -1; + + if (tessinner_idx == -1 && inner_comps) + return -1; + + if (tessouter_idx != -1) { + r = r600_tess_factor_read(ctx, tessouter_idx); + if (r) + return r; + } + + if (tessinner_idx != -1) { + r = r600_tess_factor_read(ctx, tessinner_idx); + if (r) + return r; + } + + /* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */ + /* r.x = relpatchid(r0.y) * tf_stride */ + + /* multiply incoming r0.y * stride - t.x = r0.y * stride */ + /* add incoming r0.w to it: t.x = t.x + r0.w */ + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + temp_reg, 0, + 0, 1, + V_SQ_ALU_SRC_LITERAL, stride, + 0, 3); + if (r) + return r; + + for (i = 0; i < outer_comps + inner_comps; i++) { + int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx; + int out_comp = i >= outer_comps ? 
i - outer_comps : i; + + r = single_alu_op2(ctx, ALU_OP2_ADD_INT, + treg[i / 2], (2 * (i % 2)), + temp_reg, 0, + V_SQ_ALU_SRC_LITERAL, 4 * i); + if (r) + return r; + r = single_alu_op2(ctx, ALU_OP1_MOV, + treg[i / 2], 1 + (2 * (i%2)), + ctx->shader->output[out_idx].gpr, out_comp, + 0, 0); + if (r) + return r; + } + for (i = 0; i < outer_comps + inner_comps; i++) { + struct r600_bytecode_gds gds; + + memset(&gds, 0, sizeof(struct r600_bytecode_gds)); + gds.src_gpr = treg[i / 2]; + gds.src_sel_x = 2 * (i % 2); + gds.src_sel_y = 1 + (2 * (i % 2)); + gds.src_sel_z = 4; + gds.dst_sel_x = 7; + gds.dst_sel_y = 7; + gds.dst_sel_z = 7; + gds.dst_sel_w = 7; + gds.op = FETCH_OP_TF_WRITE; + r = r600_bytecode_add_gds(ctx->bc, &gds); + if (r) + return r; + } + + // Patch up jump label + r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP); + cf_pop = ctx->bc->cf_last; + + cf_jump->cf_addr = cf_pop->id + 2; + cf_jump->pop_count = 1; + cf_pop->cf_addr = cf_pop->id + 2; + cf_pop->pop_count = 1; + + return 0; +} + +static int r600_shader_from_tgsi(struct r600_context *rctx, + struct r600_pipe_shader *pipeshader, + union r600_shader_key key) +{ + struct r600_screen *rscreen = rctx->screen; + struct r600_shader *shader = &pipeshader->shader; + struct tgsi_token *tokens = pipeshader->selector->tokens; + struct pipe_stream_output_info so = pipeshader->selector->so; + struct tgsi_full_immediate *immediate; + struct r600_shader_ctx ctx; + struct r600_bytecode_output output[32]; + unsigned output_done, noutput; + unsigned opcode; + int i, j, k, r = 0; + int next_param_base = 0, next_clip_base; + int max_color_exports = MAX2(key.ps.nr_cbufs, 1); + bool indirect_gprs; + bool ring_outputs = false; + bool lds_outputs = false; + bool lds_inputs = false; + bool pos_emitted = false; + ctx.bc = &shader->bc; ctx.shader = shader; ctx.native_integers = true; - shader->vs_as_gs_a = key.vs_as_gs_a; - shader->vs_as_es = key.vs_as_es; - r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family, 
rscreen->has_compressed_msaa_texturing); ctx.tokens = tokens; tgsi_scan_shader(tokens, &ctx.info); shader->indirect_files = ctx.info.indirect_files; - indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT); + + shader->uses_doubles = ctx.info.uses_doubles; + + indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER)); tgsi_parse_init(&ctx.parse, tokens); - ctx.type = ctx.parse.FullHeader.Processor.Processor; + ctx.type = ctx.info.processor; shader->processor_type = ctx.type; ctx.bc->type = shader->processor_type; - ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY); + switch (ctx.type) { + case PIPE_SHADER_VERTEX: + shader->vs_as_gs_a = key.vs.as_gs_a; + shader->vs_as_es = key.vs.as_es; + shader->vs_as_ls = key.vs.as_ls; + if (shader->vs_as_es) + ring_outputs = true; + if (shader->vs_as_ls) + lds_outputs = true; + break; + case PIPE_SHADER_GEOMETRY: + ring_outputs = true; + break; + case PIPE_SHADER_TESS_CTRL: + shader->tcs_prim_mode = key.tcs.prim_mode; + lds_outputs = true; + lds_inputs = true; + break; + case PIPE_SHADER_TESS_EVAL: + shader->tes_as_es = key.tes.as_es; + lds_inputs = true; + if (shader->tes_as_es) + ring_outputs = true; + break; + case PIPE_SHADER_FRAGMENT: + shader->two_side = key.ps.color_two_side; + break; + default: + break; + } - if (key.vs_as_es) { + if (shader->vs_as_es || shader->tes_as_es) { ctx.gs_for_vs = &rctx->gs_shader->current->shader; } else { ctx.gs_for_vs = NULL; @@ -1849,8 +2994,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.next_ring_offset = 0; ctx.gs_out_ring_offset = 0; ctx.gs_next_vertex = 0; + ctx.gs_stream_output_info = &so; - shader->uses_index_registers = false; ctx.face_gpr = -1; ctx.fixed_pt_position_gpr = -1; ctx.fragcoord_input = -1; @@ -1860,7 +3005,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->nr_ps_color_exports = 0; shader->nr_ps_max_color_exports = 0; - shader->two_side = key.color_two_side; 
/* register allocations */ /* Values [0,127] correspond to GPR[0..127]. @@ -1887,40 +3031,42 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[i] = 0; } -#ifdef R600_USE_LLVM - if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) { - fprintf(stderr, "Warning: R600 LLVM backend does not support " - "indirect adressing. Falling back to TGSI " - "backend.\n"); - use_llvm = 0; - } -#endif - if (ctx.type == TGSI_PROCESSOR_VERTEX) { + if (ctx.type == PIPE_SHADER_VERTEX) { ctx.file_offset[TGSI_FILE_INPUT] = 1; - if (!use_llvm) { - r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); - } + r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); } - if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { + if (ctx.type == PIPE_SHADER_FRAGMENT) { if (ctx.bc->chip_class >= EVERGREEN) ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx); else ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]); } - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { + if (ctx.type == PIPE_SHADER_GEOMETRY) { /* FIXME 1 would be enough in some cases (3 or less input vertices) */ ctx.file_offset[TGSI_FILE_INPUT] = 2; } - ctx.use_llvm = use_llvm; + if (ctx.type == PIPE_SHADER_TESS_CTRL) + ctx.file_offset[TGSI_FILE_INPUT] = 1; + if (ctx.type == PIPE_SHADER_TESS_EVAL) { + bool add_tesscoord = false, add_tess_inout = false; + ctx.file_offset[TGSI_FILE_INPUT] = 1; + for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) { + /* if we have tesscoord save one reg */ + if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD) + add_tesscoord = true; + if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER || + ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER) + add_tess_inout = true; + } + if (add_tesscoord || add_tess_inout) + ctx.file_offset[TGSI_FILE_INPUT]++; + if (add_tess_inout) + ctx.file_offset[TGSI_FILE_INPUT]+=2; + } - if (use_llvm) { 
- ctx.file_offset[TGSI_FILE_OUTPUT] = - ctx.file_offset[TGSI_FILE_INPUT]; - } else { - ctx.file_offset[TGSI_FILE_OUTPUT] = + ctx.file_offset[TGSI_FILE_OUTPUT] = ctx.file_offset[TGSI_FILE_INPUT] + ctx.info.file_max[TGSI_FILE_INPUT] + 1; - } ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + ctx.info.file_max[TGSI_FILE_OUTPUT] + 1; @@ -1931,15 +3077,25 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL; ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1; - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { - ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1; - ctx.temp_reg = ctx.bc->ar_reg + 2; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 3; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 4; + ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1; + ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2; + + if (ctx.type == PIPE_SHADER_TESS_CTRL) { + ctx.tess_input_info = ctx.bc->ar_reg + 3; + ctx.tess_output_info = ctx.bc->ar_reg + 4; + ctx.temp_reg = ctx.bc->ar_reg + 5; + } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { + ctx.tess_input_info = 0; + ctx.tess_output_info = ctx.bc->ar_reg + 3; + ctx.temp_reg = ctx.bc->ar_reg + 4; + } else if (ctx.type == PIPE_SHADER_GEOMETRY) { + ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3; + ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4; + ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5; + ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6; + ctx.temp_reg = ctx.bc->ar_reg + 7; } else { - ctx.temp_reg = ctx.bc->ar_reg + 1; - ctx.bc->index_reg[0] = ctx.bc->ar_reg + 2; - ctx.bc->index_reg[1] = ctx.bc->ar_reg + 3; + ctx.temp_reg = ctx.bc->ar_reg + 3; } shader->max_arrays = 0; @@ -1962,10 +3118,17 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.nliterals = 0; ctx.literals = NULL; - shader->fs_write_all = FALSE; + + shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && + 
ctx.info.colors_written == 1; + shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]; if (shader->vs_as_gs_a) - vs_add_primid_output(&ctx, key.vs_prim_id_out); + vs_add_primid_output(&ctx, key.vs.prim_id_out); + + if (ctx.type == PIPE_SHADER_TESS_EVAL) + r600_fetch_tess_io_info(&ctx); while (!tgsi_parse_end_of_tokens(&ctx.parse)) { tgsi_parse_token(&ctx.parse); @@ -1989,34 +3152,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, goto out_err; break; case TGSI_TOKEN_TYPE_INSTRUCTION: - break; case TGSI_TOKEN_TYPE_PROPERTY: - property = &ctx.parse.FullToken.FullProperty; - switch (property->Property.PropertyName) { - case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS: - if (property->u[0].Data == 1) - shader->fs_write_all = TRUE; - break; - case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION: - if (property->u[0].Data == 1) - shader->vs_position_window_space = TRUE; - break; - case TGSI_PROPERTY_VS_PROHIBIT_UCPS: - /* we don't need this one */ - break; - case TGSI_PROPERTY_GS_INPUT_PRIM: - shader->gs_input_prim = property->u[0].Data; - break; - case TGSI_PROPERTY_GS_OUTPUT_PRIM: - shader->gs_output_prim = property->u[0].Data; - break; - case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES: - shader->gs_max_out_vertices = property->u[0].Data; - break; - case TGSI_PROPERTY_GS_INVOCATIONS: - shader->gs_num_invocations = property->u[0].Data; - break; - } break; default: R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type); @@ -2025,7 +3161,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } } - shader->ring_item_size = ctx.next_ring_offset; + shader->ring_item_sizes[0] = ctx.next_ring_offset; + shader->ring_item_sizes[1] = 0; + shader->ring_item_sizes[2] = 0; + shader->ring_item_sizes[3] = 0; /* Process two side if needed */ if (shader->two_side && ctx.colors_used) { @@ -2067,70 +3206,12 @@ static int 
r600_shader_from_tgsi(struct r600_context *rctx, } } -/* LLVM backend setup */ -#ifdef R600_USE_LLVM - if (use_llvm) { - struct radeon_llvm_context radeon_llvm_ctx; - LLVMModuleRef mod; - bool dump = r600_can_dump_shader(&rscreen->b, tokens); - boolean use_kill = false; - - memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx)); - radeon_llvm_ctx.type = ctx.type; - radeon_llvm_ctx.two_side = shader->two_side; - radeon_llvm_ctx.face_gpr = ctx.face_gpr; - radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1; - radeon_llvm_ctx.r600_inputs = ctx.shader->input; - radeon_llvm_ctx.r600_outputs = ctx.shader->output; - radeon_llvm_ctx.color_buffer_count = max_color_exports; - radeon_llvm_ctx.chip_class = ctx.bc->chip_class; - radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN); - radeon_llvm_ctx.stream_outputs = &so; - radeon_llvm_ctx.clip_vertex = ctx.cv_output; - radeon_llvm_ctx.alpha_to_one = key.alpha_to_one; - radeon_llvm_ctx.has_compressed_msaa_texturing = - ctx.bc->has_compressed_msaa_texturing; - mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens); - ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp; - ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers; - - if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) { - radeon_llvm_dispose(&radeon_llvm_ctx); - use_llvm = 0; - fprintf(stderr, "R600 LLVM backend failed to compile " - "shader. 
Falling back to TGSI\n"); - } else { - ctx.file_offset[TGSI_FILE_OUTPUT] = - ctx.file_offset[TGSI_FILE_INPUT]; - } - if (use_kill) - ctx.shader->uses_kill = use_kill; - radeon_llvm_dispose(&radeon_llvm_ctx); - } -#endif -/* End of LLVM backend setup */ - if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) shader->nr_ps_max_color_exports = 8; - if (!use_llvm) { - if (ctx.fragcoord_input >= 0) { - if (ctx.bc->chip_class == CAYMAN) { - for (j = 0 ; j < 4; j++) { - struct r600_bytecode_alu alu; - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_RECIP_IEEE; - alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; - alu.src[0].chan = 3; - - alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; - alu.dst.chan = j; - alu.dst.write = (j == 3); - alu.last = 1; - if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) - return r; - } - } else { + if (ctx.fragcoord_input >= 0) { + if (ctx.bc->chip_class == CAYMAN) { + for (j = 0 ; j < 4; j++) { struct r600_bytecode_alu alu; memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_RECIP_IEEE; @@ -2138,67 +3219,100 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, alu.src[0].chan = 3; alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; - alu.dst.chan = 3; - alu.dst.write = 1; + alu.dst.chan = j; + alu.dst.write = (j == 3); alu.last = 1; if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) return r; } - } - - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { + } else { struct r600_bytecode_alu alu; - int r; - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_MOV; - alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; - alu.src[0].value = 0; - alu.dst.sel = ctx.gs_export_gpr_treg; - alu.dst.write = 1; - alu.last = 1; - r = r600_bytecode_add_alu(ctx.bc, &alu); - if (r) + alu.op = ALU_OP1_RECIP_IEEE; + alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; + alu.src[0].chan = 3; + + alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last 
= 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) return r; } - if (shader->two_side && ctx.colors_used) { - if ((r = process_twoside_color_inputs(&ctx))) + } + + if (ctx.type == PIPE_SHADER_GEOMETRY) { + struct r600_bytecode_alu alu; + int r; + + /* GS thread with no output workaround - emit a cut at start of GS */ + if (ctx.bc->chip_class == R600) + r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); + + for (j = 0; j < 4; j++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0; + alu.dst.sel = ctx.gs_export_gpr_tregs[j]; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx.bc, &alu); + if (r) return r; } + } - tgsi_parse_init(&ctx.parse, tokens); - while (!tgsi_parse_end_of_tokens(&ctx.parse)) { - tgsi_parse_token(&ctx.parse); - switch (ctx.parse.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_INSTRUCTION: - r = tgsi_is_supported(&ctx); - if (r) - goto out_err; - ctx.max_driver_temp_used = 0; - /* reserve first tmp for everyone */ - r600_get_temp(&ctx); + if (ctx.type == PIPE_SHADER_TESS_CTRL) + r600_fetch_tess_io_info(&ctx); + + if (shader->two_side && ctx.colors_used) { + if ((r = process_twoside_color_inputs(&ctx))) + return r; + } + + tgsi_parse_init(&ctx.parse, tokens); + while (!tgsi_parse_end_of_tokens(&ctx.parse)) { + tgsi_parse_token(&ctx.parse); + switch (ctx.parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + r = tgsi_is_supported(&ctx); + if (r) + goto out_err; + ctx.max_driver_temp_used = 0; + /* reserve first tmp for everyone */ + r600_get_temp(&ctx); - opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; - if ((r = tgsi_split_constant(&ctx))) + opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; + if ((r = tgsi_split_constant(&ctx))) + goto out_err; + if ((r = tgsi_split_literal_constant(&ctx))) + goto out_err; + if (ctx.type == PIPE_SHADER_GEOMETRY) { + if ((r = tgsi_split_gs_inputs(&ctx))) goto 
out_err; - if ((r = tgsi_split_literal_constant(&ctx))) + } else if (lds_inputs) { + if ((r = tgsi_split_lds_inputs(&ctx))) goto out_err; - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) - if ((r = tgsi_split_gs_inputs(&ctx))) - goto out_err; - if (ctx.bc->chip_class == CAYMAN) - ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; - else if (ctx.bc->chip_class >= EVERGREEN) - ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; - else - ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; - r = ctx.inst_info->process(&ctx); + } + if (ctx.bc->chip_class == CAYMAN) + ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; + else if (ctx.bc->chip_class >= EVERGREEN) + ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; + else + ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; + r = ctx.inst_info->process(&ctx); + if (r) + goto out_err; + + if (ctx.type == PIPE_SHADER_TESS_CTRL) { + r = r600_store_tcs_output(&ctx); if (r) goto out_err; - break; - default: - break; } + break; + default: + break; } } @@ -2241,7 +3355,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, alu.src[0].chan = j; alu.src[1].sel = 512 + i; - alu.src[1].kc_bank = R600_UCP_CONST_BUFFER; + alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; alu.src[1].chan = j; alu.dst.sel = clipdist_temp[oreg]; @@ -2249,8 +3363,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, alu.dst.write = (j == ochan); if (j == 3) alu.last = 1; - if (!use_llvm) - r = r600_bytecode_add_alu(ctx.bc, &alu); + r = r600_bytecode_add_alu(ctx.bc, &alu); if (r) return r; } @@ -2258,15 +3371,35 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } /* Add stream outputs. 
*/ - if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX && - so.num_outputs && !use_llvm) - emit_streamout(&ctx, &so); - + if (so.num_outputs) { + bool emit = false; + if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX) + emit = true; + if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL) + emit = true; + if (emit) + emit_streamout(&ctx, &so, -1, NULL); + } + pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask; convert_edgeflag_to_int(&ctx); - if (ring_outputs) { - if (key.vs_as_es) - emit_gs_ring_writes(&ctx, FALSE); + if (ctx.type == PIPE_SHADER_TESS_CTRL) + r600_emit_tess_factor(&ctx); + + if (lds_outputs) { + if (ctx.type == PIPE_SHADER_VERTEX) { + if (ctx.shader->noutput) + emit_lds_vs_writes(&ctx); + } + } else if (ring_outputs) { + if (shader->vs_as_es || shader->tes_as_es) { + ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx); + ctx.gs_export_gpr_tregs[1] = -1; + ctx.gs_export_gpr_tregs[2] = -1; + ctx.gs_export_gpr_tregs[3] = -1; + + emit_gs_ring_writes(&ctx, &so, -1, FALSE); + } } else { /* Export output */ next_clip_base = shader->vs_out_misc_write ? 62 : 61; @@ -2283,7 +3416,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].type = -1; output[j].op = CF_OP_EXPORT; switch (ctx.type) { - case TGSI_PROCESSOR_VERTEX: + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_TESS_EVAL: switch (shader->output[i].name) { case TGSI_SEMANTIC_POSITION: output[j].array_base = 60; @@ -2373,7 +3507,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } break; - case TGSI_PROCESSOR_FRAGMENT: + case PIPE_SHADER_FRAGMENT: if (shader->output[i].name == TGSI_SEMANTIC_COLOR) { /* never export more colors than the number of CBs */ if (shader->output[i].sid >= max_color_exports) { @@ -2381,7 +3515,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, j--; continue; } - output[j].swizzle_w = key.alpha_to_one ? 5 : 3; + output[j].swizzle_w = key.ps.alpha_to_one ? 
5 : 3; output[j].array_base = shader->output[i].sid; output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; shader->nr_ps_color_exports++; @@ -2394,7 +3528,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, output[j].swizzle_x = 0; output[j].swizzle_y = 1; output[j].swizzle_z = 2; - output[j].swizzle_w = key.alpha_to_one ? 5 : 3; + output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3; output[j].burst_count = 1; output[j].array_base = k; output[j].op = CF_OP_EXPORT; @@ -2427,6 +3561,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, goto out_err; } break; + case PIPE_SHADER_TESS_CTRL: + break; default: R600_ERR("unsupported processor type %d\n", ctx.type); r = -EINVAL; @@ -2440,7 +3576,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } /* add fake position export */ - if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) { + if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) { memset(&output[j], 0, sizeof(struct r600_bytecode_output)); output[j].gpr = 0; output[j].elem_size = 3; @@ -2456,7 +3592,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } /* add fake param output for vertex shader if no param is exported */ - if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) { + if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) { memset(&output[j], 0, sizeof(struct r600_bytecode_output)); output[j].gpr = 0; output[j].elem_size = 3; @@ -2472,7 +3608,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } /* add fake pixel export */ - if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) { + if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) { memset(&output[j], 0, sizeof(struct r600_bytecode_output)); output[j].gpr = 0; output[j].elem_size = 3; @@ -2485,6 +3621,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, 
output[j].array_base = 0; output[j].op = CF_OP_EXPORT; j++; + shader->nr_ps_color_exports++; } noutput = j; @@ -2497,31 +3634,27 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } } /* add output to bytecode */ - if (!use_llvm) { - for (i = 0; i < noutput; i++) { - r = r600_bytecode_add_output(ctx.bc, &output[i]); - if (r) - goto out_err; - } + for (i = 0; i < noutput; i++) { + r = r600_bytecode_add_output(ctx.bc, &output[i]); + if (r) + goto out_err; } } /* add program end */ - if (!use_llvm) { - if (ctx.bc->chip_class == CAYMAN) - cm_bytecode_add_cf_end(ctx.bc); - else { - const struct cf_op_info *last = NULL; + if (ctx.bc->chip_class == CAYMAN) + cm_bytecode_add_cf_end(ctx.bc); + else { + const struct cf_op_info *last = NULL; - if (ctx.bc->cf_last) - last = r600_isa_cf(ctx.bc->cf_last->op); + if (ctx.bc->cf_last) + last = r600_isa_cf(ctx.bc->cf_last->op); - /* alu clause instructions don't have EOP bit, so add NOP */ - if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS) - r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); + /* alu clause instructions don't have EOP bit, so add NOP */ + if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS) + r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); - ctx.bc->cf_last->end_of_program = 1; - } + ctx.bc->cf_last->end_of_program = 1; } /* check GPR limit - we have 124 = 128 - 4 @@ -2532,7 +3665,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, goto out_err; } - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { + if (ctx.type == PIPE_SHADER_GEOMETRY) { if ((r = generate_gs_copy_shader(rctx, pipeshader, &so))) return r; } @@ -2596,23 +3729,173 @@ static void tgsi_dst(struct r600_shader_ctx *ctx, r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File]; r600_dst->chan = swizzle; r600_dst->write = 1; - if 
(tgsi_dst->Register.Indirect) - r600_dst->rel = V_SQ_REL_RELATIVE; if (inst->Instruction.Saturate) { r600_dst->clamp = 1; } + if (ctx->type == PIPE_SHADER_TESS_CTRL) { + if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) { + return; + } + } + if (tgsi_dst->Register.Indirect) + r600_dst->rel = V_SQ_REL_RELATIVE; + } -static int tgsi_last_instruction(unsigned writemask) +static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap) { - int i, lasti = 0; + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + unsigned write_mask = inst->Dst[0].Register.WriteMask; + struct r600_bytecode_alu alu; + int i, j, r, lasti = tgsi_last_instruction(write_mask); + int use_tmp = 0; - for (i = 0; i < 4; i++) { - if (writemask & (1 << i)) { - lasti = i; + if (singledest) { + switch (write_mask) { + case 0x1: + write_mask = 0x3; + break; + case 0x2: + use_tmp = 1; + write_mask = 0x3; + break; + case 0x4: + write_mask = 0xc; + break; + case 0x8: + write_mask = 0xc; + use_tmp = 3; + break; } } - return lasti; + + lasti = tgsi_last_instruction(write_mask); + for (i = 0; i <= lasti; i++) { + + if (!(write_mask & (1 << i))) + continue; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + + if (singledest) { + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + if (use_tmp) { + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + } + if (i == 1 || i == 3) + alu.dst.write = 0; + } else + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + + alu.op = ctx->inst_info->op; + if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) { + r600_bytecode_src(&alu.src[0], &ctx->src[0], i); + } else if (!swap) { + for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { + r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i)); + } + } else { + r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i)); + r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i)); + } + + /* handle some special cases */ + if (i == 1 
|| i == 3) { + switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) { + case TGSI_OPCODE_DABS: + r600_bytecode_src_set_abs(&alu.src[0]); + break; + default: + break; + } + } + if (i == lasti) { + alu.last = 1; + } + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + if (use_tmp) { + write_mask = inst->Dst[0].Register.WriteMask; + + /* move result from temp to dst */ + for (i = 0; i <= lasti; i++) { + if (!(write_mask & (1 << i))) + continue; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = use_tmp - 1; + alu.last = (i == lasti); + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + return 0; +} + +static int tgsi_op2_64(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + unsigned write_mask = inst->Dst[0].Register.WriteMask; + /* confirm writemasking */ + if ((write_mask & 0x3) != 0x3 && + (write_mask & 0xc) != 0xc) { + fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask); + return -1; + } + return tgsi_op2_64_params(ctx, false, false); +} + +static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx) +{ + return tgsi_op2_64_params(ctx, true, false); +} + +static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx) +{ + return tgsi_op2_64_params(ctx, true, true); +} + +static int tgsi_op3_64(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int i, j, r; + int lasti = 3; + int tmp = r600_get_temp(ctx); + + for (i = 0; i < lasti + 1; i++) { + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ctx->inst_info->op; + for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { + r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 
0 : 1); + } + + if (inst->Dst[0].Register.WriteMask & (1 << i)) + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + else + alu.dst.sel = tmp; + + alu.dst.chan = i; + alu.is_op3 = 1; + if (i == lasti) { + alu.last = 1; + } + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; } static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only) @@ -2645,17 +3928,6 @@ static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only) r600_bytecode_src(&alu.src[0], &ctx->src[1], i); r600_bytecode_src(&alu.src[1], &ctx->src[0], i); } - /* handle some special cases */ - switch (inst->Instruction.Opcode) { - case TGSI_OPCODE_SUB: - r600_bytecode_src_toggle_neg(&alu.src[1]); - break; - case TGSI_OPCODE_ABS: - r600_bytecode_src_set_abs(&alu.src[0]); - break; - default: - break; - } if (i == lasti || trans_only) { alu.last = 1; } @@ -2664,71 +3936,307 @@ static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only) return r; } - if (use_tmp) { - /* move result from temp to dst */ - for (i = 0; i <= lasti; i++) { - if (!(write_mask & (1 << i))) - continue; + if (use_tmp) { + /* move result from temp to dst */ + for (i = 0; i <= lasti; i++) { + if (!(write_mask & (1 << i))) + continue; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = i; + alu.last = (i == lasti); + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + } + return 0; +} + +static int tgsi_op2(struct r600_shader_ctx *ctx) +{ + return tgsi_op2_s(ctx, 0, 0); +} + +static int tgsi_op2_swap(struct r600_shader_ctx *ctx) +{ + return tgsi_op2_s(ctx, 1, 0); +} + +static int tgsi_op2_trans(struct r600_shader_ctx *ctx) +{ + return tgsi_op2_s(ctx, 0, 1); +} + +static int tgsi_ineg(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + 
int i, r; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + + for (i = 0; i < lasti + 1; i++) { + + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ctx->inst_info->op; + + alu.src[0].sel = V_SQ_ALU_SRC_0; + + r600_bytecode_src(&alu.src[1], &ctx->src[0], i); + + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + + if (i == lasti) { + alu.last = 1; + } + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; + +} + +static int tgsi_dneg(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int i, r; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + + for (i = 0; i < lasti + 1; i++) { + + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + + r600_bytecode_src(&alu.src[0], &ctx->src[0], i); + + if (i == 1 || i == 3) + r600_bytecode_src_toggle_neg(&alu.src[0]); + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + + if (i == lasti) { + alu.last = 1; + } + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; + +} + +static int tgsi_dfracexp(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + unsigned write_mask = inst->Dst[0].Register.WriteMask; + int i, j, r; + int firsti = write_mask == 0xc ? 
2 : 0; + + for (i = 0; i <= 3; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ctx->inst_info->op; + + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { + r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i)); + } + + if (i == 3) + alu.last = 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + /* MOV first two channels to writemask dst0 */ + for (i = 0; i <= 1; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].chan = i + 2; + alu.src[0].sel = ctx->temp_reg; + + tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst); + alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + for (i = 0; i <= 3; i++) { + if (inst->Dst[1].Register.WriteMask & (1 << i)) { + /* MOV third channels to writemask dst1 */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].chan = 1; + alu.src[0].sel = ctx->temp_reg; + + tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst); + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + break; + } + } + return 0; +} + + +static int egcm_int_to_double(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int i, r; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + + assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D || + inst->Instruction.Opcode == TGSI_OPCODE_U2D); + + for (i = 0; i <= (lasti+1)/2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ctx->inst_info->op; + + r600_bytecode_src(&alu.src[0], &ctx->src[0], i); + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + alu.last = 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + for (i = 0; i <= 
lasti; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_FLT32_TO_FLT64; - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_MOV; - tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.src[0].chan = i/2; + if (i%2 == 0) alu.src[0].sel = ctx->temp_reg; - alu.src[0].chan = i; - alu.last = (i == lasti); - - r = r600_bytecode_add_alu(ctx->bc, &alu); - if (r) - return r; + else { + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0x0; } + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.last = i == lasti; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; } + return 0; } -static int tgsi_op2(struct r600_shader_ctx *ctx) +static int egcm_double_to_int(struct r600_shader_ctx *ctx) { - return tgsi_op2_s(ctx, 0, 0); -} + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int i, r; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); -static int tgsi_op2_swap(struct r600_shader_ctx *ctx) -{ - return tgsi_op2_s(ctx, 1, 0); -} + assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I || + inst->Instruction.Opcode == TGSI_OPCODE_D2U); -static int tgsi_op2_trans(struct r600_shader_ctx *ctx) -{ - return tgsi_op2_s(ctx, 0, 1); + for (i = 0; i <= lasti; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_FLT64_TO_FLT32; + + r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i)); + alu.dst.chan = i; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = i%2 == 0; + alu.last = i == lasti; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + for (i = 0; i <= (lasti+1)/2; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ctx->inst_info->op; + + alu.src[0].chan = i*2; + alu.src[0].sel = ctx->temp_reg; + tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); + alu.last = 1; + + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + return 0; } -static int tgsi_ineg(struct 
r600_shader_ctx *ctx) +static int cayman_emit_double_instr(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; - struct r600_bytecode_alu alu; int i, r; + struct r600_bytecode_alu alu; + int last_slot = 3; int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + int t1 = ctx->temp_reg; - for (i = 0; i < lasti + 1; i++) { - - if (!(inst->Dst[0].Register.WriteMask & (1 << i))) - continue; + /* these have to write the result to X/Y by the looks of it */ + for (i = 0 ; i < last_slot; i++) { memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ctx->inst_info->op; - alu.src[0].sel = V_SQ_ALU_SRC_0; + /* should only be one src regs */ + assert (inst->Instruction.NumSrcRegs == 1); - r600_bytecode_src(&alu.src[1], &ctx->src[0], i); + r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); + r600_bytecode_src(&alu.src[1], &ctx->src[0], 0); - tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + /* RSQ should take the absolute value of src */ + if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ || + ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) { + r600_bytecode_src_set_abs(&alu.src[1]); + } + alu.dst.sel = t1; + alu.dst.chan = i; + alu.dst.write = (i == 0 || i == 1); - if (i == lasti) { + if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1) alu.last = 1; - } r = r600_bytecode_add_alu(ctx->bc, &alu); if (r) return r; } - return 0; + for (i = 0 ; i <= lasti; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = t1; + alu.src[0].chan = (i == 0 || i == 2) ? 
0 : 1; + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.dst.write = 1; + if (i == lasti) + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; } static int cayman_emit_float_instr(struct r600_shader_ctx *ctx) @@ -2809,6 +4317,57 @@ static int cayman_mul_int_instr(struct r600_shader_ctx *ctx) return 0; } + +static int cayman_mul_double_instr(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + int i, j, k, r; + struct r600_bytecode_alu alu; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + int t1 = ctx->temp_reg; + + /* t1 would get overwritten below if we actually tried to + * multiply two pairs of doubles at a time. */ + assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY || + inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW); + + k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1; + + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ctx->inst_info->op; + for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { + r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 
0 : 1)); + } + alu.dst.sel = t1; + alu.dst.chan = i; + alu.dst.write = 1; + if (i == 3) + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + for (i = 0; i <= lasti; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = t1; + alu.src[0].chan = i; + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.dst.write = 1; + if (i == lasti) + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + return 0; +} + /* * r600 - trunc to -PI..PI range * r700 - normalize by dividing by 2PI @@ -2816,10 +4375,6 @@ static int cayman_mul_int_instr(struct r600_shader_ctx *ctx) */ static int tgsi_setup_trig(struct r600_shader_ctx *ctx) { - static float half_inv_pi = 1.0 /(3.1415926535 * 2); - static float double_pi = 3.1415926535 * 2; - static float neg_pi = -3.1415926535; - int r; struct r600_bytecode_alu alu; @@ -2835,7 +4390,7 @@ static int tgsi_setup_trig(struct r600_shader_ctx *ctx) alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; alu.src[1].chan = 0; - alu.src[1].value = *(uint32_t *)&half_inv_pi; + alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI); alu.src[2].sel = V_SQ_ALU_SRC_0_5; alu.src[2].chan = 0; alu.last = 1; @@ -2874,8 +4429,8 @@ static int tgsi_setup_trig(struct r600_shader_ctx *ctx) alu.src[2].chan = 0; if (ctx->bc->chip_class == R600) { - alu.src[1].value = *(uint32_t *)&double_pi; - alu.src[2].value = *(uint32_t *)&neg_pi; + alu.src[1].value = u_bitcast_f2u(2.0f * M_PI); + alu.src[2].value = u_bitcast_f2u(-M_PI); } else { alu.src[1].sel = V_SQ_ALU_SRC_1; alu.src[2].sel = V_SQ_ALU_SRC_0_5; @@ -3144,7 +4699,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) { int chan; int sel; - int i; + unsigned i; if (ctx->bc->chip_class == CAYMAN) { for (i = 0; i < 3; i++) { @@ -4999,7 +6554,7 @@ static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx, inst->Src[index].Register.File != 
TGSI_FILE_INPUT && inst->Src[index].Register.File != TGSI_FILE_OUTPUT) || ctx->src[index].neg || ctx->src[index].abs || - (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY); + (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY); } static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx, @@ -5069,7 +6624,8 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l alu.src[0].sel = vtx.dst_gpr; alu.src[0].chan = i; - alu.src[1].sel = 512 + (id * 2); + alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL; + alu.src[1].sel += (id * 2); alu.src[1].chan = i % 4; alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; @@ -5091,7 +6647,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l alu.src[0].sel = vtx.dst_gpr; alu.src[0].chan = 3; - alu.src[1].sel = 512 + (id * 2) + 1; + alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1; alu.src[1].chan = 0; alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; @@ -5112,14 +6668,14 @@ static int r600_do_buffer_txq(struct r600_shader_ctx *ctx) memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; - + alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; if (ctx->bc->chip_class >= EVERGREEN) { /* channel 0 or 2 of each word */ - alu.src[0].sel = 512 + (id / 2); + alu.src[0].sel += (id / 2); alu.src[0].chan = (id % 2) * 2; } else { /* r600 we have them at channel 2 of the second dword */ - alu.src[0].sel = 512 + (id * 2) + 1; + alu.src[0].sel += (id * 2) + 1; alu.src[0].chan = 1; } alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; @@ -5133,7 +6689,6 @@ static int r600_do_buffer_txq(struct r600_shader_ctx *ctx) static int tgsi_tex(struct r600_shader_ctx *ctx) { - static float one_point_five = 1.5f; struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_tex tex; struct r600_bytecode_alu alu; @@ -5152,6 +6707,7 @@ static int 
tgsi_tex(struct r600_shader_ctx *ctx) /* Texture fetch instructions can only use gprs as source. * Also they cannot negate the source or take the absolute value */ const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ && + inst->Instruction.Opcode != TGSI_OPCODE_TXQS && tgsi_tex_src_requires_loading(ctx, 0)) || read_compressed_msaa || txf_add_offsets; @@ -5180,8 +6736,6 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) sampler_src_reg = 3; sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE - if (sampler_index_mode) - ctx->shader->uses_index_registers = true; src_gpr = tgsi_tex_get_src_gpr(ctx, 0); @@ -5336,7 +6890,7 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; alu.src[2].chan = 0; - alu.src[2].value = *(uint32_t *)&one_point_five; + alu.src[2].value = u_bitcast_f2u(1.5f); alu.dst.sel = ctx->temp_reg; alu.dst.chan = 0; @@ -5357,7 +6911,7 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; alu.src[2].chan = 0; - alu.src[2].value = *(uint32_t *)&one_point_five; + alu.src[2].value = u_bitcast_f2u(1.5f); alu.dst.sel = ctx->temp_reg; alu.dst.chan = 1; @@ -5391,7 +6945,6 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) { if (ctx->bc->chip_class >= EVERGREEN) { int mytmp = r600_get_temp(ctx); - static const float eight = 8.0f; memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; alu.src[0].sel = ctx->temp_reg; @@ -5411,7 +6964,7 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) r600_bytecode_src(&alu.src[0], &ctx->src[0], 3); alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; alu.src[1].chan = 0; - alu.src[1].value = *(uint32_t *)&eight; + alu.src[1].value = u_bitcast_f2u(8.0f); alu.src[2].sel = mytmp; alu.src[2].chan = 0; alu.dst.sel = ctx->temp_reg; @@ -5777,13 +7330,14 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) 
memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_MOV; + alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL; if (ctx->bc->chip_class >= EVERGREEN) { /* channel 1 or 3 of each word */ - alu.src[0].sel = 512 + (id / 2); + alu.src[0].sel += (id / 2); alu.src[0].chan = ((id % 2) * 2) + 1; } else { /* r600 we have them at channel 2 of the second dword */ - alu.src[0].sel = 512 + (id * 2) + 1; + alu.src[0].sel += (id * 2) + 1; alu.src[0].chan = 2; } alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER; @@ -5896,6 +7450,12 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) tex.dst_sel_z = 7; tex.dst_sel_w = 7; } + else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { + tex.dst_sel_x = 3; + tex.dst_sel_y = 7; + tex.dst_sel_z = 7; + tex.dst_sel_w = 7; + } else { tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; @@ -5904,7 +7464,8 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) } - if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) { + if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ || + inst->Instruction.Opcode == TGSI_OPCODE_TXQS) { tex.src_sel_x = 4; tex.src_sel_y = 4; tex.src_sel_z = 4; @@ -6129,6 +7690,15 @@ static int tgsi_cmp(struct r600_shader_ctx *ctx) int i, r, j; int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); int temp_regs[3]; + unsigned op; + + if (ctx->src[0].abs && ctx->src[0].neg) { + op = ALU_OP3_CNDE; + ctx->src[0].abs = 0; + ctx->src[0].neg = 0; + } else { + op = ALU_OP3_CNDGE; + } for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { temp_regs[j] = 0; @@ -6141,14 +7711,14 @@ static int tgsi_cmp(struct r600_shader_ctx *ctx) continue; memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP3_CNDGE; + alu.op = op; r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]); if (r) return r; - r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[2]); + r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, 
&alu.src[1], &ctx->src[2]); if (r) return r; - r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[2], &ctx->src[1]); + r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]); if (r) return r; tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); @@ -6270,7 +7840,7 @@ static int tgsi_exp(struct r600_shader_ctx *ctx) struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; int r; - int i; + unsigned i; /* result.x = 2^floor(src); */ if (inst->Dst[0].Register.WriteMask & 1) { @@ -6399,7 +7969,7 @@ static int tgsi_log(struct r600_shader_ctx *ctx) struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bytecode_alu alu; int r; - int i; + unsigned i; /* result.x = floor(log2(|src|)); */ if (inst->Dst[0].Register.WriteMask & 1) { @@ -6660,7 +8230,7 @@ static int tgsi_eg_arl(struct r600_shader_ctx *ctx) struct r600_bytecode_alu alu; int r; int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); - unsigned reg = inst->Dst[0].Register.Index > 0 ? 
ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg; + unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index); assert(inst->Dst[0].Register.Index < 3); memset(&alu, 0, sizeof(struct r600_bytecode_alu)); @@ -7126,7 +8696,7 @@ static int tgsi_bgnloop(struct r600_shader_ctx *ctx) static int tgsi_endloop(struct r600_shader_ctx *ctx) { - int i; + unsigned i; r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END); @@ -7216,10 +8786,20 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx) static int tgsi_gs_emit(struct r600_shader_ctx *ctx) { + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX]; + int r; + if (ctx->inst_info->op == CF_OP_EMIT_VERTEX) - emit_gs_ring_writes(ctx, TRUE); + emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE); - return r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); + r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op); + if (!r) { + ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream + if (ctx->inst_info->op == CF_OP_EMIT_VERTEX) + return emit_inc_ring_offset(ctx, stream, TRUE); + } + return r; } static int tgsi_umad(struct r600_shader_ctx *ctx) @@ -7294,6 +8874,105 @@ static int tgsi_umad(struct r600_shader_ctx *ctx) return 0; } +static int tgsi_pk2h(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int r, i; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + + /* temp.xy = f32_to_f16(src) */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_FLT32_TO_FLT16; + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + alu.dst.chan = 1; + 
r600_bytecode_src(&alu.src[0], &ctx->src[0], 1); + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* dst.x = temp.y * 0x10000 + temp.x */ + for (i = 0; i < lasti + 1; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP3_MULADD_UINT24; + alu.is_op3 = 1; + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.last = i == lasti; + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 1; + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 0x10000; + alu.src[2].sel = ctx->temp_reg; + alu.src[2].chan = 0; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + return 0; +} + +static int tgsi_up2h(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bytecode_alu alu; + int r, i; + int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask); + + /* temp.x = src.x */ + /* note: no need to mask out the high bits */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* temp.y = src.x >> 16 */ + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP2_LSHR_INT; + alu.dst.chan = 1; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + r600_bytecode_src(&alu.src[0], &ctx->src[0], 0); + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].value = 16; + alu.last = 1; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* dst.zw = dst.xy = f16_to_f32(temp.xy) */ + for (i = 0; i < lasti + 1; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + alu.op = ALU_OP1_FLT16_TO_FLT32; + alu.src[0].sel = 
ctx->temp_reg; + alu.src[0].chan = i % 2; + alu.last = i == lasti; + r = r600_bytecode_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + return 0; +} + static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl}, [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2}, @@ -7318,7 +8997,6 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3}, - [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2}, [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate}, @@ -7334,7 +9012,7 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow}, [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd}, [32] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2}, + [33] = { ALU_OP0_NOP, tgsi_unsupported}, [34] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp}, [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig}, @@ -7405,21 +9083,21 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, - [104] = { ALU_OP0_NOP, tgsi_unsupported}, - [105] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, + [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported}, [106] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, [TGSI_OPCODE_FSLT] = { 
ALU_OP2_SETGT_DX10, tgsi_op2_swap}, [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, - [112] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported}, [114] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_loop_breakc}, [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */ - [118] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans}, [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, @@ -7517,9 +9195,8 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3}, - [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2}, [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, - [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3}, [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate}, [TGSI_OPCODE_DP2A] = { ALU_OP0_NOP, tgsi_unsupported}, [22] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7533,14 +9210,14 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow}, [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd}, [32] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2}, + [33] = { ALU_OP0_NOP, tgsi_unsupported}, [34] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp}, [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig}, [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, 
/* unconditional kill */ - [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7555,7 +9232,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, - [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7604,21 +9281,21 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, - [104] = { ALU_OP0_NOP, tgsi_unsupported}, - [105] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, + [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported}, [106] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, - [112] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported}, [114] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ [TGSI_OPCODE_END] = { 
ALU_OP0_NOP, tgsi_end}, /* aka HALT */ - [118] = { ALU_OP0_NOP, tgsi_unsupported}, + /* Refer below for TGSI_OPCODE_DFMA */ [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i}, [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, @@ -7666,7 +9343,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_MFENCE] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_LFENCE] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_SFENCE] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7695,6 +9372,30 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, + [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, + [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, + [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, + [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, + [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, + [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, + [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, + [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, + [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, + [TGSI_OPCODE_DSQRT] = { 
ALU_OP2_SQRT_64, cayman_emit_double_instr}, + [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, + [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, + [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, + [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, + [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, + [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, + [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, + [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, + [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, + [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, }; @@ -7716,9 +9417,8 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2}, [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3}, - [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2}, [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp}, - [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3}, [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr}, [TGSI_OPCODE_DP2A] = { ALU_OP0_NOP, tgsi_unsupported}, [22] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7732,14 +9432,14 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow}, [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd}, [32] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2}, + [33] = { ALU_OP0_NOP, tgsi_unsupported}, [34] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp}, [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig}, [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex}, [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex}, [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill 
*/ - [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h}, [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7754,7 +9454,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex}, [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex}, [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex}, - [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h}, [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7803,21 +9503,21 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop}, [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, - [104] = { ALU_OP0_NOP, tgsi_unsupported}, - [105] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex}, + [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported}, [106] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2}, [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2}, [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap}, [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap}, - [112] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported}, [114] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */ [TGSI_OPCODE_END] = { ALU_OP0_NOP, 
tgsi_end}, /* aka HALT */ - [118] = { ALU_OP0_NOP, tgsi_unsupported}, + /* Refer below for TGSI_OPCODE_DFMA */ [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2}, [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv}, [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2}, @@ -7865,7 +9565,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_MFENCE] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_LFENCE] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_SFENCE] = { ALU_OP0_NOP, tgsi_unsupported}, - [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported}, + [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier}, [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported}, [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported}, @@ -7894,5 +9594,29 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm}, [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm}, [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm}, + [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64}, + [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64}, + [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg}, + [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64}, + [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr}, + [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64}, + [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64}, + [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s}, + [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest}, + [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr}, + [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, 
cayman_emit_double_instr}, + [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, + [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64}, + [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64}, + [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64}, + [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp}, + [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int}, + [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double}, + [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int}, + [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double}, + [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr}, [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, };