+ /* Sanity checking. */
+ if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
+ R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
+ r = -EINVAL;
+ goto out_err;
+ }
+ for (i = 0; i < so->num_outputs; i++) {
+ if (so->output[i].output_buffer >= 4) {
+ R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
+ so->output[i].output_buffer);
+ r = -EINVAL;
+ goto out_err;
+ }
+ }
+
+ /* Initialize locations where the outputs are stored. */
+ for (i = 0; i < so->num_outputs; i++) {
+ so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
+
+ /* Lower outputs with dst_offset < start_component.
+ *
+ * We can only output 4D vectors with a write mask, e.g. we can
+ * only output the W component at offset 3, etc. If we want
+ * to store Y, Z, or W at buffer offset 0, we need to use MOV
+ * to move it to X and output X. */
+ if (so->output[i].dst_offset < so->output[i].start_component) {
+ unsigned tmp = r600_get_temp(ctx);
+
+ for (j = 0; j < so->output[i].num_components; j++) {
+ struct r600_bytecode_alu alu;
+ memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+ alu.op = ALU_OP1_MOV;
+ alu.src[0].sel = so_gpr[i];
+ alu.src[0].chan = so->output[i].start_component + j;
+
+ alu.dst.sel = tmp;
+ alu.dst.chan = j;
+ alu.dst.write = 1;
+ if (j == so->output[i].num_components - 1)
+ alu.last = 1;
+ r = r600_bytecode_add_alu(ctx->bc, &alu);
+ if (r)
+ return r;
+ }
+ so->output[i].start_component = 0;
+ so_gpr[i] = tmp;
+ }
+ }
+
+ /* Write outputs to buffers. */
+ for (i = 0; i < so->num_outputs; i++) {
+ struct r600_bytecode_output output;
+
+ memset(&output, 0, sizeof(struct r600_bytecode_output));
+ output.gpr = so_gpr[i];
+ output.elem_size = so->output[i].num_components;
+ output.array_base = so->output[i].dst_offset - so->output[i].start_component;
+ output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+ output.burst_count = 1;
+ /* array_size is an upper limit for the burst_count
+ * with MEM_STREAM instructions */
+ output.array_size = 0xFFF;
+ output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
+ if (ctx->bc->chip_class >= EVERGREEN) {
+ switch (so->output[i].output_buffer) {
+ case 0:
+ output.op = CF_OP_MEM_STREAM0_BUF0;
+ break;
+ case 1:
+ output.op = CF_OP_MEM_STREAM0_BUF1;
+ break;
+ case 2:
+ output.op = CF_OP_MEM_STREAM0_BUF2;
+ break;
+ case 3:
+ output.op = CF_OP_MEM_STREAM0_BUF3;
+ break;
+ }
+ } else {
+ switch (so->output[i].output_buffer) {
+ case 0:
+ output.op = CF_OP_MEM_STREAM0;
+ break;
+ case 1:
+ output.op = CF_OP_MEM_STREAM1;
+ break;
+ case 2:
+ output.op = CF_OP_MEM_STREAM2;
+ break;
+ case 3:
+ output.op = CF_OP_MEM_STREAM3;
+ break;
+ }
+ }
+ r = r600_bytecode_add_output(ctx->bc, &output);
+ if (r)
+ goto out_err;
+ }
+ return 0;
+out_err:
+ return r;
+}
+
+/*
+ * Rewrite the edge flag output register in place: first clamp(x, 0, 1),
+ * then convert the float value to an integer.
+ *
+ * Nothing is emitted when the shader is not marked as writing an edge flag
+ * (ctx->shader->vs_out_edgeflag is zero). The results of the two
+ * r600_bytecode_add_alu() calls are not checked; the function returns void.
+ */
+static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
+{
+	struct r600_bytecode_alu insn;
+	unsigned edge_gpr;
+	int pass;
+
+	if (!ctx->shader->vs_out_edgeflag)
+		return;
+
+	edge_gpr = ctx->shader->output[ctx->edgeflag_output].gpr;
+
+	/* pass 0: clamp(x, 0, 1) via a clamped MOV onto itself
+	 * pass 1: FLT_TO_INT on the clamped value */
+	for (pass = 0; pass < 2; pass++) {
+		memset(&insn, 0, sizeof(insn));
+
+		if (pass == 0) {
+			insn.op = ALU_OP1_MOV;
+			insn.dst.clamp = 1;
+		} else {
+			insn.op = ALU_OP1_FLT_TO_INT;
+		}
+
+		/* Source and destination are the same register. */
+		insn.src[0].sel = edge_gpr;
+		insn.dst.sel = edge_gpr;
+		insn.dst.write = 1;
+		insn.last = 1;
+
+		r600_bytecode_add_alu(ctx->bc, &insn);
+	}
+}
+
+/*
+ * Build the copy shader that accompanies a geometry shader and attach it to
+ * gs->gs_copy_shader.
+ *
+ * The generated program is assembled as a TGSI_PROCESSOR_VERTEX shader. It
+ * fetches every GS output from R600_GS_RING_CONST_BUFFER (16 bytes per
+ * output, output i lands in GPR i + 1), hands `so` to emit_streamout(), and
+ * then exports the fetched values as position / parameter exports.
+ *
+ * rctx: context supplying chip class, family, ISA and screen capabilities.
+ * gs:   geometry shader whose output declarations are copied; receives the
+ *       new shader in gs->gs_copy_shader.
+ * so:   stream output description forwarded to emit_streamout().
+ *
+ * Returns 0 on success, -ENOMEM if the shader object cannot be allocated, or
+ * the non-zero error code of the first bytecode builder call that fails.
+ * Once allocated, the copy shader is stored in gs->gs_copy_shader even when
+ * an error is returned, so it is never leaked by this function.
+ */
+static int generate_gs_copy_shader(struct r600_context *rctx,
+				   struct r600_pipe_shader *gs,
+				   struct pipe_stream_output_info *so)
+{
+	struct r600_shader_ctx ctx = {};
+	struct r600_shader *gs_shader = &gs->shader;
+	struct r600_pipe_shader *cshader;
+	int ocnt = gs_shader->noutput;
+	struct r600_bytecode_alu alu;
+	struct r600_bytecode_vtx vtx;
+	struct r600_bytecode_output output;
+	struct r600_bytecode_cf *cf_jump, *cf_pop,
+		*last_exp_pos = NULL, *last_exp_param = NULL;
+	int i, r, next_clip_pos = 61, next_param = 0;
+
+	cshader = calloc(1, sizeof(struct r600_pipe_shader));
+	if (!cshader)
+		return -ENOMEM;
+
+	/* Transfer ownership right away so that the early error returns
+	 * below cannot leak the allocation. */
+	gs->gs_copy_shader = cshader;
+
+	memcpy(cshader->shader.output, gs_shader->output, ocnt *
+	       sizeof(struct r600_shader_io));
+
+	cshader->shader.noutput = ocnt;
+
+	ctx.shader = &cshader->shader;
+	ctx.bc = &ctx.shader->bc;
+	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;
+
+	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
+			   rctx->screen->has_compressed_msaa_texturing);
+
+	ctx.bc->isa = rctx->isa;
+
+	/* R0.x = R0.x & 0x3fffffff */
+	memset(&alu, 0, sizeof(alu));
+	alu.op = ALU_OP2_AND_INT;
+	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[1].value = 0x3fffffff;
+	alu.dst.write = 1;
+	r = r600_bytecode_add_alu(ctx.bc, &alu);
+	if (r)
+		return r;
+
+	/* R0.y = R0.x >> 30 */
+	memset(&alu, 0, sizeof(alu));
+	alu.op = ALU_OP2_LSHR_INT;
+	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+	alu.src[1].value = 0x1e;
+	alu.dst.chan = 1;
+	alu.dst.write = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu(ctx.bc, &alu);
+	if (r)
+		return r;
+
+	/* PRED_SETE_INT __, R0.y, 0 */
+	memset(&alu, 0, sizeof(alu));
+	alu.op = ALU_OP2_PRED_SETE_INT;
+	alu.src[0].chan = 1;
+	alu.src[1].sel = V_SQ_ALU_SRC_0;
+	alu.execute_mask = 1;
+	alu.update_pred = 1;
+	alu.last = 1;
+	r = r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
+	if (r)
+		return r;
+
+	/* cf_last is only meaningful if the CF instruction was really added;
+	 * its target address is patched once the matching POP exists. */
+	r = r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
+	if (r)
+		return r;
+	cf_jump = ctx.bc->cf_last;
+
+	/* fetch vertex data from GSVS ring */
+	for (i = 0; i < ocnt; ++i) {
+		struct r600_shader_io *out = &ctx.shader->output[i];
+		out->gpr = i + 1;
+		out->ring_offset = i * 16;
+
+		memset(&vtx, 0, sizeof(vtx));
+		vtx.op = FETCH_OP_VFETCH;
+		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
+		vtx.fetch_type = 2;
+		vtx.offset = out->ring_offset;
+		vtx.dst_gpr = out->gpr;
+		vtx.dst_sel_x = 0;
+		vtx.dst_sel_y = 1;
+		vtx.dst_sel_z = 2;
+		vtx.dst_sel_w = 3;
+		if (rctx->b.chip_class >= EVERGREEN) {
+			vtx.use_const_fields = 1;
+		} else {
+			vtx.data_format = FMT_32_32_32_32_FLOAT;
+		}
+
+		r = r600_bytecode_add_vtx(ctx.bc, &vtx);
+		if (r)
+			return r;
+	}
+
+	/* XXX handle clipvertex, streamout? */
+	/* NOTE(review): the result is deliberately still ignored here, as it
+	 * was before; decide whether a streamout failure should fail the
+	 * whole copy shader. */
+	(void)emit_streamout(&ctx, so);
+
+	/* export vertex data */
+	/* XXX factor out common code with r600_shader_from_tgsi ? */
+	for (i = 0; i < ocnt; ++i) {
+		struct r600_shader_io *out = &ctx.shader->output[i];
+
+		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
+			continue;
+
+		/* Default: full XYZW parameter export; the switch below
+		 * overrides type, base and swizzles per semantic. */
+		memset(&output, 0, sizeof(output));
+		output.gpr = out->gpr;
+		output.elem_size = 3;
+		output.swizzle_x = 0;
+		output.swizzle_y = 1;
+		output.swizzle_z = 2;
+		output.swizzle_w = 3;
+		output.burst_count = 1;
+		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+		output.op = CF_OP_EXPORT;
+		switch (out->name) {
+		case TGSI_SEMANTIC_POSITION:
+			output.array_base = 60;
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+			break;
+
+		case TGSI_SEMANTIC_PSIZE:
+			output.array_base = 61;
+			if (next_clip_pos == 61)
+				next_clip_pos = 62;
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+			output.swizzle_y = 7;
+			output.swizzle_z = 7;
+			output.swizzle_w = 7;
+			ctx.shader->vs_out_misc_write = 1;
+			ctx.shader->vs_out_point_size = 1;
+			break;
+		case TGSI_SEMANTIC_LAYER:
+			if (out->spi_sid) {
+				/* duplicate it as PARAM to pass to the pixel shader */
+				output.array_base = next_param++;
+				r = r600_bytecode_add_output(ctx.bc, &output);
+				if (r)
+					return r;
+				last_exp_param = ctx.bc->cf_last;
+			}
+			output.array_base = 61;
+			if (next_clip_pos == 61)
+				next_clip_pos = 62;
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+			output.swizzle_x = 7;
+			output.swizzle_y = 7;
+			output.swizzle_z = 0;
+			output.swizzle_w = 7;
+			ctx.shader->vs_out_misc_write = 1;
+			ctx.shader->vs_out_layer = 1;
+			break;
+		case TGSI_SEMANTIC_VIEWPORT_INDEX:
+			if (out->spi_sid) {
+				/* duplicate it as PARAM to pass to the pixel shader */
+				output.array_base = next_param++;
+				r = r600_bytecode_add_output(ctx.bc, &output);
+				if (r)
+					return r;
+				last_exp_param = ctx.bc->cf_last;
+			}
+			output.array_base = 61;
+			if (next_clip_pos == 61)
+				next_clip_pos = 62;
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+			ctx.shader->vs_out_misc_write = 1;
+			ctx.shader->vs_out_viewport = 1;
+			output.swizzle_x = 7;
+			output.swizzle_y = 7;
+			output.swizzle_z = 7;
+			output.swizzle_w = 0;
+			break;
+		case TGSI_SEMANTIC_CLIPDIST:
+			/* spi_sid is 0 for clipdistance outputs that were generated
+			 * for clipvertex - we don't need to pass them to PS */
+			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
+			if (out->spi_sid) {
+				/* duplicate it as PARAM to pass to the pixel shader */
+				output.array_base = next_param++;
+				r = r600_bytecode_add_output(ctx.bc, &output);
+				if (r)
+					return r;
+				last_exp_param = ctx.bc->cf_last;
+			}
+			output.array_base = next_clip_pos++;
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+			break;
+		case TGSI_SEMANTIC_FOG:
+			/* NOTE(review): array_base is left at 0 here instead of
+			 * taking next_param++ like the default case - confirm
+			 * this is intended. */
+			output.swizzle_y = 4; /* 0 */
+			output.swizzle_z = 4; /* 0 */
+			output.swizzle_w = 5; /* 1 */
+			break;
+		default:
+			output.array_base = next_param++;
+			break;
+		}
+		r = r600_bytecode_add_output(ctx.bc, &output);
+		if (r)
+			return r;
+		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
+			last_exp_param = ctx.bc->cf_last;
+		else
+			last_exp_pos = ctx.bc->cf_last;
+	}
+
+	/* No position export was emitted above: add a fully masked one so
+	 * that there is a position export to mark as EXPORT_DONE. */
+	if (!last_exp_pos) {
+		memset(&output, 0, sizeof(output));
+		output.gpr = 0;
+		output.elem_size = 3;
+		output.swizzle_x = 7;
+		output.swizzle_y = 7;
+		output.swizzle_z = 7;
+		output.swizzle_w = 7;
+		output.burst_count = 1;
+		output.op = CF_OP_EXPORT;
+		output.array_base = 60;
+		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+		r = r600_bytecode_add_output(ctx.bc, &output);
+		if (r)
+			return r;
+		last_exp_pos = ctx.bc->cf_last;
+	}
+
+	/* Same for parameter exports. */
+	if (!last_exp_param) {
+		memset(&output, 0, sizeof(output));
+		output.gpr = 0;
+		output.elem_size = 3;
+		output.swizzle_x = 7;
+		output.swizzle_y = 7;
+		output.swizzle_z = 7;
+		output.swizzle_w = 7;
+		output.burst_count = 1;
+		output.op = CF_OP_EXPORT;
+		output.array_base = next_param++;
+		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+		r = r600_bytecode_add_output(ctx.bc, &output);
+		if (r)
+			return r;
+		last_exp_param = ctx.bc->cf_last;
+	}
+
+	last_exp_pos->op = CF_OP_EXPORT_DONE;
+	last_exp_param->op = CF_OP_EXPORT_DONE;
+
+	r = r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
+	if (r)
+		return r;
+	cf_pop = ctx.bc->cf_last;
+
+	/* Both the JUMP and the POP target the same address, derived from
+	 * the POP's id. */
+	cf_jump->cf_addr = cf_pop->id + 2;
+	cf_jump->pop_count = 1;
+	cf_pop->cf_addr = cf_pop->id + 2;
+	cf_pop->pop_count = 1;
+
+	if (ctx.bc->chip_class == CAYMAN) {
+		r = cm_bytecode_add_cf_end(ctx.bc);
+		if (r)
+			return r;
+	} else {
+		r = r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
+		if (r)
+			return r;
+		ctx.bc->cf_last->end_of_program = 1;
+	}
+
+	ctx.bc->nstack = 1;
+	cshader->shader.ring_item_size = ocnt * 16;
+
+	return r600_bytecode_build(ctx.bc);
+}
+
+/*
+ * Emit one CF_OP_MEM_RING write per shader output and advance
+ * ctx->gs_next_vertex.
+ *
+ * ctx: shader context; reads shader->output[], gs_for_vs,
+ *      gs_out_ring_offset, gs_next_vertex and gs_export_gpr_treg.
+ * ind: when true, writes use the WRITE_IND export type indexed by
+ *      ctx->gs_export_gpr_treg, and that register is advanced afterwards;
+ *      when false, the offset of the current vertex
+ *      (gs_out_ring_offset * gs_next_vertex) is folded into array_base.
+ *
+ * Returns 0 on success or the error code of the first bytecode builder call
+ * that fails; in that case ctx->gs_next_vertex is left unchanged.
+ */
+static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
+{
+	struct r600_bytecode_output output;
+	int i, k, r, ring_offset;
+
+	for (i = 0; i < ctx->shader->noutput; i++) {
+		struct r600_shader_io *out = &ctx->shader->output[i];
+
+		if (ctx->gs_for_vs) {
+			/* for ES we need to lookup corresponding ring offset expected by GS
+			 * (map this output to GS input by name and sid) */
+			/* FIXME precompute offsets */
+			ring_offset = -1;
+			/* No early exit: if several inputs match, the last
+			 * one determines the offset. */
+			for (k = 0; k < ctx->gs_for_vs->ninput; ++k) {
+				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
+				if (in->name == out->name && in->sid == out->sid)
+					ring_offset = in->ring_offset;
+			}
+
+			/* Output has no matching GS input: nothing to write. */
+			if (ring_offset == -1)
+				continue;
+		} else {
+			ring_offset = i * 16;
+		}
+
+		/* next_ring_offset after parsing input decls contains total size of
+		 * single vertex data, gs_next_vertex - current vertex index */
+		if (!ind)
+			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
+
+		memset(&output, 0, sizeof(struct r600_bytecode_output));
+		output.gpr = out->gpr;
+		output.elem_size = 3;
+		output.comp_mask = 0xF;
+		output.burst_count = 1;
+		output.op = CF_OP_MEM_RING;
+		output.array_base = ring_offset >> 2; /* in dwords */
+
+		if (ind) {
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
+			output.array_size = 0xfff;
+			output.index_gpr = ctx->gs_export_gpr_treg;
+		} else {
+			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+		}
+
+		r = r600_bytecode_add_output(ctx->bc, &output);
+		if (r)
+			return r;
+	}
+
+	if (ind) {
+		struct r600_bytecode_alu alu;
+
+		/* Advance the index register by gs_out_ring_offset >> 4 so
+		 * the next call's indirect writes use the updated index. */
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.op = ALU_OP2_ADD_INT;
+		alu.src[0].sel = ctx->gs_export_gpr_treg;
+		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
+		alu.dst.sel = ctx->gs_export_gpr_treg;
+		alu.dst.write = 1;
+		alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
+	++ctx->gs_next_vertex;
+	return 0;
+}
+
+static int r600_shader_from_tgsi(struct r600_context *rctx,