/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "r600_sq.h"
#include "r600_formats.h"
#include "r600_opcodes.h"
#include "r600_shader.h"
#include "r600_dump.h"
#include "r600d.h"
#include "sfn/sfn_nir.h"

#include "sb/sb_public.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "nir/tgsi_to_nir.h"
#include "nir/nir_to_tgsi_info.h"
#include "compiler/nir/nir.h"
#include "util/u_bitcast.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include <stdio.h>
#include <errno.h>
/* CAYMAN notes
Why CAYMAN got loops for lots of instructions is explained here.

These 8xx t-slot only ops are implemented in all vector slots:
MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
These 8xx t-slot only opcodes become vector ops, with all four
slots expecting the arguments on sources a and b. The result is
broadcast to all channels.
MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
These 8xx t-slot only opcodes become vector ops in the z, y, and
x slots.
EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
SQRT_IEEE/_64
SIN/COS
The w slot may have an independent co-issued operation, or if the
result is required to be in the w slot, the opcode above may be
issued in the w slot as well.
The compiler must issue the source argument to slots z, y, and x.
*/
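
/* Illustrative sketch (not part of the translator itself): on CAYMAN a
 * t-slot-only op such as MULLO_INT is replicated across the x..w vector
 * slots, with only the slot matching the wanted destination channel
 * actually writing its result:
 *
 *   for (i = 0; i < 4; i++) {            // one copy of the op per slot
 *       alu.op = ALU_OP2_MULLO_INT;
 *       alu.dst.chan = i;
 *       alu.dst.write = (i == dst_chan); // keep just the wanted channel
 *       alu.last = (i == 3);
 *       r600_bytecode_add_alu(ctx->bc, &alu);
 *   }
 *
 * single_alu_op2() below emits exactly this pattern for MULLO_INT.
 */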

/* Contents of r0 on entry to various shaders

 VS - .x = VertexID
      .y = RelVertexID (??)
      .w = InstanceID

 GS - r0.xyw, r1.xyz = per-vertex offsets
      r0.z = PrimitiveID

 TCS - .x = PatchID
       .y = RelPatchID (??)
       .z = InvocationID
       .w = tess factor base

 TES - .x = TessCoord.x
       .y = TessCoord.y
       .z = RelPatchID (??)
       .w = PrimitiveID

 PS - face_gpr.z = SampleMask
      face_gpr.w = SampleID
*/
#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key);

static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
			       int size, unsigned comp_mask)
{
	if (!size)
		return;

	if (ps->num_arrays == ps->max_arrays) {
		ps->max_arrays += 64;
		ps->arrays = realloc(ps->arrays, ps->max_arrays *
				     sizeof(struct r600_shader_array));
	}

	int n = ps->num_arrays;
	++ps->num_arrays;

	ps->arrays[n].comp_mask = comp_mask;
	ps->arrays[n].gpr_start = start_gpr;
	ps->arrays[n].gpr_count = size;
}

static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}

static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		ptr = r600_buffer_map_sync_with_rings(
			&rctx->b, shader->bo,
			PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
		if (R600_BIG_ENDIAN) {
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(shader->bo->buf);
	}

	return 0;
}

extern const struct nir_shader_compiler_options r600_nir_options;
static int nshader = 0;
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;

	int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
		tgsi_get_processor_type(sel->tokens) :
		pipe_shader_type_from_mesa(sel->nir->info.stage);

	bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB) &&
			  !(rscreen->b.debug_flags & DBG_NIR);
	unsigned sb_disasm;
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (!(rscreen->b.debug_flags & DBG_NIR)) {
		assert(sel->ir_type == PIPE_SHADER_IR_TGSI);
		r = r600_shader_from_tgsi(rctx, shader, key);
		if (r) {
			R600_ERR("translation from TGSI failed!\n");
			goto error;
		}
	} else {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
			/* Lower int64 ops because we have some r600 built-in shaders that use them */
			if (!ctx->screen->get_param(ctx->screen, PIPE_CAP_DOUBLES)) {
				NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
				NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);
				NIR_PASS_V(sel->nir, nir_lower_int64);
				NIR_PASS_V(sel->nir, nir_opt_vectorize);
			}
			NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false, false);
		}
		nir_tgsi_scan_shader(sel->nir, &sel->info, true);

		r = r600_shader_from_nir(rctx, shader, &key);
		if (r) {
			fprintf(stderr, "--Failed shader--------------------------------------------------\n");

			if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
				fprintf(stderr, "--TGSI--------------------------------------------------------\n");
				tgsi_dump(sel->tokens, 0);
			}

			if (rscreen->b.debug_flags & DBG_NIR) {
				fprintf(stderr, "--NIR --------------------------------------------------------\n");
				nir_print_shader(sel->nir, stderr);
			}

			R600_ERR("translation from NIR failed!\n");
			goto error;
		}
	}

	if (dump) {
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			fprintf(stderr, "--TGSI--------------------------------------------------------\n");
			tgsi_dump(sel->tokens, 0);
		}

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}

	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;
	use_sb &= !shader->shader.uses_helper_invocation;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed!\n");
			goto error;
		}
	}

	sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed!\n");
			goto error;
		}
	}

	if (dump) {
		FILE *f;
		char fname[1024];
		snprintf(fname, 1024, "shader_from_%s_%d.cpp",
			 (sel->ir_type == PIPE_SHADER_IR_TGSI ?
				(rscreen->b.debug_flags & DBG_NIR ? "tgsi-nir" : "tgsi")
				: "nir"), nshader);
		f = fopen(fname, "w");
		/* print the same shader id to both sinks, then advance it */
		print_shader_info(f, nshader, &shader->shader);
		print_shader_info(stderr, nshader++, &shader->shader);
		print_pipe_info(stderr, &sel->info);
		if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
			fprintf(f, "/****TGSI**********************************\n");
			tgsi_dump_to_file(sel->tokens, 0, f);
		}

		if (rscreen->b.debug_flags & DBG_NIR) {
			fprintf(f, "/****NIR **********************************\n");
			nir_print_shader(sel->nir, f);
		}
		fprintf(f, "******************************************/\n");
		fclose(f);
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}

void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	if (shader->shader.bc.cf.next)
		r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}

/*
 * tgsi -> r600 shader
 */
struct r600_shader_tgsi_instruction;

struct r600_shader_src {
	unsigned	sel;
	unsigned	swizzle[4];
	unsigned	neg;
	unsigned	abs;
	unsigned	rel;
	unsigned	kc_bank;
	boolean		kc_rel; /* true if cache bank is indexed */
	uint32_t	value[4];
};

struct eg_interp {
	boolean		enabled;
	unsigned	ij_index;
};

struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_array_info			*array_infos;
	/* flag for each tgsi temp array telling whether it has been spilled */
	bool					*spilled_arrays;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;
	unsigned				file_offset[TGSI_FILE_COUNT];
	unsigned				temp_reg;
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp			eg_interpolators[6]; /* indexed by Persp/Linear * 3 + sample/center/centroid */
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;
	unsigned				edgeflag_output;
	int					helper_invoc_reg;
	int					cs_block_size_reg;
	int					cs_grid_size_reg;
	bool					cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input;
	int					next_ring_offset;
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int					gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned				tess_input_info;  /* temp with tess input offsets */
	unsigned				tess_output_info; /* temp with tess output offsets */
	unsigned				thread_id_gpr;    /* temp with thread id calculated for images */
};

struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};

static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
static int tgsi_else(struct r600_shader_ctx *ctx);
static int tgsi_endif(struct r600_shader_ctx *ctx);
static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
static int tgsi_endloop(struct r600_shader_ctx *ctx);
static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg);
static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
			      const struct r600_shader_src *shader_src,
			      unsigned chan);
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask);

static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
{
	if (ctx->bc->family == CHIP_HEMLOCK ||
	    ctx->bc->family == CHIP_CYPRESS ||
	    ctx->bc->family == CHIP_JUNIPER)
		return false;
	return true;
}

static int tgsi_last_instruction(unsigned writemask)
{
	int i, lasti = 0;

	for (i = 0; i < 4; i++) {
		if (writemask & (1 << i)) {
			lasti = i;
		}
	}
	return lasti;
}
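
/* Example (illustrative): writemask 0x5 (x and z written) makes
 * tgsi_last_instruction() return 2, so per-channel emission loops can run
 * chan = 0..lasti and skip the unused w channel entirely. */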

static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				/* fallthrough */
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				/* fallthrough */
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}

int eg_get_interpolator_index(unsigned interpolate, unsigned location)
{
	if (interpolate == TGSI_INTERPOLATE_COLOR ||
	    interpolate == TGSI_INTERPOLATE_LINEAR ||
	    interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
		int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
		int loc;

		switch (location) {
		case TGSI_INTERPOLATE_LOC_CENTER:
			loc = 1;
			break;
		case TGSI_INTERPOLATE_LOC_CENTROID:
			loc = 2;
			break;
		case TGSI_INTERPOLATE_LOC_SAMPLE:
		default:
			loc = 0;
			break;
		}

		return is_linear * 3 + loc;
	}

	return -1;
}
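
/* Resulting mapping (illustrative):
 *   persp/color + sample   -> 0      linear + sample   -> 3
 *   persp/color + center   -> 1      linear + center   -> 4
 *   persp/color + centroid -> 2      linear + centroid -> 5
 * i.e. index = is_linear * 3 + loc, matching the six entries of
 * ctx->eg_interpolators[]. */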

static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}

static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
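
/* Worked example (illustrative): ij_index = 3 gives gpr = 1 and
 * base_chan = 3, so the i/j interpolants are read from R1.zw; the eight
 * INTERP_ZW/INTERP_XY ops above then produce the four interpolated
 * channels, and only iterations 2..5 actually write the destination GPR. */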

static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_INTERP_LOAD_P0;

		alu.dst.sel = ctx->shader->input[input].gpr;
		alu.dst.write = 1;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[0].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}

/*
 * Special export handling in shaders
 *
 * shader export ARRAY_BASE for EXPORT_POS:
 * 60 is position
 * 61 is misc vector
 * 62, 63 are clip distance vectors
 *
 * The use of the values exported in 61-63 is controlled by PA_CL_VS_OUT_CNTL:
 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
 * exclusive with render target index)
 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
 *
 *
 * shader export ARRAY_BASE for EXPORT_PIXEL:
 * 0-7 CB targets
 * 61 computed Z vector
 *
 * The use of the values exported in the computed Z vector is controlled
 * by DB_SHADER_CONTROL:
 * Z_EXPORT_ENABLE - Z as a float in RED
 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
 * DB_SOURCE_FORMAT - export control restrictions
 *
 */


/* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
static int r600_spi_sid(struct r600_shader_io * io)
{
	int index, name = io->name;

	/* These params are handled differently, they don't need
	 * semantic indices, so we'll use 0 for them.
	 */
	if (name == TGSI_SEMANTIC_POSITION ||
	    name == TGSI_SEMANTIC_PSIZE ||
	    name == TGSI_SEMANTIC_EDGEFLAG ||
	    name == TGSI_SEMANTIC_FACE ||
	    name == TGSI_SEMANTIC_SAMPLEMASK)
		index = 0;
	else {
		if (name == TGSI_SEMANTIC_GENERIC) {
			/* For generic params simply use sid from tgsi */
			index = 9 + io->sid;
		} else if (name == TGSI_SEMANTIC_TEXCOORD) {
			index = io->sid;
		} else {
			/* For non-generic params - pack name and sid into 8 bits */
			index = 0x80 | (name << 3) | (io->sid);
		}

		/* Make sure that all really used indices have nonzero value, so
		 * we can just compare it to 0 later instead of comparing the name
		 * with different values to detect special cases. */
		index++;
	}

	return index;
}
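
/* Worked examples (illustrative):
 *   TGSI_SEMANTIC_POSITION          -> 0 (needs no semantic index)
 *   TGSI_SEMANTIC_GENERIC, sid = 5  -> 9 + 5, then +1  = 15
 *   TGSI_SEMANTIC_TEXCOORD, sid = 2 -> 2, then +1      = 3
 *   other semantics with sid s      -> (0x80 | (name << 3) | s) + 1
 * The final increment keeps every really-used index nonzero, as the
 * comment above explains. */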

/* we need this to get a common lds index for vs/tcs/tes input/outputs */
int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_TEXCOORD:
		return 4 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63 - 4)
			return 4 + index;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}
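
/* Worked examples (illustrative): POSITION -> 0, PSIZE -> 1,
 * CLIPDIST[0..1] -> 2..3, GENERIC[n]/TEXCOORD[n] -> 4 + n, while the
 * per-patch semantics use their own namespace: TESSOUTER -> 0,
 * TESSINNER -> 1, PATCH[n] -> 2 + n. Callers later scale these indices
 * by 16 (bytes per vec4) when forming LDS offsets. */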

/* turn input into interpolate on EG */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			r = evergreen_interp_alu(ctx, index);
		} else {
			r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}

static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i == 3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}

/* execute a single slot ALU calculation */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
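
/* Usage sketch (illustrative, hypothetical register numbers): emit
 * dst_reg.x = R5.y + 8 as one scalar ALU op; when op is ALU_OP2_MULLO_INT
 * on CAYMAN the helper applies the four-slot replication shown above
 * instead:
 *
 *   r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
 *                      dst_reg, 0,
 *                      5, 1,
 *                      V_SQ_ALU_SRC_LITERAL, 8);
 */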

/* execute a single slot ALU calculation */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

/* put it in temp_reg.x */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* One MULADD_UINT24 computes
	 *   temp.x = patch_stride (input_vals.x) * rel_patch_id (r0.y in a TCS)
	 *            + patch0_data_offset (input_vals.w) for per-patch variables,
	 *              or patch0_offset (input_vals.z) otherwise. */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}
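
/* Worked example (illustrative numbers): with patch_stride = 0x30,
 * rel_patch_id = 2 and patch0_offset = 0x100, the non-patch-var case
 * yields temp.x = 0x30 * 2 + 0x100 = 0x160, the base offset of patch 2. */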

static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
{
	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
}

static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}

static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
{
	int i;
	i = ctx->shader->noutput++;
	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
	ctx->shader->output[i].sid = 0;
	ctx->shader->output[i].gpr = 0;
	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
	ctx->shader->output[i].write_mask = 0x4;
	ctx->shader->output[i].spi_sid = prim_id_sid;

	return 0;
}

static int tgsi_barrier(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.last = 1;

	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}

static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
{
	/* Pick the largest array and spill it; repeat until the number of
	 * temps is under the limit or we run out of arrays. */
	unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
	unsigned narrays_left = n;
	bool *spilled = ctx->spilled_arrays; /* assumed zero-initialized (calloc'ed) */

	*scratch_space_needed = 0;
	while (*regno > 124 && narrays_left) {
		unsigned i;
		unsigned largest = 0;
		unsigned largest_index = 0;

		for (i = 0; i < n; i++) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
			if (!spilled[i] && size > largest) {
				largest = size;
				largest_index = i;
			}
		}

		spilled[largest_index] = true;
		*regno -= largest;
		*scratch_space_needed += largest;

		narrays_left--;
	}

	if (narrays_left == 0) {
		ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
	}
}
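
/* Worked example (illustrative): *regno = 130 with live arrays of sizes
 * {16, 8, 4} spills the 16-entry array first (regno -> 114, scratch += 16)
 * and then stops, since 114 <= 124; the smaller arrays stay in GPRs. */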

/* Translate a TGSI register index either to an r600 GPR index
 * (*spilled = false) or to a vec4 offset into the scratch spill area
 * (*spilled = true), taking spilled temp arrays into account. */
static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
{
	unsigned i;
	unsigned spilled_size = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
			if (ctx->spilled_arrays[i]) {
				/* vec4 index into spilled scratch memory */
				*spilled = true;
				return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
			}
			else {
				/* regular GPR array */
				*spilled = false;
				return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
			}
		}

		if (tgsi_reg_index < ctx->array_infos[i].range.First)
			break;
		if (ctx->spilled_arrays[i]) {
			spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
		}
	}

	/* regular GPR index, minus the holes from spilled arrays */
	*spilled = false;

	return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
}

/* look up spill area base offset and array size for a spilled temp array */
static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
					    unsigned *array_base, unsigned *array_size)
{
	unsigned i;
	unsigned offset = 0;

	for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
		if (ctx->spilled_arrays[i]) {
			unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;

			if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
				*array_base = offset;
				*array_size = size - 1; /* hw counts from 1 */

				return;
			}

			offset += size;
		}
	}
}
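
/* Worked example (illustrative): with two spilled arrays of sizes 8 and 4,
 * a register in the second array gets *array_base = 8 and *array_size = 3
 * (the hardware field holds size - 1). */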

static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				bool spilled;
				unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
					d->Range.First,
					&spilled);

				if (!spilled) {
					r600_add_gpr_array(ctx->shader, idx,
						d->Range.Last - d->Range.First + 1, 0x0F);
				}
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		break;

	case TGSI_FILE_HW_ATOMIC:
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r1.x (r1.x holds a copy of r0.x) */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}

static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			    inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions; currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
					    d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	if (ctx->info.reads_samplemask &&
	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
		inputs[1].enabled = true;
	}

	if (ctx->bc->chip_class >= EVERGREEN) {
		int num_baryc = 0;
		/* assign gpr to each interpolator according to priority */
		for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
			if (ctx->eg_interpolators[i].enabled) {
				ctx->eg_interpolators[i].ij_index = num_baryc;
				num_baryc++;
			}
		}
		num_baryc = (num_baryc + 1) >> 1;
		gpr_offset += num_baryc;
	}

	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			/* add to inputs, allocate a gpr */
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
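
/* Worked example (illustrative): with persp-center, persp-centroid and
 * linear-center enabled, ij_index 0..2 are assigned in priority order and
 * (3 + 1) >> 1 = 2 GPRs get reserved, since each GPR packs two i/j pairs. */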

/*
 * For evergreen we need to scan the shader to find the number of GPRs we need
 * to reserve for interpolation and system values.
 *
 * We need to know if we are going to emit any sample or centroid inputs, and
 * whether perspective and linear interpolation are required.
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/*
	 * Could get this information from the shader info. But right now
	 * we interpolate all declared inputs, whereas the shader info will
	 * only contain the bits if the inputs are actually used, so it might
	 * not be safe...
	 */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	return allocate_system_value_inputs(ctx, 0);
}

/* sample_id == NULL means fetch for the current sample */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		assert(ctx->fixed_pt_position_gpr != -1);

		vtx.src_gpr = ctx->fixed_pt_position_gpr; /* SAMPLEID is in .w */
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}

static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* Set the register to all ones, then do a vtx fetch in valid-pixel
	 * mode; only helper invocations keep the 0xffffffff marker. */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* do a vtx fetch in VPM mode */
	struct r600_bytecode_vtx vtx;
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = 0;
	vtx.mega_fetch_count = 16; /* no idea here really... */
	vtx.dst_gpr = ctx->helper_invoc_reg;
	vtx.dst_sel_x = 4; /* SEL_0: valid pixels get 0 written */
	vtx.dst_sel_y = 7; /* SEL_Y */
	vtx.dst_sel_z = 7; /* SEL_Z */
	vtx.dst_sel_w = 7; /* SEL_W */
	vtx.data_format = FMT_32;
	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}

static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* Set the register to all ones... */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* ...then overwrite it with 0 in an ALU clause that only runs for
	 * valid pixels, so helper invocations keep the marker. */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
	if (r)
		return r;

	return ctx->helper_invoc_reg;
}

static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	/* return the cached register for whichever value was requested,
	 * if it has already been fetched */
	if (load_block && ctx->cs_block_size_loaded)
		return ctx->cs_block_size_reg;
	if (!load_block && ctx->cs_grid_size_loaded)
		return ctx->cs_grid_size_reg;

	t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
	struct r600_bytecode_alu alu;
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.sel = t1;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = t1;
	vtx.src_sel_x = 0;

	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 7;
	vtx.data_format = FMT_32_32_32_32;
	vtx.num_format_all = 1;
	vtx.format_comp_all = 0;
	vtx.use_const_fields = 0;
	vtx.offset = load_block ? 0 : 16; /* block size at offset 0, grid size at offset 16 */
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	if (load_block)
		ctx->cs_block_size_loaded = true;
	else
		ctx->cs_grid_size_loaded = true;
	return t1;
}

static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
		bool spilled;
		unsigned idx;

		idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);

		if (spilled) {
			int reg = r600_get_temp(ctx);
			int r;

			r600_src->sel = reg;

			if (ctx->bc->chip_class < R700) {
				struct r600_bytecode_output cf;

				memset(&cf, 0, sizeof(struct r600_bytecode_output));
				cf.op = CF_OP_MEM_SCRATCH;
				cf.elem_size = 3;
				cf.gpr = reg;
				cf.comp_mask = 0xF;
				cf.swizzle_x = 0;
				cf.swizzle_y = 1;
				cf.swizzle_z = 2;
				cf.swizzle_w = 3;
				cf.burst_count = 1;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&cf.array_base, &cf.array_size);

				if (tgsi_src->Register.Indirect) {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
					cf.index_gpr = ctx->bc->ar_reg;
				}
				else {
					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
					cf.array_base += idx;
					cf.array_size = 0;
				}

				r = r600_bytecode_add_output(ctx->bc, &cf);
			}
			else {
				struct r600_bytecode_vtx vtx;

				if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
					r600_bytecode_need_wait_ack(ctx->bc, false);
					r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
				}

				memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
				vtx.op = FETCH_OP_READ_SCRATCH;
				vtx.dst_gpr = reg;
				vtx.uncached = 1; /* must bypass cache since prior spill was written in the same invocation */
				vtx.elem_size = 3;
				vtx.data_format = FMT_32_32_32_32;
				vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
				vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
				vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
				vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
				vtx.dst_sel_w = tgsi_src->Register.SwizzleW;

				get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
					&vtx.array_base, &vtx.array_size);

				if (tgsi_src->Register.Indirect) {
					vtx.indexed = 1;
					vtx.src_gpr = ctx->bc->ar_reg;
				}
				else {
					vtx.array_base += idx;
					vtx.array_size = 0;
				}

				r = r600_bytecode_add_vtx(ctx->bc, &vtx);
			}

			if (r)
				return;
		}
		else {
			if (tgsi_src->Register.Indirect)
				r600_src->rel = V_SQ_REL_RELATIVE;

			r600_src->sel = idx;
		}

		return;
	}

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; /* Z value */
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; /* W value */
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			r600_src->sel = ctx->tess_input_info;
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1804
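/* Fetch a relatively addressed constant through the vertex cache: compute
 * AR + offset into a GPR and VFETCH the vector from constant buffer cb_idx
 * into dst_reg. */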
1805 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1806 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1807 unsigned int dst_reg)
1808 {
1809 struct r600_bytecode_vtx vtx;
1810 unsigned int ar_reg;
1811 int r;
1812
1813 if (offset) {
1814 struct r600_bytecode_alu alu;
1815
1816 memset(&alu, 0, sizeof(alu));
1817
1818 alu.op = ALU_OP2_ADD_INT;
1819 alu.src[0].sel = ctx->bc->ar_reg;
1820 alu.src[0].chan = ar_chan;
1821
1822 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1823 alu.src[1].value = offset;
1824
1825 alu.dst.sel = dst_reg;
1826 alu.dst.chan = ar_chan;
1827 alu.dst.write = 1;
1828 alu.last = 1;
1829
1830 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1831 return r;
1832
1833 ar_reg = dst_reg;
1834 } else {
1835 ar_reg = ctx->bc->ar_reg;
1836 }
1837
1838 memset(&vtx, 0, sizeof(vtx));
1839 vtx.buffer_id = cb_idx;
1840 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1841 vtx.src_gpr = ar_reg;
1842 vtx.src_sel_x = ar_chan;
1843 vtx.mega_fetch_count = 16;
1844 vtx.dst_gpr = dst_reg;
1845 vtx.dst_sel_x = 0; /* SEL_X */
1846 vtx.dst_sel_y = 1; /* SEL_Y */
1847 vtx.dst_sel_z = 2; /* SEL_Z */
1848 vtx.dst_sel_w = 3; /* SEL_W */
1849 vtx.data_format = FMT_32_32_32_32_FLOAT;
1850 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1851 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1852 vtx.endian = r600_endian_swap(32);
1853 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1854
1855 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1856 return r;
1857
1858 return 0;
1859 }
1860
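/* Fetch one per-vertex GS input from the ESGS ring. The dword offset of the
 * source vertex arrives in R0/R1 (see the comment below); indirect vertex
 * (Dimension) and register indices are folded into the fetch offset first. */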
1861 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1862 {
1863 struct r600_bytecode_vtx vtx;
1864 int r;
1865 unsigned index = src->Register.Index;
1866 unsigned vtx_id = src->Dimension.Index;
1867 int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
1868 int offset_chan = vtx_id % 3;
1869 int t2 = 0;
1870
1871 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1872 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1873
1874 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
1875 offset_chan = 3;
1876
1877 if (src->Dimension.Indirect || src->Register.Indirect)
1878 t2 = r600_get_temp(ctx);
1879
1880 if (src->Dimension.Indirect) {
1881 int treg[3];
1882 struct r600_bytecode_alu alu;
1883 int r, i;
1884 unsigned addr_reg;
1885 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1886 if (src->DimIndirect.Index > 0) {
1887 r = single_alu_op2(ctx, ALU_OP1_MOV,
1888 ctx->bc->ar_reg, 0,
1889 addr_reg, 0,
1890 0, 0);
1891 if (r)
1892 return r;
1893 }
1894 		/* We have to put R0.x/y/w into Rt.x, Rt+1.x and Rt+2.x, and then
1895 		   index relatively from Rt - at least this is what fglrx seems to do. */
1897 for (i = 0; i < 3; i++) {
1898 treg[i] = r600_get_temp(ctx);
1899 }
1900 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1901
1902 for (i = 0; i < 3; i++) {
1903 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1904 alu.op = ALU_OP1_MOV;
1905 alu.src[0].sel = ctx->gs_rotated_input[0];
1906 alu.src[0].chan = i == 2 ? 3 : i;
1907 alu.dst.sel = treg[i];
1908 alu.dst.chan = 0;
1909 alu.dst.write = 1;
1910 alu.last = 1;
1911 r = r600_bytecode_add_alu(ctx->bc, &alu);
1912 if (r)
1913 return r;
1914 }
1915 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1916 alu.op = ALU_OP1_MOV;
1917 alu.src[0].sel = treg[0];
1918 alu.src[0].rel = 1;
1919 alu.dst.sel = t2;
1920 alu.dst.write = 1;
1921 alu.last = 1;
1922 r = r600_bytecode_add_alu(ctx->bc, &alu);
1923 if (r)
1924 return r;
1925 offset_reg = t2;
1926 offset_chan = 0;
1927 }
1928
1929 if (src->Register.Indirect) {
1930 int addr_reg;
1931 unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1932
1933 addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1934
1935 /* pull the value from index_reg */
1936 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1937 t2, 1,
1938 addr_reg, 0,
1939 V_SQ_ALU_SRC_LITERAL, first);
1940 if (r)
1941 return r;
1942 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1943 t2, 0,
1944 t2, 1,
1945 V_SQ_ALU_SRC_LITERAL, 4,
1946 offset_reg, offset_chan);
1947 if (r)
1948 return r;
1949 offset_reg = t2;
1950 offset_chan = 0;
1951 index = src->Register.Index - first;
1952 }
1953
1954 memset(&vtx, 0, sizeof(vtx));
1955 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1956 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1957 vtx.src_gpr = offset_reg;
1958 vtx.src_sel_x = offset_chan;
1959 	vtx.offset = index * 16; /* bytes */
1960 vtx.mega_fetch_count = 16;
1961 vtx.dst_gpr = dst_reg;
1962 vtx.dst_sel_x = 0; /* SEL_X */
1963 vtx.dst_sel_y = 1; /* SEL_Y */
1964 vtx.dst_sel_z = 2; /* SEL_Z */
1965 vtx.dst_sel_w = 3; /* SEL_W */
1966 if (ctx->bc->chip_class >= EVERGREEN) {
1967 vtx.use_const_fields = 1;
1968 } else {
1969 vtx.data_format = FMT_32_32_32_32_FLOAT;
1970 }
1971
1972 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1973 return r;
1974
1975 return 0;
1976 }
1977
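/* GS: PrimitiveID reads come from R0.z, and vertex-indexed inputs are
 * replaced with temps filled from the ESGS ring via fetch_gs_input(). */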
1978 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1979 {
1980 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1981 unsigned i;
1982
1983 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1984 struct tgsi_full_src_register *src = &inst->Src[i];
1985
1986 if (src->Register.File == TGSI_FILE_INPUT) {
1987 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1988 /* primitive id is in R0.z */
1989 ctx->src[i].sel = 0;
1990 ctx->src[i].swizzle[0] = 2;
1991 }
1992 }
1993 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1994 int treg = r600_get_temp(ctx);
1995
1996 fetch_gs_input(ctx, src, treg);
1997 ctx->src[i].sel = treg;
1998 ctx->src[i].rel = 0;
1999 }
2000 }
2001 return 0;
2002 }
2003
2004
2005 /* Tessellation shaders pass outputs to the next shader using LDS.
2006 *
2007 * LS outputs = TCS(HS) inputs
2008 * TCS(HS) outputs = TES(DS) inputs
2009 *
2010 * The LDS layout is:
2011 * - TCS inputs for patch 0
2012 * - TCS inputs for patch 1
2013 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
2014 * - ...
2015 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
2016 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
2017 * - TCS outputs for patch 1
2018 * - Per-patch TCS outputs for patch 1
2019 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
2020 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
2021 * - ...
2022 *
2023 * All three shaders VS(LS), TCS, TES share the same LDS space.
2024 */
2025 /* this will return with the byte address in temp_reg.x */
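/* In other words, for a 2D register the address computed below works out to:
 *
 *   temp_reg.x = base                  (passed in temp_reg.x)
 *              + stride_bytes * index  (if Dimension: vertex index)
 *              + 16 * indirect_index   (if Indirect)
 *              + 16 * param            (LDS slot of the semantic)
 *
 * and do_lds_fetch_values() later adds 4 * channel per fetched component. */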
2026 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
2027 const struct tgsi_full_dst_register *dst,
2028 const struct tgsi_full_src_register *src,
2029 int stride_bytes_reg, int stride_bytes_chan)
2030 {
2031 struct tgsi_full_dst_register reg;
2032 ubyte *name, *index, *array_first;
2033 int r;
2034 int param;
2035 struct tgsi_shader_info *info = &ctx->info;
2036 /* Set the register description. The address computation is the same
2037 * for sources and destinations. */
2038 if (src) {
2039 reg.Register.File = src->Register.File;
2040 reg.Register.Index = src->Register.Index;
2041 reg.Register.Indirect = src->Register.Indirect;
2042 reg.Register.Dimension = src->Register.Dimension;
2043 reg.Indirect = src->Indirect;
2044 reg.Dimension = src->Dimension;
2045 reg.DimIndirect = src->DimIndirect;
2046 } else
2047 reg = *dst;
2048
2049 /* If the register is 2-dimensional (e.g. an array of vertices
2050 * in a primitive), calculate the base address of the vertex. */
2051 if (reg.Register.Dimension) {
2052 int sel, chan;
2053 if (reg.Dimension.Indirect) {
2054 unsigned addr_reg;
2055 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
2056
2057 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
2058 /* pull the value from index_reg */
2059 sel = addr_reg;
2060 chan = 0;
2061 } else {
2062 sel = V_SQ_ALU_SRC_LITERAL;
2063 chan = reg.Dimension.Index;
2064 }
2065
2066 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2067 temp_reg, 0,
2068 stride_bytes_reg, stride_bytes_chan,
2069 sel, chan,
2070 temp_reg, 0);
2071 if (r)
2072 return r;
2073 }
2074
2075 if (reg.Register.File == TGSI_FILE_INPUT) {
2076 name = info->input_semantic_name;
2077 index = info->input_semantic_index;
2078 array_first = info->input_array_first;
2079 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
2080 name = info->output_semantic_name;
2081 index = info->output_semantic_index;
2082 array_first = info->output_array_first;
2083 } else {
2084 assert(0);
2085 return -1;
2086 }
2087 if (reg.Register.Indirect) {
2088 int addr_reg;
2089 int first;
2090 /* Add the relative address of the element. */
2091 if (reg.Indirect.ArrayID)
2092 first = array_first[reg.Indirect.ArrayID];
2093 else
2094 first = reg.Register.Index;
2095
2096 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
2097
2098 /* pull the value from index_reg */
2099 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2100 temp_reg, 0,
2101 V_SQ_ALU_SRC_LITERAL, 16,
2102 addr_reg, 0,
2103 temp_reg, 0);
2104 if (r)
2105 return r;
2106
2107 param = r600_get_lds_unique_index(name[first],
2108 index[first]);
2109
2110 } else {
2111 param = r600_get_lds_unique_index(name[reg.Register.Index],
2112 index[reg.Register.Index]);
2113 }
2114
2115 /* add to base_addr - passed in temp_reg.x */
2116 if (param) {
2117 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2118 temp_reg, 0,
2119 temp_reg, 0,
2120 V_SQ_ALU_SRC_LITERAL, param * 16);
2121 if (r)
2122 return r;
2123
2124 }
2125 return 0;
2126 }
2127
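/* Read up to four dwords from LDS into dst_reg, in three passes: compute the
 * per-channel byte addresses, issue one LDS_READ_RET per channel, then MOV
 * the results off the LDS output queue (LDS_OQ_A_POP) in the same order. */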
2128 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
2129 unsigned dst_reg, unsigned mask)
2130 {
2131 struct r600_bytecode_alu alu;
2132 int r, i, lasti;
2133
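	/* Each ALU instruction is two dwords; force a new CF clause before the
	 * current one fills up (a clause holds at most 128 slots). */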
2134 	if ((ctx->bc->cf_last->ndw >> 1) >= 0x60)
2135 ctx->bc->force_add_cf = 1;
2136
2137 lasti = tgsi_last_instruction(mask);
2138 for (i = 1; i <= lasti; i++) {
2139 if (!(mask & (1 << i)))
2140 continue;
2141
2142 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2143 temp_reg, i,
2144 temp_reg, 0,
2145 V_SQ_ALU_SRC_LITERAL, 4 * i);
2146 if (r)
2147 return r;
2148 }
2149 for (i = 0; i <= lasti; i++) {
2150 if (!(mask & (1 << i)))
2151 continue;
2152
2153 /* emit an LDS_READ_RET */
2154 memset(&alu, 0, sizeof(alu));
2155 alu.op = LDS_OP1_LDS_READ_RET;
2156 alu.src[0].sel = temp_reg;
2157 alu.src[0].chan = i;
2158 alu.src[1].sel = V_SQ_ALU_SRC_0;
2159 alu.src[2].sel = V_SQ_ALU_SRC_0;
2160 alu.dst.chan = 0;
2161 alu.is_lds_idx_op = true;
2162 alu.last = 1;
2163 r = r600_bytecode_add_alu(ctx->bc, &alu);
2164 if (r)
2165 return r;
2166 }
2167 for (i = 0; i <= lasti; i++) {
2168 if (!(mask & (1 << i)))
2169 continue;
2170
2171 /* then read from LDS_OQ_A_POP */
2172 memset(&alu, 0, sizeof(alu));
2173
2174 alu.op = ALU_OP1_MOV;
2175 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
2176 alu.src[0].chan = 0;
2177 alu.dst.sel = dst_reg;
2178 alu.dst.chan = i;
2179 alu.dst.write = 1;
2180 alu.last = 1;
2181 r = r600_bytecode_add_alu(ctx->bc, &alu);
2182 if (r)
2183 return r;
2184 }
2185 return 0;
2186 }
2187
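/* Build an xyzw mask of the channels a source swizzle actually reads, so the
 * LDS fetch only loads what is needed. */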
2188 static int fetch_mask(struct tgsi_src_register *reg)
2189 {
2190 int mask = 0;
2191 mask |= 1 << reg->SwizzleX;
2192 mask |= 1 << reg->SwizzleY;
2193 mask |= 1 << reg->SwizzleZ;
2194 mask |= 1 << reg->SwizzleW;
2195 return mask;
2196 }
2197
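/* The next three helpers load TES inputs, TCS inputs and TCS outputs from
 * LDS: compute the patch/vertex base offset, add the per-register byte
 * address, then fetch the required channels. */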
2198 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2199 {
2200 int r;
2201 unsigned temp_reg = r600_get_temp(ctx);
2202
2203 r = get_lds_offset0(ctx, 2, temp_reg,
2204 			    !src->Register.Dimension);
2205 if (r)
2206 return r;
2207
2208 /* the base address is now in temp.x */
2209 r = r600_get_byte_address(ctx, temp_reg,
2210 NULL, src, ctx->tess_output_info, 1);
2211 if (r)
2212 return r;
2213
2214 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2215 if (r)
2216 return r;
2217 return 0;
2218 }
2219
2220 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2221 {
2222 int r;
2223 unsigned temp_reg = r600_get_temp(ctx);
2224
2225 	/* t.x = input patch stride (ips) * RelPatchID (r0.y) */
2226 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2227 temp_reg, 0,
2228 ctx->tess_input_info, 0,
2229 0, 1);
2230
2231 if (r)
2232 return r;
2233
2234 /* the base address is now in temp.x */
2235 r = r600_get_byte_address(ctx, temp_reg,
2236 NULL, src, ctx->tess_input_info, 1);
2237 if (r)
2238 return r;
2239
2240 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2241 if (r)
2242 return r;
2243 return 0;
2244 }
2245
2246 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2247 {
2248 int r;
2249 unsigned temp_reg = r600_get_temp(ctx);
2250
2251 r = get_lds_offset0(ctx, 1, temp_reg,
2252 			    !src->Register.Dimension);
2253 if (r)
2254 return r;
2255 /* the base address is now in temp.x */
2256 r = r600_get_byte_address(ctx, temp_reg,
2257 NULL, src,
2258 ctx->tess_output_info, 1);
2259 if (r)
2260 return r;
2261
2262 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2263 if (r)
2264 return r;
2265 return 0;
2266 }
2267
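/* Replace LDS-resident source operands (TES inputs, TCS inputs/outputs) with
 * temps filled via the fetch helpers above. */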
2268 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2269 {
2270 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2271 unsigned i;
2272
2273 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2274 struct tgsi_full_src_register *src = &inst->Src[i];
2275
2276 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2277 int treg = r600_get_temp(ctx);
2278 fetch_tes_input(ctx, src, treg);
2279 ctx->src[i].sel = treg;
2280 ctx->src[i].rel = 0;
2281 }
2282 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2283 int treg = r600_get_temp(ctx);
2284 fetch_tcs_input(ctx, src, treg);
2285 ctx->src[i].sel = treg;
2286 ctx->src[i].rel = 0;
2287 }
2288 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2289 int treg = r600_get_temp(ctx);
2290 fetch_tcs_output(ctx, src, treg);
2291 ctx->src[i].sel = treg;
2292 ctx->src[i].rel = 0;
2293 }
2294 }
2295 return 0;
2296 }
2297
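/* Copy all but one constant-file operand into temps, so that at most one ALU
 * source still reads the constant file directly (the hardware cannot feed
 * every operand from constants at once); relatively addressed constants are
 * always fetched into a temp via tgsi_fetch_rel_const(). */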
2298 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
2299 {
2300 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2301 struct r600_bytecode_alu alu;
2302 int i, j, k, nconst, r;
2303
2304 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
2305 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
2306 nconst++;
2307 }
2308 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
2309 }
2310 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
2311 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
2312 continue;
2313 }
2314
2315 if (ctx->src[i].rel) {
2316 int chan = inst->Src[i].Indirect.Swizzle;
2317 int treg = r600_get_temp(ctx);
2318 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
2319 return r;
2320
2321 ctx->src[i].kc_bank = 0;
2322 ctx->src[i].kc_rel = 0;
2323 ctx->src[i].sel = treg;
2324 ctx->src[i].rel = 0;
2325 j--;
2326 } else if (j > 0) {
2327 int treg = r600_get_temp(ctx);
2328 for (k = 0; k < 4; k++) {
2329 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2330 alu.op = ALU_OP1_MOV;
2331 alu.src[0].sel = ctx->src[i].sel;
2332 alu.src[0].chan = k;
2333 alu.src[0].rel = ctx->src[i].rel;
2334 alu.src[0].kc_bank = ctx->src[i].kc_bank;
2335 alu.src[0].kc_rel = ctx->src[i].kc_rel;
2336 alu.dst.sel = treg;
2337 alu.dst.chan = k;
2338 alu.dst.write = 1;
2339 if (k == 3)
2340 alu.last = 1;
2341 r = r600_bytecode_add_alu(ctx->bc, &alu);
2342 if (r)
2343 return r;
2344 }
2345 ctx->src[i].sel = treg;
2346 			ctx->src[i].rel = 0;
2347 j--;
2348 }
2349 }
2350 return 0;
2351 }
2352
2353 /* Extra immediates need moving into temps - ops such as the trig functions use the literal slots for their own PI constants. */
2354 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2355 {
2356 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2357 struct r600_bytecode_alu alu;
2358 int i, j, k, nliteral, r;
2359
2360 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2361 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2362 nliteral++;
2363 }
2364 }
2365 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2366 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2367 int treg = r600_get_temp(ctx);
2368 for (k = 0; k < 4; k++) {
2369 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2370 alu.op = ALU_OP1_MOV;
2371 alu.src[0].sel = ctx->src[i].sel;
2372 alu.src[0].chan = k;
2373 alu.src[0].value = ctx->src[i].value[k];
2374 alu.dst.sel = treg;
2375 alu.dst.chan = k;
2376 alu.dst.write = 1;
2377 if (k == 3)
2378 alu.last = 1;
2379 r = r600_bytecode_add_alu(ctx->bc, &alu);
2380 if (r)
2381 return r;
2382 }
2383 ctx->src[i].sel = treg;
2384 j--;
2385 }
2386 }
2387 return 0;
2388 }
2389
2390 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2391 {
2392 int i, r, count = ctx->shader->ninput;
2393
2394 for (i = 0; i < count; i++) {
2395 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2396 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2397 if (r)
2398 return r;
2399 }
2400 }
2401 return 0;
2402 }
2403
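/* Emit the MEM_STREAM exports that write stream-output registers to their
 * buffers. Outputs whose buffer offset is below their start component are
 * first MOVed down to channel 0, since exports can mask components but not
 * rotate them. */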
2404 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2405 int stream, unsigned *stream_item_size UNUSED)
2406 {
2407 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2408 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2409 int j, r;
2410 unsigned i;
2411
2412 /* Sanity checking. */
2413 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2414 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2415 r = -EINVAL;
2416 goto out_err;
2417 }
2418 for (i = 0; i < so->num_outputs; i++) {
2419 if (so->output[i].output_buffer >= 4) {
2420 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2421 so->output[i].output_buffer);
2422 r = -EINVAL;
2423 goto out_err;
2424 }
2425 }
2426
2427 /* Initialize locations where the outputs are stored. */
2428 for (i = 0; i < so->num_outputs; i++) {
2429
2430 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2431 start_comp[i] = so->output[i].start_component;
2432 /* Lower outputs with dst_offset < start_component.
2433 *
2434 * We can only output 4D vectors with a write mask, e.g. we can
2435 * only output the W component at offset 3, etc. If we want
2436 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2437 * to move it to X and output X. */
2438 if (so->output[i].dst_offset < so->output[i].start_component) {
2439 unsigned tmp = r600_get_temp(ctx);
2440
2441 for (j = 0; j < so->output[i].num_components; j++) {
2442 struct r600_bytecode_alu alu;
2443 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2444 alu.op = ALU_OP1_MOV;
2445 alu.src[0].sel = so_gpr[i];
2446 alu.src[0].chan = so->output[i].start_component + j;
2447
2448 alu.dst.sel = tmp;
2449 alu.dst.chan = j;
2450 alu.dst.write = 1;
2451 if (j == so->output[i].num_components - 1)
2452 alu.last = 1;
2453 r = r600_bytecode_add_alu(ctx->bc, &alu);
2454 if (r)
2455 return r;
2456 }
2457 start_comp[i] = 0;
2458 so_gpr[i] = tmp;
2459 }
2460 }
2461
2462 /* Write outputs to buffers. */
2463 for (i = 0; i < so->num_outputs; i++) {
2464 struct r600_bytecode_output output;
2465
2466 if (stream != -1 && stream != so->output[i].stream)
2467 continue;
2468
2469 memset(&output, 0, sizeof(struct r600_bytecode_output));
2470 output.gpr = so_gpr[i];
2471 output.elem_size = so->output[i].num_components - 1;
2472 if (output.elem_size == 2)
2473 			output.elem_size = 3; // 3-component size unsupported, write 4 with junk at end
2474 output.array_base = so->output[i].dst_offset - start_comp[i];
2475 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2476 output.burst_count = 1;
2477 /* array_size is an upper limit for the burst_count
2478 * with MEM_STREAM instructions */
2479 output.array_size = 0xFFF;
2480 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2481
2482 if (ctx->bc->chip_class >= EVERGREEN) {
2483 switch (so->output[i].output_buffer) {
2484 case 0:
2485 output.op = CF_OP_MEM_STREAM0_BUF0;
2486 break;
2487 case 1:
2488 output.op = CF_OP_MEM_STREAM0_BUF1;
2489 break;
2490 case 2:
2491 output.op = CF_OP_MEM_STREAM0_BUF2;
2492 break;
2493 case 3:
2494 output.op = CF_OP_MEM_STREAM0_BUF3;
2495 break;
2496 }
2497 output.op += so->output[i].stream * 4;
2498 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2499 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2500 } else {
2501 switch (so->output[i].output_buffer) {
2502 case 0:
2503 output.op = CF_OP_MEM_STREAM0;
2504 break;
2505 case 1:
2506 output.op = CF_OP_MEM_STREAM1;
2507 break;
2508 case 2:
2509 output.op = CF_OP_MEM_STREAM2;
2510 break;
2511 case 3:
2512 output.op = CF_OP_MEM_STREAM3;
2513 break;
2514 }
2515 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2516 }
2517 r = r600_bytecode_add_output(ctx->bc, &output);
2518 if (r)
2519 goto out_err;
2520 }
2521 return 0;
2522 out_err:
2523 return r;
2524 }
2525
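/* The edge flag is produced as a float output; clamp it to [0, 1] and
 * convert it to an integer in place. */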
2526 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2527 {
2528 struct r600_bytecode_alu alu;
2529 unsigned reg;
2530
2531 if (!ctx->shader->vs_out_edgeflag)
2532 return;
2533
2534 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2535
2536 /* clamp(x, 0, 1) */
2537 memset(&alu, 0, sizeof(alu));
2538 alu.op = ALU_OP1_MOV;
2539 alu.src[0].sel = reg;
2540 alu.dst.sel = reg;
2541 alu.dst.write = 1;
2542 alu.dst.clamp = 1;
2543 alu.last = 1;
2544 r600_bytecode_add_alu(ctx->bc, &alu);
2545
2546 memset(&alu, 0, sizeof(alu));
2547 alu.op = ALU_OP1_FLT_TO_INT;
2548 alu.src[0].sel = reg;
2549 alu.dst.sel = reg;
2550 alu.dst.write = 1;
2551 alu.last = 1;
2552 r600_bytecode_add_alu(ctx->bc, &alu);
2553 }
2554
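/* Build the GS copy shader: a small VERTEX-typed program run after the GS
 * that fetches each emitted vertex back from the GSVS ring and exports it
 * the way a regular VS would (position, params, streamout). On entry R0.x
 * holds the ring offset in its low 30 bits and the stream id in bits 30-31,
 * which the first two ALU instructions below unpack. */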
2555 int generate_gs_copy_shader(struct r600_context *rctx,
2556 struct r600_pipe_shader *gs,
2557 struct pipe_stream_output_info *so)
2558 {
2559 struct r600_shader_ctx ctx = {};
2560 struct r600_shader *gs_shader = &gs->shader;
2561 struct r600_pipe_shader *cshader;
2562 unsigned ocnt = gs_shader->noutput;
2563 struct r600_bytecode_alu alu;
2564 struct r600_bytecode_vtx vtx;
2565 struct r600_bytecode_output output;
2566 struct r600_bytecode_cf *cf_jump, *cf_pop,
2567 *last_exp_pos = NULL, *last_exp_param = NULL;
2568 int next_clip_pos = 61, next_param = 0;
2569 unsigned i, j;
2570 int ring;
2571 bool only_ring_0 = true;
2572 cshader = calloc(1, sizeof(struct r600_pipe_shader));
2573 if (!cshader)
2574 		return -ENOMEM;
2575
2576 memcpy(cshader->shader.output, gs_shader->output, ocnt *
2577 sizeof(struct r600_shader_io));
2578
2579 cshader->shader.noutput = ocnt;
2580
2581 ctx.shader = &cshader->shader;
2582 ctx.bc = &ctx.shader->bc;
2583 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2584
2585 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2586 rctx->screen->has_compressed_msaa_texturing);
2587
2588 ctx.bc->isa = rctx->isa;
2589
2590 cf_jump = NULL;
2591 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2592
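	/* Note: the AND below leaves 'last' unset, so it co-issues in the same
	 * ALU group as the shift; sources are read before results are written
	 * within a group, so the shift still sees the stream id bits. */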
2593 /* R0.x = R0.x & 0x3fffffff */
2594 memset(&alu, 0, sizeof(alu));
2595 alu.op = ALU_OP2_AND_INT;
2596 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2597 alu.src[1].value = 0x3fffffff;
2598 alu.dst.write = 1;
2599 r600_bytecode_add_alu(ctx.bc, &alu);
2600
2601 /* R0.y = R0.x >> 30 */
2602 memset(&alu, 0, sizeof(alu));
2603 alu.op = ALU_OP2_LSHR_INT;
2604 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2605 alu.src[1].value = 0x1e;
2606 alu.dst.chan = 1;
2607 alu.dst.write = 1;
2608 alu.last = 1;
2609 r600_bytecode_add_alu(ctx.bc, &alu);
2610
2611 /* fetch vertex data from GSVS ring */
2612 for (i = 0; i < ocnt; ++i) {
2613 struct r600_shader_io *out = &ctx.shader->output[i];
2614
2615 out->gpr = i + 1;
2616 out->ring_offset = i * 16;
2617
2618 memset(&vtx, 0, sizeof(vtx));
2619 vtx.op = FETCH_OP_VFETCH;
2620 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2621 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2622 vtx.mega_fetch_count = 16;
2623 vtx.offset = out->ring_offset;
2624 vtx.dst_gpr = out->gpr;
2625 vtx.src_gpr = 0;
2626 vtx.dst_sel_x = 0;
2627 vtx.dst_sel_y = 1;
2628 vtx.dst_sel_z = 2;
2629 vtx.dst_sel_w = 3;
2630 if (rctx->b.chip_class >= EVERGREEN) {
2631 vtx.use_const_fields = 1;
2632 } else {
2633 vtx.data_format = FMT_32_32_32_32_FLOAT;
2634 }
2635
2636 r600_bytecode_add_vtx(ctx.bc, &vtx);
2637 }
2638 ctx.temp_reg = i + 1;
2639 for (ring = 3; ring >= 0; --ring) {
2640 bool enabled = false;
2641 for (i = 0; i < so->num_outputs; i++) {
2642 if (so->output[i].stream == ring) {
2643 enabled = true;
2644 if (ring > 0)
2645 only_ring_0 = false;
2646 break;
2647 }
2648 }
2649 if (ring != 0 && !enabled) {
2650 cshader->shader.ring_item_sizes[ring] = 0;
2651 continue;
2652 }
2653
2654 if (cf_jump) {
2655 // Patch up jump label
2656 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2657 cf_pop = ctx.bc->cf_last;
2658
2659 cf_jump->cf_addr = cf_pop->id + 2;
2660 cf_jump->pop_count = 1;
2661 cf_pop->cf_addr = cf_pop->id + 2;
2662 cf_pop->pop_count = 1;
2663 }
2664
2665 /* PRED_SETE_INT __, R0.y, ring */
2666 memset(&alu, 0, sizeof(alu));
2667 alu.op = ALU_OP2_PRED_SETE_INT;
2668 alu.src[0].chan = 1;
2669 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2670 alu.src[1].value = ring;
2671 alu.execute_mask = 1;
2672 alu.update_pred = 1;
2673 alu.last = 1;
2674 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2675
2676 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2677 cf_jump = ctx.bc->cf_last;
2678
2679 if (enabled)
2680 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2681 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2682 }
2683
2684 	/* the bytecode builder pads R600 shaders with NOPs - mirror that here */
2685 if (ctx.bc->chip_class == R600) {
2686 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2687 alu.op = ALU_OP0_NOP;
2688 alu.last = 1;
2689 r600_bytecode_add_alu(ctx.bc, &alu);
2690
2691 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2692 }
2693
2694 /* export vertex data */
2695 /* XXX factor out common code with r600_shader_from_tgsi ? */
2696 for (i = 0; i < ocnt; ++i) {
2697 struct r600_shader_io *out = &ctx.shader->output[i];
2698 bool instream0 = true;
2699 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2700 continue;
2701
2702 for (j = 0; j < so->num_outputs; j++) {
2703 if (so->output[j].register_index == i) {
2704 if (so->output[j].stream == 0)
2705 break;
2706 if (so->output[j].stream > 0)
2707 instream0 = false;
2708 }
2709 }
2710 if (!instream0)
2711 continue;
2712 memset(&output, 0, sizeof(output));
2713 output.gpr = out->gpr;
2714 output.elem_size = 3;
2715 output.swizzle_x = 0;
2716 output.swizzle_y = 1;
2717 output.swizzle_z = 2;
2718 output.swizzle_w = 3;
2719 output.burst_count = 1;
2720 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2721 output.op = CF_OP_EXPORT;
2722 switch (out->name) {
2723 case TGSI_SEMANTIC_POSITION:
2724 output.array_base = 60;
2725 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2726 break;
2727
2728 case TGSI_SEMANTIC_PSIZE:
2729 output.array_base = 61;
2730 if (next_clip_pos == 61)
2731 next_clip_pos = 62;
2732 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2733 output.swizzle_y = 7;
2734 output.swizzle_z = 7;
2735 output.swizzle_w = 7;
2736 ctx.shader->vs_out_misc_write = 1;
2737 ctx.shader->vs_out_point_size = 1;
2738 break;
2739 case TGSI_SEMANTIC_LAYER:
2740 if (out->spi_sid) {
2741 /* duplicate it as PARAM to pass to the pixel shader */
2742 output.array_base = next_param++;
2743 r600_bytecode_add_output(ctx.bc, &output);
2744 last_exp_param = ctx.bc->cf_last;
2745 }
2746 output.array_base = 61;
2747 if (next_clip_pos == 61)
2748 next_clip_pos = 62;
2749 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2750 output.swizzle_x = 7;
2751 output.swizzle_y = 7;
2752 output.swizzle_z = 0;
2753 output.swizzle_w = 7;
2754 ctx.shader->vs_out_misc_write = 1;
2755 ctx.shader->vs_out_layer = 1;
2756 break;
2757 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2758 if (out->spi_sid) {
2759 /* duplicate it as PARAM to pass to the pixel shader */
2760 output.array_base = next_param++;
2761 r600_bytecode_add_output(ctx.bc, &output);
2762 last_exp_param = ctx.bc->cf_last;
2763 }
2764 output.array_base = 61;
2765 if (next_clip_pos == 61)
2766 next_clip_pos = 62;
2767 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2768 ctx.shader->vs_out_misc_write = 1;
2769 ctx.shader->vs_out_viewport = 1;
2770 output.swizzle_x = 7;
2771 output.swizzle_y = 7;
2772 output.swizzle_z = 7;
2773 output.swizzle_w = 0;
2774 break;
2775 case TGSI_SEMANTIC_CLIPDIST:
2776 /* spi_sid is 0 for clipdistance outputs that were generated
2777 * for clipvertex - we don't need to pass them to PS */
2778 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2779 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2780 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2781 if (out->spi_sid) {
2782 /* duplicate it as PARAM to pass to the pixel shader */
2783 output.array_base = next_param++;
2784 r600_bytecode_add_output(ctx.bc, &output);
2785 last_exp_param = ctx.bc->cf_last;
2786 }
2787 output.array_base = next_clip_pos++;
2788 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2789 break;
2790 case TGSI_SEMANTIC_FOG:
2791 output.swizzle_y = 4; /* 0 */
2792 output.swizzle_z = 4; /* 0 */
2793 output.swizzle_w = 5; /* 1 */
2794 break;
2795 default:
2796 output.array_base = next_param++;
2797 break;
2798 }
2799 r600_bytecode_add_output(ctx.bc, &output);
2800 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2801 last_exp_param = ctx.bc->cf_last;
2802 else
2803 last_exp_pos = ctx.bc->cf_last;
2804 }
2805
2806 if (!last_exp_pos) {
2807 memset(&output, 0, sizeof(output));
2808 output.gpr = 0;
2809 output.elem_size = 3;
2810 output.swizzle_x = 7;
2811 output.swizzle_y = 7;
2812 output.swizzle_z = 7;
2813 output.swizzle_w = 7;
2814 output.burst_count = 1;
2816 output.op = CF_OP_EXPORT;
2817 output.array_base = 60;
2818 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2819 r600_bytecode_add_output(ctx.bc, &output);
2820 last_exp_pos = ctx.bc->cf_last;
2821 }
2822
2823 if (!last_exp_param) {
2824 memset(&output, 0, sizeof(output));
2825 output.gpr = 0;
2826 output.elem_size = 3;
2827 output.swizzle_x = 7;
2828 output.swizzle_y = 7;
2829 output.swizzle_z = 7;
2830 output.swizzle_w = 7;
2831 output.burst_count = 1;
2833 output.op = CF_OP_EXPORT;
2834 output.array_base = next_param++;
2835 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2836 r600_bytecode_add_output(ctx.bc, &output);
2837 last_exp_param = ctx.bc->cf_last;
2838 }
2839
2840 last_exp_pos->op = CF_OP_EXPORT_DONE;
2841 last_exp_param->op = CF_OP_EXPORT_DONE;
2842
2843 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2844 cf_pop = ctx.bc->cf_last;
2845
2846 cf_jump->cf_addr = cf_pop->id + 2;
2847 cf_jump->pop_count = 1;
2848 cf_pop->cf_addr = cf_pop->id + 2;
2849 cf_pop->pop_count = 1;
2850
2851 if (ctx.bc->chip_class == CAYMAN)
2852 cm_bytecode_add_cf_end(ctx.bc);
2853 else {
2854 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2855 ctx.bc->cf_last->end_of_program = 1;
2856 }
2857
2858 gs->gs_copy_shader = cshader;
2859 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2860
2861 ctx.bc->nstack = 1;
2862
2863 return r600_bytecode_build(ctx.bc);
2864 }
2865
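/* For indexed ring writes: advance the stream's export index register by one
 * vertex (gs_out_ring_offset bytes >> 4). */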
2866 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2867 {
2868 if (ind) {
2869 struct r600_bytecode_alu alu;
2870 int r;
2871
2872 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2873 alu.op = ALU_OP2_ADD_INT;
2874 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2875 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2876 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2877 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2878 alu.dst.write = 1;
2879 alu.last = 1;
2880 r = r600_bytecode_add_alu(ctx->bc, &alu);
2881 if (r)
2882 return r;
2883 }
2884 return 0;
2885 }
2886
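/* Write the current vertex's outputs to the GSVS ring for the given stream.
 * For an ES (ctx->gs_for_vs set) each output is matched to the GS input with
 * the same name/sid to find the ring offset the GS expects. */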
2887 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
2888 {
2889 struct r600_bytecode_output output;
2890 int ring_offset;
2891 unsigned i, k;
2892 int effective_stream = stream == -1 ? 0 : stream;
2893 int idx = 0;
2894
2895 for (i = 0; i < ctx->shader->noutput; i++) {
2896 if (ctx->gs_for_vs) {
2897 			/* for ES we need to look up the ring offset the GS expects
2898 			 * (map this output to the GS input with matching name and sid) */
2899 /* FIXME precompute offsets */
2900 ring_offset = -1;
2901 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2902 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2903 struct r600_shader_io *out = &ctx->shader->output[i];
2904 if (in->name == out->name && in->sid == out->sid)
2905 ring_offset = in->ring_offset;
2906 }
2907
2908 if (ring_offset == -1)
2909 continue;
2910 } else {
2911 ring_offset = idx * 16;
2912 idx++;
2913 }
2914
2915 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2916 continue;
2917 		/* gs_out_ring_offset (set while parsing the input decls) is the total
2918 		 * size of one vertex's data; gs_next_vertex is the current vertex index */
2919 if (!ind)
2920 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2921
2922 memset(&output, 0, sizeof(struct r600_bytecode_output));
2923 output.gpr = ctx->shader->output[i].gpr;
2924 output.elem_size = 3;
2925 output.comp_mask = 0xF;
2926 output.burst_count = 1;
2927
2928 if (ind)
2929 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2930 else
2931 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2932
2933 switch (stream) {
2934 default:
2935 case 0:
2936 output.op = CF_OP_MEM_RING; break;
2937 case 1:
2938 output.op = CF_OP_MEM_RING1; break;
2939 case 2:
2940 output.op = CF_OP_MEM_RING2; break;
2941 case 3:
2942 output.op = CF_OP_MEM_RING3; break;
2943 }
2944
2945 if (ind) {
2946 output.array_base = ring_offset >> 2; /* in dwords */
2947 output.array_size = 0xfff;
2948 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2949 } else
2950 output.array_base = ring_offset >> 2; /* in dwords */
2951 r600_bytecode_add_output(ctx->bc, &output);
2952 }
2953
2954 ++ctx->gs_next_vertex;
2955 return 0;
2956 }
2957
2958
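/* Fetch the tessellation I/O layout words from the LDS-info constant buffer
 * into the reserved GPRs: one vec4 at offset 0 for tess_input_info and one
 * at offset 16 for tess_output_info. */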
2959 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2960 {
2961 int r;
2962 struct r600_bytecode_vtx vtx;
2963 int temp_val = ctx->temp_reg;
2964 	/* zero temp.x - it is the fetch index for the constant buffer reads below */
2965 r = single_alu_op2(ctx, ALU_OP1_MOV,
2966 temp_val, 0,
2967 V_SQ_ALU_SRC_LITERAL, 0,
2968 0, 0);
2969 if (r)
2970 return r;
2971
2972 /* used by VS/TCS */
2973 if (ctx->tess_input_info) {
2974 /* fetch tcs input values into resv space */
2975 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2976 vtx.op = FETCH_OP_VFETCH;
2977 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2978 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2979 vtx.mega_fetch_count = 16;
2980 vtx.data_format = FMT_32_32_32_32;
2981 vtx.num_format_all = 2;
2982 vtx.format_comp_all = 1;
2983 vtx.use_const_fields = 0;
2984 vtx.endian = r600_endian_swap(32);
2985 vtx.srf_mode_all = 1;
2986 vtx.offset = 0;
2987 vtx.dst_gpr = ctx->tess_input_info;
2988 vtx.dst_sel_x = 0;
2989 vtx.dst_sel_y = 1;
2990 vtx.dst_sel_z = 2;
2991 vtx.dst_sel_w = 3;
2992 vtx.src_gpr = temp_val;
2993 vtx.src_sel_x = 0;
2994
2995 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2996 if (r)
2997 return r;
2998 }
2999
3000 /* used by TCS/TES */
3001 if (ctx->tess_output_info) {
3002 /* fetch tcs output values into resv space */
3003 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
3004 vtx.op = FETCH_OP_VFETCH;
3005 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
3006 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
3007 vtx.mega_fetch_count = 16;
3008 vtx.data_format = FMT_32_32_32_32;
3009 vtx.num_format_all = 2;
3010 vtx.format_comp_all = 1;
3011 vtx.use_const_fields = 0;
3012 vtx.endian = r600_endian_swap(32);
3013 vtx.srf_mode_all = 1;
3014 vtx.offset = 16;
3015 vtx.dst_gpr = ctx->tess_output_info;
3016 vtx.dst_sel_x = 0;
3017 vtx.dst_sel_y = 1;
3018 vtx.dst_sel_z = 2;
3019 vtx.dst_sel_w = 3;
3020 vtx.src_gpr = temp_val;
3021 vtx.src_sel_x = 0;
3022
3023 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3024 if (r)
3025 return r;
3026 }
3027 return 0;
3028 }
3029
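/* LS: write this vertex's outputs into its LDS slot for the TCS to read,
 * two dwords at a time via LDS_WRITE_REL. */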
3030 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
3031 {
3032 int j, r;
3033 int temp_reg;
3034 unsigned i;
3035
3036 /* fetch tcs input values into input_vals */
3037 ctx->tess_input_info = r600_get_temp(ctx);
3038 ctx->tess_output_info = 0;
3039 r = r600_fetch_tess_io_info(ctx);
3040 if (r)
3041 return r;
3042
3043 temp_reg = r600_get_temp(ctx);
3044 	/* temp_reg.x = vertexID * vertex_dw_stride - this vertex's base LDS offset */
3046 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
3047 temp_reg, 0,
3048 ctx->tess_input_info, 1,
3049 0, 1); /* rel id in r0.y? */
3050 if (r)
3051 return r;
3052
3053 for (i = 0; i < ctx->shader->noutput; i++) {
3054 struct r600_bytecode_alu alu;
3055 int param = r600_get_lds_unique_index(ctx->shader->output[i].name,
3056 ctx->shader->output[i].sid);
3057
3058 if (param) {
3059 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3060 temp_reg, 1,
3061 temp_reg, 0,
3062 V_SQ_ALU_SRC_LITERAL, param * 16);
3063 if (r)
3064 return r;
3065 }
3066
3067 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3068 temp_reg, 2,
3069 temp_reg, param ? 1 : 0,
3070 V_SQ_ALU_SRC_LITERAL, 8);
3071 if (r)
3072 return r;
3073
3074
3075 for (j = 0; j < 2; j++) {
3076 int chan = (j == 1) ? 2 : (param ? 1 : 0);
3077 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3078 alu.op = LDS_OP3_LDS_WRITE_REL;
3079 alu.src[0].sel = temp_reg;
3080 alu.src[0].chan = chan;
3081 alu.src[1].sel = ctx->shader->output[i].gpr;
3082 alu.src[1].chan = j * 2;
3083 alu.src[2].sel = ctx->shader->output[i].gpr;
3084 alu.src[2].chan = (j * 2) + 1;
3085 alu.last = 1;
3086 alu.dst.chan = 0;
3087 alu.lds_idx = 1;
3088 alu.is_lds_idx_op = true;
3089 r = r600_bytecode_add_alu(ctx->bc, &alu);
3090 if (r)
3091 return r;
3092 }
3093 }
3094 return 0;
3095 }
3096
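/* TCS: store an instruction's output operand to LDS. Adjacent channel pairs
 * allowed by the write mask go out as a single LDS_WRITE_REL. */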
3097 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
3098 {
3099 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3100 const struct tgsi_full_dst_register *dst = &inst->Dst[0];
3101 int i, r, lasti;
3102 int temp_reg = r600_get_temp(ctx);
3103 struct r600_bytecode_alu alu;
3104 unsigned write_mask = dst->Register.WriteMask;
3105
3106 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
3107 return 0;
3108
3109 	r = get_lds_offset0(ctx, 1, temp_reg, !dst->Register.Dimension);
3110 if (r)
3111 return r;
3112
3113 /* the base address is now in temp.x */
3114 r = r600_get_byte_address(ctx, temp_reg,
3115 &inst->Dst[0], NULL, ctx->tess_output_info, 1);
3116 if (r)
3117 return r;
3118
3119 /* LDS write */
3120 lasti = tgsi_last_instruction(write_mask);
3121 for (i = 1; i <= lasti; i++) {
3122
3123 if (!(write_mask & (1 << i)))
3124 continue;
3125 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3126 temp_reg, i,
3127 temp_reg, 0,
3128 V_SQ_ALU_SRC_LITERAL, 4 * i);
3129 if (r)
3130 return r;
3131 }
3132
3133 for (i = 0; i <= lasti; i++) {
3134 if (!(write_mask & (1 << i)))
3135 continue;
3136
3137 if ((i == 0 && ((write_mask & 3) == 3)) ||
3138 (i == 2 && ((write_mask & 0xc) == 0xc))) {
3139 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3140 alu.op = LDS_OP3_LDS_WRITE_REL;
3141 alu.src[0].sel = temp_reg;
3142 alu.src[0].chan = i;
3143
3144 alu.src[1].sel = dst->Register.Index;
3145 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3146 alu.src[1].chan = i;
3147
3148 alu.src[2].sel = dst->Register.Index;
3149 alu.src[2].sel += ctx->file_offset[dst->Register.File];
3150 alu.src[2].chan = i + 1;
3151 alu.lds_idx = 1;
3152 alu.dst.chan = 0;
3153 alu.last = 1;
3154 alu.is_lds_idx_op = true;
3155 r = r600_bytecode_add_alu(ctx->bc, &alu);
3156 if (r)
3157 return r;
3158 i += 1;
3159 continue;
3160 }
3161 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3162 alu.op = LDS_OP2_LDS_WRITE;
3163 alu.src[0].sel = temp_reg;
3164 alu.src[0].chan = i;
3165
3166 alu.src[1].sel = dst->Register.Index;
3167 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3168 alu.src[1].chan = i;
3169
3170 alu.src[2].sel = V_SQ_ALU_SRC_0;
3171 alu.dst.chan = 0;
3172 alu.last = 1;
3173 alu.is_lds_idx_op = true;
3174 r = r600_bytecode_add_alu(ctx->bc, &alu);
3175 if (r)
3176 return r;
3177 }
3178 return 0;
3179 }
3180
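/* Read nc tess factor components (TESSINNER/TESSOUTER) for this patch back
 * from LDS into the output's GPR. */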
3181 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3182 int output_idx, int nc)
3183 {
3184 int param;
3185 unsigned temp_reg = r600_get_temp(ctx);
3186 unsigned name = ctx->shader->output[output_idx].name;
3187 int dreg = ctx->shader->output[output_idx].gpr;
3188 int r;
3189
3190 param = r600_get_lds_unique_index(name, 0);
3191 r = get_lds_offset0(ctx, 1, temp_reg, true);
3192 if (r)
3193 return r;
3194
3195 if (param) {
3196 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3197 temp_reg, 0,
3198 temp_reg, 0,
3199 V_SQ_ALU_SRC_LITERAL, param * 16);
3200 if (r)
3201 return r;
3202 }
3203
3204 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3205 return 0;
3206 }
3207
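/* At the end of the TCS, invocation 0 alone reads the tess factors back from
 * LDS and writes them out to the tess factor buffer; the layout depends on
 * the patch primitive mode. */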
3208 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
3209 {
3210 int stride, outer_comps, inner_comps;
3211 int tessinner_idx = -1, tessouter_idx = -1;
3212 int i, r;
3213 unsigned j;
3214 int temp_reg = r600_get_temp(ctx);
3215 int treg[3] = {-1, -1, -1};
3216 struct r600_bytecode_alu alu;
3217 struct r600_bytecode_cf *cf_jump, *cf_pop;
3218
3219 /* only execute factor emission for invocation 0 */
3220 	/* PRED_SETE_INT __, R0.z, 0   (R0.z = InvocationID) */
3221 memset(&alu, 0, sizeof(alu));
3222 alu.op = ALU_OP2_PRED_SETE_INT;
3223 alu.src[0].chan = 2;
3224 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3225 alu.execute_mask = 1;
3226 alu.update_pred = 1;
3227 alu.last = 1;
3228 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
3229
3230 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
3231 cf_jump = ctx->bc->cf_last;
3232
3233 treg[0] = r600_get_temp(ctx);
3234 switch (ctx->shader->tcs_prim_mode) {
3235 case PIPE_PRIM_LINES:
3236 stride = 8; /* 2 dwords, 1 vec2 store */
3237 outer_comps = 2;
3238 inner_comps = 0;
3239 break;
3240 case PIPE_PRIM_TRIANGLES:
3241 stride = 16; /* 4 dwords, 1 vec4 store */
3242 outer_comps = 3;
3243 inner_comps = 1;
3244 treg[1] = r600_get_temp(ctx);
3245 break;
3246 case PIPE_PRIM_QUADS:
3247 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
3248 outer_comps = 4;
3249 inner_comps = 2;
3250 treg[1] = r600_get_temp(ctx);
3251 treg[2] = r600_get_temp(ctx);
3252 break;
3253 default:
3254 assert(0);
3255 return -1;
3256 }
3257
3258 	/* R0: .x = PatchID, .y = RelPatchID, .z = InvocationID, .w = tf_base */
3259 /* TF_WRITE takes index in R.x, value in R.y */
3260 for (j = 0; j < ctx->shader->noutput; j++) {
3261 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
3262 tessinner_idx = j;
3263 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
3264 tessouter_idx = j;
3265 }
3266
3267 if (tessouter_idx == -1)
3268 return -1;
3269
3270 if (tessinner_idx == -1 && inner_comps)
3271 return -1;
3272
3273 if (tessouter_idx != -1) {
3274 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps