r600: Lower int64 ops from TGSI-to-NIR shaders too
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600_dump.h"
28 #include "r600d.h"
29 #include "sfn/sfn_nir.h"
30
31 #include "sb/sb_public.h"
32
33 #include "pipe/p_shader_tokens.h"
34 #include "tgsi/tgsi_info.h"
35 #include "tgsi/tgsi_parse.h"
36 #include "tgsi/tgsi_scan.h"
37 #include "tgsi/tgsi_dump.h"
38 #include "tgsi/tgsi_from_mesa.h"
39 #include "nir/tgsi_to_nir.h"
40 #include "nir/nir_to_tgsi_info.h"
41 #include "compiler/nir/nir.h"
42 #include "util/u_bitcast.h"
43 #include "util/u_memory.h"
44 #include "util/u_math.h"
45 #include <stdio.h>
46 #include <errno.h>
47
48 /* CAYMAN notes
49 Why CAYMAN got loops for lots of instructions is explained here.
50
51 -These 8xx t-slot only ops are implemented in all vector slots.
52 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
53 These 8xx t-slot only opcodes become vector ops, with all four
54 slots expecting the arguments on sources a and b. Result is
55 broadcast to all channels.
56 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
57 These 8xx t-slot only opcodes become vector ops in the z, y, and
58 x slots.
59 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
60 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
61 SQRT_IEEE/_64
62 SIN/COS
63 The w slot may have an independent co-issued operation, or if the
64 result is required to be in the w slot, the opcode above may be
65 issued in the w slot as well.
66 The compiler must issue the source argument to slots z, y, and x
67 */
68
69 /* Contents of r0 on entry to various shaders
70
71 VS - .x = VertexID
72 .y = RelVertexID (??)
73 .w = InstanceID
74
75 GS - r0.xyw, r1.xyz = per-vertex offsets
76 r0.z = PrimitiveID
77
78 TCS - .x = PatchID
79 .y = RelPatchID (??)
80 .z = InvocationID
81 .w = tess factor base.
82
83 TES - .x = TessCoord.x
84 - .y = TessCoord.y
85 - .z = RelPatchID (??)
86 - .w = PrimitiveID
87
88 PS - face_gpr.z = SampleMask
89 face_gpr.w = SampleID
90 */
/* Selector value: 512 plus R600_BUFFER_INFO_OFFSET divided by 16. */
#define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)

/* Forward declaration: r600_pipe_shader_create() calls this before any
 * definition is visible. */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 union r600_shader_key key);
95
96 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
97 int size, unsigned comp_mask) {
98
99 if (!size)
100 return;
101
102 if (ps->num_arrays == ps->max_arrays) {
103 ps->max_arrays += 64;
104 ps->arrays = realloc(ps->arrays, ps->max_arrays *
105 sizeof(struct r600_shader_array));
106 }
107
108 int n = ps->num_arrays;
109 ++ps->num_arrays;
110
111 ps->arrays[n].comp_mask = comp_mask;
112 ps->arrays[n].gpr_start = start_gpr;
113 ps->arrays[n].gpr_count = size;
114 }
115
116 static void r600_dump_streamout(struct pipe_stream_output_info *so)
117 {
118 unsigned i;
119
120 fprintf(stderr, "STREAMOUT\n");
121 for (i = 0; i < so->num_outputs; i++) {
122 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
123 so->output[i].start_component;
124 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
125 i,
126 so->output[i].stream,
127 so->output[i].output_buffer,
128 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
129 so->output[i].register_index,
130 mask & 1 ? "x" : "",
131 mask & 2 ? "y" : "",
132 mask & 4 ? "z" : "",
133 mask & 8 ? "w" : "",
134 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
135 }
136 }
137
138 static int store_shader(struct pipe_context *ctx,
139 struct r600_pipe_shader *shader)
140 {
141 struct r600_context *rctx = (struct r600_context *)ctx;
142 uint32_t *ptr, i;
143
144 if (shader->bo == NULL) {
145 shader->bo = (struct r600_resource*)
146 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
147 if (shader->bo == NULL) {
148 return -ENOMEM;
149 }
150 ptr = r600_buffer_map_sync_with_rings(
151 &rctx->b, shader->bo,
152 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
153 if (R600_BIG_ENDIAN) {
154 for (i = 0; i < shader->shader.bc.ndw; ++i) {
155 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
156 }
157 } else {
158 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
159 }
160 rctx->b.ws->buffer_unmap(shader->bo->buf);
161 }
162
163 return 0;
164 }
165
166 extern const struct nir_shader_compiler_options r600_nir_options;
167 static int nshader = 0;
168 int r600_pipe_shader_create(struct pipe_context *ctx,
169 struct r600_pipe_shader *shader,
170 union r600_shader_key key)
171 {
172 struct r600_context *rctx = (struct r600_context *)ctx;
173 struct r600_pipe_shader_selector *sel = shader->selector;
174 int r;
175 struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;
176
177 int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
178 tgsi_get_processor_type(sel->tokens):
179 pipe_shader_type_from_mesa(sel->nir->info.stage);
180
181 bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
182 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB) &&
183 !(rscreen->b.debug_flags & DBG_NIR);
184 unsigned sb_disasm;
185 unsigned export_shader;
186
187 shader->shader.bc.isa = rctx->isa;
188
189 if (!(rscreen->b.debug_flags & DBG_NIR)) {
190 assert(sel->ir_type == PIPE_SHADER_IR_TGSI);
191 r = r600_shader_from_tgsi(rctx, shader, key);
192 if (r) {
193 R600_ERR("translation from TGSI failed !\n");
194 goto error;
195 }
196 } else {
197 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
198 sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
199 /* Lower int64 ops because we have some r600 build-in shaders that use it */
200 if (!ctx->screen->get_param(ctx->screen, PIPE_CAP_DOUBLES)) {
201 NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
202 NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);
203 NIR_PASS_V(sel->nir, nir_lower_int64, ~0);
204 NIR_PASS_V(sel->nir, nir_opt_vectorize);
205 }
206 }
207 nir_tgsi_scan_shader(sel->nir, &sel->info, true);
208
209 r = r600_shader_from_nir(rctx, shader, &key);
210 if (r) {
211 fprintf(stderr, "--Failed shader--------------------------------------------------\n");
212
213 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
214 fprintf(stderr, "--TGSI--------------------------------------------------------\n");
215 tgsi_dump(sel->tokens, 0);
216 }
217
218 if (rscreen->b.debug_flags & DBG_NIR) {
219 fprintf(stderr, "--NIR --------------------------------------------------------\n");
220 nir_print_shader(sel->nir, stderr);
221 }
222
223 R600_ERR("translation from NIR failed !\n");
224 goto error;
225 }
226 }
227
228 if (dump) {
229 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
230 fprintf(stderr, "--TGSI--------------------------------------------------------\n");
231 tgsi_dump(sel->tokens, 0);
232 }
233
234 if (sel->so.num_outputs) {
235 r600_dump_streamout(&sel->so);
236 }
237 }
238
239 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
240 /* only disable for vertex shaders in tess paths */
241 if (key.vs.as_ls)
242 use_sb = 0;
243 }
244 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
245 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
246 use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);
247
248 /* disable SB for shaders using doubles */
249 use_sb &= !shader->shader.uses_doubles;
250
251 use_sb &= !shader->shader.uses_atomics;
252 use_sb &= !shader->shader.uses_images;
253 use_sb &= !shader->shader.uses_helper_invocation;
254
255 /* Check if the bytecode has already been built. */
256 if (!shader->shader.bc.bytecode) {
257 r = r600_bytecode_build(&shader->shader.bc);
258 if (r) {
259 R600_ERR("building bytecode failed !\n");
260 goto error;
261 }
262 }
263
264 sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
265 if (dump && !sb_disasm) {
266 fprintf(stderr, "--------------------------------------------------------------\n");
267 r600_bytecode_disasm(&shader->shader.bc);
268 fprintf(stderr, "______________________________________________________________\n");
269 } else if ((dump && sb_disasm) || use_sb) {
270 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
271 dump, use_sb);
272 if (r) {
273 R600_ERR("r600_sb_bytecode_process failed !\n");
274 goto error;
275 }
276 }
277
278 if (dump) {
279 FILE *f;
280 char fname[1024];
281 snprintf(fname, 1024, "shader_from_%s_%d.cpp",
282 (sel->ir_type == PIPE_SHADER_IR_TGSI ?
283 (rscreen->b.debug_flags & DBG_NIR ? "tgsi-nir" : "tgsi")
284 : "nir"), nshader);
285 f = fopen(fname, "w");
286 print_shader_info(f, nshader++, &shader->shader);
287 print_shader_info(stderr, nshader++, &shader->shader);
288 print_pipe_info(stderr, &sel->info);
289 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
290 fprintf(f, "/****TGSI**********************************\n");
291 tgsi_dump_to_file(sel->tokens, 0, f);
292 }
293
294 if (rscreen->b.debug_flags & DBG_NIR){
295 fprintf(f, "/****NIR **********************************\n");
296 nir_print_shader(sel->nir, f);
297 }
298 fprintf(f, "******************************************/\n");
299 fclose(f);
300 }
301
302 if (shader->gs_copy_shader) {
303 if (dump) {
304 // dump copy shader
305 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
306 &shader->gs_copy_shader->shader, dump, 0);
307 if (r)
308 goto error;
309 }
310
311 if ((r = store_shader(ctx, shader->gs_copy_shader)))
312 goto error;
313 }
314
315 /* Store the shader in a buffer. */
316 if ((r = store_shader(ctx, shader)))
317 goto error;
318
319 /* Build state. */
320 switch (shader->shader.processor_type) {
321 case PIPE_SHADER_TESS_CTRL:
322 evergreen_update_hs_state(ctx, shader);
323 break;
324 case PIPE_SHADER_TESS_EVAL:
325 if (key.tes.as_es)
326 evergreen_update_es_state(ctx, shader);
327 else
328 evergreen_update_vs_state(ctx, shader);
329 break;
330 case PIPE_SHADER_GEOMETRY:
331 if (rctx->b.chip_class >= EVERGREEN) {
332 evergreen_update_gs_state(ctx, shader);
333 evergreen_update_vs_state(ctx, shader->gs_copy_shader);
334 } else {
335 r600_update_gs_state(ctx, shader);
336 r600_update_vs_state(ctx, shader->gs_copy_shader);
337 }
338 break;
339 case PIPE_SHADER_VERTEX:
340 export_shader = key.vs.as_es;
341 if (rctx->b.chip_class >= EVERGREEN) {
342 if (key.vs.as_ls)
343 evergreen_update_ls_state(ctx, shader);
344 else if (key.vs.as_es)
345 evergreen_update_es_state(ctx, shader);
346 else
347 evergreen_update_vs_state(ctx, shader);
348 } else {
349 if (export_shader)
350 r600_update_es_state(ctx, shader);
351 else
352 r600_update_vs_state(ctx, shader);
353 }
354 break;
355 case PIPE_SHADER_FRAGMENT:
356 if (rctx->b.chip_class >= EVERGREEN) {
357 evergreen_update_ps_state(ctx, shader);
358 } else {
359 r600_update_ps_state(ctx, shader);
360 }
361 break;
362 case PIPE_SHADER_COMPUTE:
363 evergreen_update_ls_state(ctx, shader);
364 break;
365 default:
366 r = -EINVAL;
367 goto error;
368 }
369 return 0;
370
371 error:
372 r600_pipe_shader_destroy(ctx, shader);
373 return r;
374 }
375
376 void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
377 {
378 r600_resource_reference(&shader->bo, NULL);
379 if (shader->shader.bc.cf.next)
380 r600_bytecode_clear(&shader->shader.bc);
381 r600_release_command_buffer(&shader->command_buffer);
382 }
383
384 /*
385 * tgsi -> r600 shader
386 */
387 struct r600_shader_tgsi_instruction;
388
389 struct r600_shader_src {
390 unsigned sel;
391 unsigned swizzle[4];
392 unsigned neg;
393 unsigned abs;
394 unsigned rel;
395 unsigned kc_bank;
396 boolean kc_rel; /* true if cache bank is indexed */
397 uint32_t value[4];
398 };
399
400 struct eg_interp {
401 boolean enabled;
402 unsigned ij_index;
403 };
404
405 struct r600_shader_ctx {
406 struct tgsi_shader_info info;
407 struct tgsi_array_info *array_infos;
408 /* flag for each tgsi temp array if its been spilled or not */
409 bool *spilled_arrays;
410 struct tgsi_parse_context parse;
411 const struct tgsi_token *tokens;
412 unsigned type;
413 unsigned file_offset[TGSI_FILE_COUNT];
414 unsigned temp_reg;
415 const struct r600_shader_tgsi_instruction *inst_info;
416 struct r600_bytecode *bc;
417 struct r600_shader *shader;
418 struct r600_shader_src src[4];
419 uint32_t *literals;
420 uint32_t nliterals;
421 uint32_t max_driver_temp_used;
422 /* needed for evergreen interpolation */
423 struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
424 /* evergreen/cayman also store sample mask in face register */
425 int face_gpr;
426 /* sample id is .w component stored in fixed point position register */
427 int fixed_pt_position_gpr;
428 int colors_used;
429 boolean clip_vertex_write;
430 unsigned cv_output;
431 unsigned edgeflag_output;
432 int helper_invoc_reg;
433 int cs_block_size_reg;
434 int cs_grid_size_reg;
435 bool cs_block_size_loaded, cs_grid_size_loaded;
436 int fragcoord_input;
437 int next_ring_offset;
438 int gs_out_ring_offset;
439 int gs_next_vertex;
440 struct r600_shader *gs_for_vs;
441 int gs_export_gpr_tregs[4];
442 int gs_rotated_input[2];
443 const struct pipe_stream_output_info *gs_stream_output_info;
444 unsigned enabled_stream_buffers_mask;
445 unsigned tess_input_info; /* temp with tess input offsets */
446 unsigned tess_output_info; /* temp with tess input offsets */
447 unsigned thread_id_gpr; /* temp with thread id calculated for images */
448 };
449
/* Instruction table entry: an opcode value plus the handler that emits
 * code for it.  Handlers read the entry back through ctx->inst_info. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int		(*process)(struct r600_shader_ctx *ctx);
};
454
455 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
456 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
457 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
458 static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
459 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
460 static int tgsi_else(struct r600_shader_ctx *ctx);
461 static int tgsi_endif(struct r600_shader_ctx *ctx);
462 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
463 static int tgsi_endloop(struct r600_shader_ctx *ctx);
464 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
465 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
466 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
467 unsigned int dst_reg);
468 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
469 const struct r600_shader_src *shader_src,
470 unsigned chan);
471 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
472 unsigned dst_reg, unsigned mask);
473
474 static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
475 {
476 if (ctx->bc->family == CHIP_HEMLOCK ||
477 ctx->bc->family == CHIP_CYPRESS ||
478 ctx->bc->family == CHIP_JUNIPER)
479 return false;
480 return true;
481 }
482
/*
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of writemask.  Yields 0 when none of those bits is set.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Scan from the top; channel 0 is also the fallback result. */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
494
495 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
496 {
497 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
498 unsigned j;
499
500 if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
501 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
502 return -EINVAL;
503 }
504 #if 0
505 if (i->Instruction.Label) {
506 R600_ERR("label unsupported\n");
507 return -EINVAL;
508 }
509 #endif
510 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
511 if (i->Src[j].Register.Dimension) {
512 switch (i->Src[j].Register.File) {
513 case TGSI_FILE_CONSTANT:
514 case TGSI_FILE_HW_ATOMIC:
515 break;
516 case TGSI_FILE_INPUT:
517 if (ctx->type == PIPE_SHADER_GEOMETRY ||
518 ctx->type == PIPE_SHADER_TESS_CTRL ||
519 ctx->type == PIPE_SHADER_TESS_EVAL)
520 break;
521 /* fallthrough */
522 case TGSI_FILE_OUTPUT:
523 if (ctx->type == PIPE_SHADER_TESS_CTRL)
524 break;
525 /* fallthrough */
526 default:
527 R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
528 i->Src[j].Register.File,
529 i->Src[j].Register.Dimension);
530 return -EINVAL;
531 }
532 }
533 }
534 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
535 if (i->Dst[j].Register.Dimension) {
536 if (ctx->type == PIPE_SHADER_TESS_CTRL)
537 continue;
538 R600_ERR("unsupported dst (dimension)\n");
539 return -EINVAL;
540 }
541 }
542 return 0;
543 }
544
545 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
546 {
547 if (interpolate == TGSI_INTERPOLATE_COLOR ||
548 interpolate == TGSI_INTERPOLATE_LINEAR ||
549 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
550 {
551 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
552 int loc;
553
554 switch(location) {
555 case TGSI_INTERPOLATE_LOC_CENTER:
556 loc = 1;
557 break;
558 case TGSI_INTERPOLATE_LOC_CENTROID:
559 loc = 2;
560 break;
561 case TGSI_INTERPOLATE_LOC_SAMPLE:
562 default:
563 loc = 0; break;
564 }
565
566 return is_linear * 3 + loc;
567 }
568
569 return -1;
570 }
571
572 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
573 int input)
574 {
575 int i = eg_get_interpolator_index(
576 ctx->shader->input[input].interpolate,
577 ctx->shader->input[input].interpolate_location);
578 assert(i >= 0);
579 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
580 }
581
582 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
583 {
584 int i, r;
585 struct r600_bytecode_alu alu;
586 int gpr = 0, base_chan = 0;
587 int ij_index = ctx->shader->input[input].ij_index;
588
589 /* work out gpr and base_chan from index */
590 gpr = ij_index / 2;
591 base_chan = (2 * (ij_index % 2)) + 1;
592
593 for (i = 0; i < 8; i++) {
594 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
595
596 if (i < 4)
597 alu.op = ALU_OP2_INTERP_ZW;
598 else
599 alu.op = ALU_OP2_INTERP_XY;
600
601 if ((i > 1) && (i < 6)) {
602 alu.dst.sel = ctx->shader->input[input].gpr;
603 alu.dst.write = 1;
604 }
605
606 alu.dst.chan = i % 4;
607
608 alu.src[0].sel = gpr;
609 alu.src[0].chan = (base_chan - (i % 2));
610
611 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
612
613 alu.bank_swizzle_force = SQ_ALU_VEC_210;
614 if ((i % 4) == 3)
615 alu.last = 1;
616 r = r600_bytecode_add_alu(ctx->bc, &alu);
617 if (r)
618 return r;
619 }
620 return 0;
621 }
622
623 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
624 {
625 int i, r;
626 struct r600_bytecode_alu alu;
627
628 for (i = 0; i < 4; i++) {
629 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
630
631 alu.op = ALU_OP1_INTERP_LOAD_P0;
632
633 alu.dst.sel = ctx->shader->input[input].gpr;
634 alu.dst.write = 1;
635
636 alu.dst.chan = i;
637
638 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
639 alu.src[0].chan = i;
640
641 if (i == 3)
642 alu.last = 1;
643 r = r600_bytecode_add_alu(ctx->bc, &alu);
644 if (r)
645 return r;
646 }
647 return 0;
648 }
649
650 /*
651 * Special export handling in shaders
652 *
653 * shader export ARRAY_BASE for EXPORT_POS:
654 * 60 is position
655 * 61 is misc vector
656 * 62, 63 are clip distance vectors
657 *
658 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
659 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
660 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
661 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
662 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
663 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
664 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
665 * exclusive from render target index)
666 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
667 *
668 *
669 * shader export ARRAY_BASE for EXPORT_PIXEL:
670 * 0-7 CB targets
671 * 61 computed Z vector
672 *
673 * The use of the values exported in the computed Z vector are controlled
674 * by DB_SHADER_CONTROL:
675 * Z_EXPORT_ENABLE - Z as a float in RED
676 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
677 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
678 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
679 * DB_SOURCE_FORMAT - export control restrictions
680 *
681 */
682
683
684 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
685 static int r600_spi_sid(struct r600_shader_io * io)
686 {
687 int index, name = io->name;
688
689 /* These params are handled differently, they don't need
690 * semantic indices, so we'll use 0 for them.
691 */
692 if (name == TGSI_SEMANTIC_POSITION ||
693 name == TGSI_SEMANTIC_PSIZE ||
694 name == TGSI_SEMANTIC_EDGEFLAG ||
695 name == TGSI_SEMANTIC_FACE ||
696 name == TGSI_SEMANTIC_SAMPLEMASK)
697 index = 0;
698 else {
699 if (name == TGSI_SEMANTIC_GENERIC) {
700 /* For generic params simply use sid from tgsi */
701 index = 9 + io->sid;
702 } else if (name == TGSI_SEMANTIC_TEXCOORD) {
703 index = io->sid;
704 } else {
705 /* For non-generic params - pack name and sid into 8 bits */
706 index = 0x80 | (name<<3) | (io->sid);
707 }
708
709 /* Make sure that all really used indices have nonzero value, so
710 * we can just compare it to 0 later instead of comparing the name
711 * with different values to detect special cases. */
712 index++;
713 }
714
715 return index;
716 };
717
718 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
719 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
720 {
721 switch (semantic_name) {
722 case TGSI_SEMANTIC_POSITION:
723 return 0;
724 case TGSI_SEMANTIC_PSIZE:
725 return 1;
726 case TGSI_SEMANTIC_CLIPDIST:
727 assert(index <= 1);
728 return 2 + index;
729 case TGSI_SEMANTIC_TEXCOORD:
730 return 4 + index;
731 case TGSI_SEMANTIC_GENERIC:
732 if (index <= 63-4)
733 return 4 + index;
734 else
735 /* same explanation as in the default statement,
736 * the only user hitting this is st/nine.
737 */
738 return 0;
739
740 /* patch indices are completely separate and thus start from 0 */
741 case TGSI_SEMANTIC_TESSOUTER:
742 return 0;
743 case TGSI_SEMANTIC_TESSINNER:
744 return 1;
745 case TGSI_SEMANTIC_PATCH:
746 return 2 + index;
747
748 default:
749 /* Don't fail here. The result of this function is only used
750 * for LS, TCS, TES, and GS, where legacy GL semantics can't
751 * occur, but this function is called for all vertex shaders
752 * before it's known whether LS will be compiled or not.
753 */
754 return 0;
755 }
756 }
757
758 /* turn input into interpolate on EG */
759 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
760 {
761 int r = 0;
762
763 if (ctx->shader->input[index].spi_sid) {
764 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
765 if (ctx->shader->input[index].interpolate > 0) {
766 evergreen_interp_assign_ij_index(ctx, index);
767 r = evergreen_interp_alu(ctx, index);
768 } else {
769 r = evergreen_interp_flat(ctx, index);
770 }
771 }
772 return r;
773 }
774
775 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
776 {
777 struct r600_bytecode_alu alu;
778 int i, r;
779 int gpr_front = ctx->shader->input[front].gpr;
780 int gpr_back = ctx->shader->input[back].gpr;
781
782 for (i = 0; i < 4; i++) {
783 memset(&alu, 0, sizeof(alu));
784 alu.op = ALU_OP3_CNDGT;
785 alu.is_op3 = 1;
786 alu.dst.write = 1;
787 alu.dst.sel = gpr_front;
788 alu.src[0].sel = ctx->face_gpr;
789 alu.src[1].sel = gpr_front;
790 alu.src[2].sel = gpr_back;
791
792 alu.dst.chan = i;
793 alu.src[1].chan = i;
794 alu.src[2].chan = i;
795 alu.last = (i==3);
796
797 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
798 return r;
799 }
800
801 return 0;
802 }
803
804 /* execute a single slot ALU calculation */
805 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
806 int dst_sel, int dst_chan,
807 int src0_sel, unsigned src0_chan_val,
808 int src1_sel, unsigned src1_chan_val)
809 {
810 struct r600_bytecode_alu alu;
811 int r, i;
812
813 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
814 for (i = 0; i < 4; i++) {
815 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
816 alu.op = op;
817 alu.src[0].sel = src0_sel;
818 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
819 alu.src[0].value = src0_chan_val;
820 else
821 alu.src[0].chan = src0_chan_val;
822 alu.src[1].sel = src1_sel;
823 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
824 alu.src[1].value = src1_chan_val;
825 else
826 alu.src[1].chan = src1_chan_val;
827 alu.dst.sel = dst_sel;
828 alu.dst.chan = i;
829 alu.dst.write = i == dst_chan;
830 alu.last = (i == 3);
831 r = r600_bytecode_add_alu(ctx->bc, &alu);
832 if (r)
833 return r;
834 }
835 return 0;
836 }
837
838 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
839 alu.op = op;
840 alu.src[0].sel = src0_sel;
841 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
842 alu.src[0].value = src0_chan_val;
843 else
844 alu.src[0].chan = src0_chan_val;
845 alu.src[1].sel = src1_sel;
846 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
847 alu.src[1].value = src1_chan_val;
848 else
849 alu.src[1].chan = src1_chan_val;
850 alu.dst.sel = dst_sel;
851 alu.dst.chan = dst_chan;
852 alu.dst.write = 1;
853 alu.last = 1;
854 r = r600_bytecode_add_alu(ctx->bc, &alu);
855 if (r)
856 return r;
857 return 0;
858 }
859
860 /* execute a single slot ALU calculation */
861 static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
862 int dst_sel, int dst_chan,
863 int src0_sel, unsigned src0_chan_val,
864 int src1_sel, unsigned src1_chan_val,
865 int src2_sel, unsigned src2_chan_val)
866 {
867 struct r600_bytecode_alu alu;
868 int r;
869
870 /* validate this for other ops */
871 assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
872 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
873 alu.op = op;
874 alu.src[0].sel = src0_sel;
875 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
876 alu.src[0].value = src0_chan_val;
877 else
878 alu.src[0].chan = src0_chan_val;
879 alu.src[1].sel = src1_sel;
880 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
881 alu.src[1].value = src1_chan_val;
882 else
883 alu.src[1].chan = src1_chan_val;
884 alu.src[2].sel = src2_sel;
885 if (src2_sel == V_SQ_ALU_SRC_LITERAL)
886 alu.src[2].value = src2_chan_val;
887 else
888 alu.src[2].chan = src2_chan_val;
889 alu.dst.sel = dst_sel;
890 alu.dst.chan = dst_chan;
891 alu.is_op3 = 1;
892 alu.last = 1;
893 r = r600_bytecode_add_alu(ctx->bc, &alu);
894 if (r)
895 return r;
896 return 0;
897 }
898
899 /* put it in temp_reg.x */
900 static int get_lds_offset0(struct r600_shader_ctx *ctx,
901 int rel_patch_chan,
902 int temp_reg, bool is_patch_var)
903 {
904 int r;
905
906 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
907 /* ADD
908 Dimension - patch0_offset (input_vals.z),
909 Non-dim - patch0_data_offset (input_vals.w)
910 */
911 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
912 temp_reg, 0,
913 ctx->tess_output_info, 0,
914 0, rel_patch_chan,
915 ctx->tess_output_info, is_patch_var ? 3 : 2);
916 if (r)
917 return r;
918 return 0;
919 }
920
921 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
922 {
923 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
924 }
925
926 static int r600_get_temp(struct r600_shader_ctx *ctx)
927 {
928 return ctx->temp_reg + ctx->max_driver_temp_used++;
929 }
930
931 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
932 {
933 int i;
934 i = ctx->shader->noutput++;
935 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
936 ctx->shader->output[i].sid = 0;
937 ctx->shader->output[i].gpr = 0;
938 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
939 ctx->shader->output[i].write_mask = 0x4;
940 ctx->shader->output[i].spi_sid = prim_id_sid;
941
942 return 0;
943 }
944
945 static int tgsi_barrier(struct r600_shader_ctx *ctx)
946 {
947 struct r600_bytecode_alu alu;
948 int r;
949
950 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
951 alu.op = ctx->inst_info->op;
952 alu.last = 1;
953
954 r = r600_bytecode_add_alu(ctx->bc, &alu);
955 if (r)
956 return r;
957 return 0;
958 }
959
960 static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
961 {
962 // pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
963 unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
964 unsigned narrays_left = n;
965 bool *spilled = ctx->spilled_arrays; // assumed calloc:ed
966
967 *scratch_space_needed = 0;
968 while (*regno > 124 && narrays_left) {
969 unsigned i;
970 unsigned largest = 0;
971 unsigned largest_index = 0;
972
973 for (i = 0; i < n; i++) {
974 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
975 if (!spilled[i] && size > largest) {
976 largest = size;
977 largest_index = i;
978 }
979 }
980
981 spilled[largest_index] = true;
982 *regno -= largest;
983 *scratch_space_needed += largest;
984
985 narrays_left --;
986 }
987
988 if (narrays_left == 0) {
989 ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
990 }
991 }
992
993 /* Take spilled temp arrays into account when translating tgsi register
994 * indexes into r600 gprs if spilled is false, or scratch array offset if
995 * spilled is true */
996 static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
997 {
998 unsigned i;
999 unsigned spilled_size = 0;
1000
1001 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
1002 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
1003 if (ctx->spilled_arrays[i]) {
1004 /* vec4 index into spilled scratch memory */
1005 *spilled = true;
1006 return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
1007 }
1008 else {
1009 /* regular GPR array */
1010 *spilled = false;
1011 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
1012 }
1013 }
1014
1015 if (tgsi_reg_index < ctx->array_infos[i].range.First)
1016 break;
1017 if (ctx->spilled_arrays[i]) {
1018 spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
1019 }
1020 }
1021
1022 /* regular GPR index, minus the holes from spilled arrays */
1023 *spilled = false;
1024
1025 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
1026 }
1027
1028 /* look up spill area base offset and array size for a spilled temp array */
1029 static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
1030 unsigned *array_base, unsigned *array_size)
1031 {
1032 unsigned i;
1033 unsigned offset = 0;
1034
1035 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
1036 if (ctx->spilled_arrays[i]) {
1037 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
1038
1039 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
1040 *array_base = offset;
1041 *array_size = size - 1; /* hw counts from 1 */
1042
1043 return;
1044 }
1045
1046 offset += size;
1047 }
1048 }
1049 }
1050
1051 static int tgsi_declaration(struct r600_shader_ctx *ctx)
1052 {
1053 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
1054 int r, i, j, count = d->Range.Last - d->Range.First + 1;
1055
1056 switch (d->Declaration.File) {
1057 case TGSI_FILE_INPUT:
1058 for (j = 0; j < count; j++) {
1059 i = ctx->shader->ninput + j;
1060 assert(i < ARRAY_SIZE(ctx->shader->input));
1061 ctx->shader->input[i].name = d->Semantic.Name;
1062 ctx->shader->input[i].sid = d->Semantic.Index + j;
1063 ctx->shader->input[i].interpolate = d->Interp.Interpolate;
1064 ctx->shader->input[i].interpolate_location = d->Interp.Location;
1065 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
1066 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1067 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
1068 switch (ctx->shader->input[i].name) {
1069 case TGSI_SEMANTIC_FACE:
1070 if (ctx->face_gpr != -1)
1071 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
1072 else
1073 ctx->face_gpr = ctx->shader->input[i].gpr;
1074 break;
1075 case TGSI_SEMANTIC_COLOR:
1076 ctx->colors_used++;
1077 break;
1078 case TGSI_SEMANTIC_POSITION:
1079 ctx->fragcoord_input = i;
1080 break;
1081 case TGSI_SEMANTIC_PRIMID:
1082 /* set this for now */
1083 ctx->shader->gs_prim_id_input = true;
1084 ctx->shader->ps_prim_id_input = i;
1085 break;
1086 }
1087 if (ctx->bc->chip_class >= EVERGREEN) {
1088 if ((r = evergreen_interp_input(ctx, i)))
1089 return r;
1090 }
1091 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
1092 /* FIXME probably skip inputs if they aren't passed in the ring */
1093 ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
1094 ctx->next_ring_offset += 16;
1095 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
1096 ctx->shader->gs_prim_id_input = true;
1097 }
1098 }
1099 ctx->shader->ninput += count;
1100 break;
1101 case TGSI_FILE_OUTPUT:
1102 for (j = 0; j < count; j++) {
1103 i = ctx->shader->noutput + j;
1104 assert(i < ARRAY_SIZE(ctx->shader->output));
1105 ctx->shader->output[i].name = d->Semantic.Name;
1106 ctx->shader->output[i].sid = d->Semantic.Index + j;
1107 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
1108 ctx->shader->output[i].interpolate = d->Interp.Interpolate;
1109 ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
1110 if (ctx->type == PIPE_SHADER_VERTEX ||
1111 ctx->type == PIPE_SHADER_GEOMETRY ||
1112 ctx->type == PIPE_SHADER_TESS_EVAL) {
1113 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
1114 switch (d->Semantic.Name) {
1115 case TGSI_SEMANTIC_CLIPDIST:
1116 break;
1117 case TGSI_SEMANTIC_PSIZE:
1118 ctx->shader->vs_out_misc_write = 1;
1119 ctx->shader->vs_out_point_size = 1;
1120 break;
1121 case TGSI_SEMANTIC_EDGEFLAG:
1122 ctx->shader->vs_out_misc_write = 1;
1123 ctx->shader->vs_out_edgeflag = 1;
1124 ctx->edgeflag_output = i;
1125 break;
1126 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1127 ctx->shader->vs_out_misc_write = 1;
1128 ctx->shader->vs_out_viewport = 1;
1129 break;
1130 case TGSI_SEMANTIC_LAYER:
1131 ctx->shader->vs_out_misc_write = 1;
1132 ctx->shader->vs_out_layer = 1;
1133 break;
1134 case TGSI_SEMANTIC_CLIPVERTEX:
1135 ctx->clip_vertex_write = TRUE;
1136 ctx->cv_output = i;
1137 break;
1138 }
1139 if (ctx->type == PIPE_SHADER_GEOMETRY) {
1140 ctx->gs_out_ring_offset += 16;
1141 }
1142 } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
1143 switch (d->Semantic.Name) {
1144 case TGSI_SEMANTIC_COLOR:
1145 ctx->shader->nr_ps_max_color_exports++;
1146 break;
1147 }
1148 }
1149 }
1150 ctx->shader->noutput += count;
1151 break;
1152 case TGSI_FILE_TEMPORARY:
1153 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
1154 if (d->Array.ArrayID) {
1155 bool spilled;
1156 unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
1157 d->Range.First,
1158 &spilled);
1159
1160 if (!spilled) {
1161 r600_add_gpr_array(ctx->shader, idx,
1162 d->Range.Last - d->Range.First + 1, 0x0F);
1163 }
1164 }
1165 }
1166 break;
1167
1168 case TGSI_FILE_CONSTANT:
1169 case TGSI_FILE_SAMPLER:
1170 case TGSI_FILE_SAMPLER_VIEW:
1171 case TGSI_FILE_ADDRESS:
1172 case TGSI_FILE_BUFFER:
1173 case TGSI_FILE_IMAGE:
1174 case TGSI_FILE_MEMORY:
1175 break;
1176
1177 case TGSI_FILE_HW_ATOMIC:
1178 i = ctx->shader->nhwatomic_ranges;
1179 ctx->shader->atomics[i].start = d->Range.First;
1180 ctx->shader->atomics[i].end = d->Range.Last;
1181 ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
1182 ctx->shader->atomics[i].array_id = d->Array.ArrayID;
1183 ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
1184 ctx->shader->nhwatomic_ranges++;
1185 ctx->shader->nhwatomic += count;
1186 break;
1187
1188 case TGSI_FILE_SYSTEM_VALUE:
1189 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
1190 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
1191 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
1192 break; /* Already handled from allocate_system_value_inputs */
1193 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
1194 break;
1195 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
1196 break;
1197 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
1198 break;
1199 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
1200 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
1201 int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
1202 int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
1203 unsigned temp_reg = r600_get_temp(ctx);
1204
1205 r = get_lds_offset0(ctx, 2, temp_reg, true);
1206 if (r)
1207 return r;
1208
1209 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1210 temp_reg, 0,
1211 temp_reg, 0,
1212 V_SQ_ALU_SRC_LITERAL, param * 16);
1213 if (r)
1214 return r;
1215
1216 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
1217 }
1218 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
1219 /* MOV r1.x, r0.x;
1220 MOV r1.y, r0.y;
1221 */
1222 for (i = 0; i < 2; i++) {
1223 struct r600_bytecode_alu alu;
1224 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1225 alu.op = ALU_OP1_MOV;
1226 alu.src[0].sel = 0;
1227 alu.src[0].chan = 0 + i;
1228 alu.dst.sel = 1;
1229 alu.dst.chan = 0 + i;
1230 alu.dst.write = 1;
1231 alu.last = (i == 1) ? 1 : 0;
1232 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1233 return r;
1234 }
1235 /* ADD r1.z, 1.0f, -r0.x */
1236 struct r600_bytecode_alu alu;
1237 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1238 alu.op = ALU_OP2_ADD;
1239 alu.src[0].sel = V_SQ_ALU_SRC_1;
1240 alu.src[1].sel = 1;
1241 alu.src[1].chan = 0;
1242 alu.src[1].neg = 1;
1243 alu.dst.sel = 1;
1244 alu.dst.chan = 2;
1245 alu.dst.write = 1;
1246 alu.last = 1;
1247 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1248 return r;
1249
1250 /* ADD r1.z, r1.z, -r1.y */
1251 alu.op = ALU_OP2_ADD;
1252 alu.src[0].sel = 1;
1253 alu.src[0].chan = 2;
1254 alu.src[1].sel = 1;
1255 alu.src[1].chan = 1;
1256 alu.src[1].neg = 1;
1257 alu.dst.sel = 1;
1258 alu.dst.chan = 2;
1259 alu.dst.write = 1;
1260 alu.last = 1;
1261 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1262 return r;
1263 break;
1264 }
1265 break;
1266 default:
1267 R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
1268 return -EINVAL;
1269 }
1270 return 0;
1271 }
1272
1273 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
1274 {
1275 struct tgsi_parse_context parse;
1276 struct {
1277 boolean enabled;
1278 int *reg;
1279 unsigned name, alternate_name;
1280 } inputs[2] = {
1281 { false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
1282
1283 { false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
1284 };
1285 int num_regs = 0;
1286 unsigned k, i;
1287
1288 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1289 return 0;
1290 }
1291
1292 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1293 while (!tgsi_parse_end_of_tokens(&parse)) {
1294 tgsi_parse_token(&parse);
1295
1296 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1297 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1298 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1299 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1300 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1301 {
1302 int interpolate, location, k;
1303
1304 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1305 location = TGSI_INTERPOLATE_LOC_CENTER;
1306 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1307 location = TGSI_INTERPOLATE_LOC_CENTER;
1308 /* Needs sample positions, currently those are always available */
1309 } else {
1310 location = TGSI_INTERPOLATE_LOC_CENTROID;
1311 }
1312
1313 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1314 k = eg_get_interpolator_index(interpolate, location);
1315 if (k >= 0)
1316 ctx->eg_interpolators[k].enabled = true;
1317 }
1318 } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
1319 struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
1320 if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1321 for (k = 0; k < ARRAY_SIZE(inputs); k++) {
1322 if (d->Semantic.Name == inputs[k].name ||
1323 d->Semantic.Name == inputs[k].alternate_name) {
1324 inputs[k].enabled = true;
1325 }
1326 }
1327 }
1328 }
1329 }
1330
1331 tgsi_parse_free(&parse);
1332
1333 if (ctx->info.reads_samplemask &&
1334 (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
1335 inputs[1].enabled = true;
1336 }
1337
1338 if (ctx->bc->chip_class >= EVERGREEN) {
1339 int num_baryc = 0;
1340 /* assign gpr to each interpolator according to priority */
1341 for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
1342 if (ctx->eg_interpolators[i].enabled) {
1343 ctx->eg_interpolators[i].ij_index = num_baryc;
1344 num_baryc++;
1345 }
1346 }
1347 num_baryc = (num_baryc + 1) >> 1;
1348 gpr_offset += num_baryc;
1349 }
1350
1351 for (i = 0; i < ARRAY_SIZE(inputs); i++) {
1352 boolean enabled = inputs[i].enabled;
1353 int *reg = inputs[i].reg;
1354 unsigned name = inputs[i].name;
1355
1356 if (enabled) {
1357 int gpr = gpr_offset + num_regs++;
1358 ctx->shader->nsys_inputs++;
1359
1360 // add to inputs, allocate a gpr
1361 k = ctx->shader->ninput++;
1362 ctx->shader->input[k].name = name;
1363 ctx->shader->input[k].sid = 0;
1364 ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
1365 ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
1366 *reg = ctx->shader->input[k].gpr = gpr;
1367 }
1368 }
1369
1370 return gpr_offset + num_regs;
1371 }
1372
1373 /*
1374 * for evergreen we need to scan the shader to find the number of GPRs we need to
1375 * reserve for interpolation and system values
1376 *
1377 * we need to know if we are going to emit any sample or centroid inputs
1378 * if perspective and linear are required
1379 */
1380 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1381 {
1382 unsigned i;
1383
1384 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1385
1386 /*
1387 * Could get this information from the shader info. But right now
1388 * we interpolate all declared inputs, whereas the shader info will
1389 * only contain the bits if the inputs are actually used, so it might
1390 * not be safe...
1391 */
1392 for (i = 0; i < ctx->info.num_inputs; i++) {
1393 int k;
1394 /* skip position/face/mask/sampleid */
1395 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1396 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1397 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1398 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1399 continue;
1400
1401 k = eg_get_interpolator_index(
1402 ctx->info.input_interpolate[i],
1403 ctx->info.input_interpolate_loc[i]);
1404 if (k >= 0)
1405 ctx->eg_interpolators[k].enabled = TRUE;
1406 }
1407
1408 /* XXX PULL MODEL and LINE STIPPLE */
1409
1410 return allocate_system_value_inputs(ctx, 0);
1411 }
1412
1413 /* sample_id_sel == NULL means fetch for current sample */
1414 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1415 {
1416 struct r600_bytecode_vtx vtx;
1417 int r, t1;
1418
1419 t1 = r600_get_temp(ctx);
1420
1421 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1422 vtx.op = FETCH_OP_VFETCH;
1423 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1424 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1425 if (sample_id == NULL) {
1426 assert(ctx->fixed_pt_position_gpr != -1);
1427
1428 vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1429 vtx.src_sel_x = 3;
1430 }
1431 else {
1432 struct r600_bytecode_alu alu;
1433
1434 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1435 alu.op = ALU_OP1_MOV;
1436 r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1437 alu.dst.sel = t1;
1438 alu.dst.write = 1;
1439 alu.last = 1;
1440 r = r600_bytecode_add_alu(ctx->bc, &alu);
1441 if (r)
1442 return r;
1443
1444 vtx.src_gpr = t1;
1445 vtx.src_sel_x = 0;
1446 }
1447 vtx.mega_fetch_count = 16;
1448 vtx.dst_gpr = t1;
1449 vtx.dst_sel_x = 0;
1450 vtx.dst_sel_y = 1;
1451 vtx.dst_sel_z = 2;
1452 vtx.dst_sel_w = 3;
1453 vtx.data_format = FMT_32_32_32_32_FLOAT;
1454 vtx.num_format_all = 2;
1455 vtx.format_comp_all = 1;
1456 vtx.use_const_fields = 0;
1457 vtx.offset = 0;
1458 vtx.endian = r600_endian_swap(32);
1459 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1460
1461 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1462 if (r)
1463 return r;
1464
1465 return t1;
1466 }
1467
1468 static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
1469 {
1470 int r;
1471 struct r600_bytecode_alu alu;
1472
1473 /* do a vtx fetch with wqm set on the vtx fetch */
1474 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1475 alu.op = ALU_OP1_MOV;
1476 alu.dst.sel = ctx->helper_invoc_reg;
1477 alu.dst.chan = 0;
1478 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1479 alu.src[0].value = 0xffffffff;
1480 alu.dst.write = 1;
1481 alu.last = 1;
1482 r = r600_bytecode_add_alu(ctx->bc, &alu);
1483 if (r)
1484 return r;
1485
1486 /* do a vtx fetch in VPM mode */
1487 struct r600_bytecode_vtx vtx;
1488 memset(&vtx, 0, sizeof(vtx));
1489 vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
1490 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1491 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1492 vtx.src_gpr = 0;
1493 vtx.mega_fetch_count = 16; /* no idea here really... */
1494 vtx.dst_gpr = ctx->helper_invoc_reg;
1495 vtx.dst_sel_x = 4;
1496 vtx.dst_sel_y = 7; /* SEL_Y */
1497 vtx.dst_sel_z = 7; /* SEL_Z */
1498 vtx.dst_sel_w = 7; /* SEL_W */
1499 vtx.data_format = FMT_32;
1500 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
1501 return r;
1502 ctx->bc->cf_last->vpm = 1;
1503 return 0;
1504 }
1505
1506 static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
1507 {
1508 int r;
1509 struct r600_bytecode_alu alu;
1510
1511 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1512 alu.op = ALU_OP1_MOV;
1513 alu.dst.sel = ctx->helper_invoc_reg;
1514 alu.dst.chan = 0;
1515 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1516 alu.src[0].value = 0xffffffff;
1517 alu.dst.write = 1;
1518 alu.last = 1;
1519 r = r600_bytecode_add_alu(ctx->bc, &alu);
1520 if (r)
1521 return r;
1522
1523 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1524 alu.op = ALU_OP1_MOV;
1525 alu.dst.sel = ctx->helper_invoc_reg;
1526 alu.dst.chan = 0;
1527 alu.src[0].sel = V_SQ_ALU_SRC_0;
1528 alu.dst.write = 1;
1529 alu.last = 1;
1530 r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
1531 if (r)
1532 return r;
1533
1534 return ctx->helper_invoc_reg;
1535 }
1536
1537 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1538 {
1539 struct r600_bytecode_vtx vtx;
1540 int r, t1;
1541
1542 if (ctx->cs_block_size_loaded)
1543 return ctx->cs_block_size_reg;
1544 if (ctx->cs_grid_size_loaded)
1545 return ctx->cs_grid_size_reg;
1546
1547 t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1548 struct r600_bytecode_alu alu;
1549 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1550 alu.op = ALU_OP1_MOV;
1551 alu.src[0].sel = V_SQ_ALU_SRC_0;
1552 alu.dst.sel = t1;
1553 alu.dst.write = 1;
1554 alu.last = 1;
1555 r = r600_bytecode_add_alu(ctx->bc, &alu);
1556 if (r)
1557 return r;
1558
1559 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1560 vtx.op = FETCH_OP_VFETCH;
1561 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1562 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1563 vtx.src_gpr = t1;
1564 vtx.src_sel_x = 0;
1565
1566 vtx.mega_fetch_count = 16;
1567 vtx.dst_gpr = t1;
1568 vtx.dst_sel_x = 0;
1569 vtx.dst_sel_y = 1;
1570 vtx.dst_sel_z = 2;
1571 vtx.dst_sel_w = 7;
1572 vtx.data_format = FMT_32_32_32_32;
1573 vtx.num_format_all = 1;
1574 vtx.format_comp_all = 0;
1575 vtx.use_const_fields = 0;
1576 vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1577 vtx.endian = r600_endian_swap(32);
1578 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1579
1580 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1581 if (r)
1582 return r;
1583
1584 if (load_block)
1585 ctx->cs_block_size_loaded = true;
1586 else
1587 ctx->cs_grid_size_loaded = true;
1588 return t1;
1589 }
1590
1591 static void tgsi_src(struct r600_shader_ctx *ctx,
1592 const struct tgsi_full_src_register *tgsi_src,
1593 struct r600_shader_src *r600_src)
1594 {
1595 memset(r600_src, 0, sizeof(*r600_src));
1596 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1597 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1598 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1599 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1600 r600_src->neg = tgsi_src->Register.Negate;
1601 r600_src->abs = tgsi_src->Register.Absolute;
1602
1603 if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
1604 bool spilled;
1605 unsigned idx;
1606
1607 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);
1608
1609 if (spilled) {
1610 int reg = r600_get_temp(ctx);
1611 int r;
1612
1613 r600_src->sel = reg;
1614
1615 if (ctx->bc->chip_class < R700) {
1616 struct r600_bytecode_output cf;
1617
1618 memset(&cf, 0, sizeof(struct r600_bytecode_output));
1619 cf.op = CF_OP_MEM_SCRATCH;
1620 cf.elem_size = 3;
1621 cf.gpr = reg;
1622 cf.comp_mask = 0xF;
1623 cf.swizzle_x = 0;
1624 cf.swizzle_y = 1;
1625 cf.swizzle_z = 2;
1626 cf.swizzle_w = 3;
1627 cf.burst_count = 1;
1628
1629 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
1630 &cf.array_base, &cf.array_size);
1631
1632 if (tgsi_src->Register.Indirect) {
1633 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
1634 cf.index_gpr = ctx->bc->ar_reg;
1635 }
1636 else {
1637 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
1638 cf.array_base += idx;
1639 cf.array_size = 0;
1640 }
1641
1642 r = r600_bytecode_add_output(ctx->bc, &cf);
1643 }
1644 else {
1645 struct r600_bytecode_vtx vtx;
1646
1647 if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
1648 r600_bytecode_need_wait_ack(ctx->bc, false);
1649 r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
1650 }
1651
1652 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1653 vtx.op = FETCH_OP_READ_SCRATCH;
1654 vtx.dst_gpr = reg;
1655 vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
1656 vtx.elem_size = 3;
1657 vtx.data_format = FMT_32_32_32_32;
1658 vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
1659 vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
1660 vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
1661 vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
1662 vtx.dst_sel_w = tgsi_src->Register.SwizzleW;
1663
1664 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
1665 &vtx.array_base, &vtx.array_size);
1666
1667 if (tgsi_src->Register.Indirect) {
1668 vtx.indexed = 1;
1669 vtx.src_gpr = ctx->bc->ar_reg;
1670 }
1671 else {
1672 vtx.array_base += idx;
1673 vtx.array_size = 0;
1674 }
1675
1676 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1677 }
1678
1679 if (r)
1680 return;
1681 }
1682 else {
1683 if (tgsi_src->Register.Indirect)
1684 r600_src->rel = V_SQ_REL_RELATIVE;
1685
1686 r600_src->sel = idx;
1687 }
1688
1689 return;
1690 }
1691
1692 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1693 int index;
1694 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1695 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1696 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1697
1698 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1699 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1700 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1701 return;
1702 }
1703 index = tgsi_src->Register.Index;
1704 r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1705 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1706 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1707 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1708 r600_src->swizzle[0] = 2; // Z value
1709 r600_src->swizzle[1] = 2;
1710 r600_src->swizzle[2] = 2;
1711 r600_src->swizzle[3] = 2;
1712 r600_src->sel = ctx->face_gpr;
1713 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1714 r600_src->swizzle[0] = 3; // W value
1715 r600_src->swizzle[1] = 3;
1716 r600_src->swizzle[2] = 3;
1717 r600_src->swizzle[3] = 3;
1718 r600_src->sel = ctx->fixed_pt_position_gpr;
1719 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1720 r600_src->swizzle[0] = 0;
1721 r600_src->swizzle[1] = 1;
1722 r600_src->swizzle[2] = 4;
1723 r600_src->swizzle[3] = 4;
1724 r600_src->sel = load_sample_position(ctx, NULL, -1);
1725 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1726 r600_src->swizzle[0] = 3;
1727 r600_src->swizzle[1] = 3;
1728 r600_src->swizzle[2] = 3;
1729 r600_src->swizzle[3] = 3;
1730 r600_src->sel = 0;
1731 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1732 r600_src->swizzle[0] = 0;
1733 r600_src->swizzle[1] = 0;
1734 r600_src->swizzle[2] = 0;
1735 r600_src->swizzle[3] = 0;
1736 r600_src->sel = 0;
1737 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
1738 r600_src->sel = 0;
1739 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
1740 r600_src->sel = 1;
1741 } else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1742 r600_src->swizzle[0] = 3;
1743 r600_src->swizzle[1] = 3;
1744 r600_src->swizzle[2] = 3;
1745 r600_src->swizzle[3] = 3;
1746 r600_src->sel = 1;
1747 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1748 r600_src->swizzle[0] = 2;
1749 r600_src->swizzle[1] = 2;
1750 r600_src->swizzle[2] = 2;
1751 r600_src->swizzle[3] = 2;
1752 r600_src->sel = 0;
1753 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
1754 r600_src->sel = 1;
1755 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
1756 r600_src->sel = 3;
1757 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
1758 r600_src->sel = 2;
1759 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
1760 r600_src->sel = ctx->tess_input_info;
1761 r600_src->swizzle[0] = 2;
1762 r600_src->swizzle[1] = 2;
1763 r600_src->swizzle[2] = 2;
1764 r600_src->swizzle[3] = 2;
1765 } else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1766 r600_src->sel = 0;
1767 r600_src->swizzle[0] = 0;
1768 r600_src->swizzle[1] = 0;
1769 r600_src->swizzle[2] = 0;
1770 r600_src->swizzle[3] = 0;
1771 } else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1772 r600_src->sel = 0;
1773 r600_src->swizzle[0] = 3;
1774 r600_src->swizzle[1] = 3;
1775 r600_src->swizzle[2] = 3;
1776 r600_src->swizzle[3] = 3;
1777 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
1778 r600_src->sel = load_block_grid_size(ctx, false);
1779 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
1780 r600_src->sel = load_block_grid_size(ctx, true);
1781 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
1782 r600_src->sel = ctx->helper_invoc_reg;
1783 r600_src->swizzle[0] = 0;
1784 r600_src->swizzle[1] = 0;
1785 r600_src->swizzle[2] = 0;
1786 r600_src->swizzle[3] = 0;
1787 }
1788 } else {
1789 if (tgsi_src->Register.Indirect)
1790 r600_src->rel = V_SQ_REL_RELATIVE;
1791 r600_src->sel = tgsi_src->Register.Index;
1792 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1793 }
1794 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1795 if (tgsi_src->Register.Dimension) {
1796 r600_src->kc_bank = tgsi_src->Dimension.Index;
1797 if (tgsi_src->Dimension.Indirect) {
1798 r600_src->kc_rel = 1;
1799 }
1800 }
1801 }
1802 }
1803
1804 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1805 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1806 unsigned int dst_reg)
1807 {
1808 struct r600_bytecode_vtx vtx;
1809 unsigned int ar_reg;
1810 int r;
1811
1812 if (offset) {
1813 struct r600_bytecode_alu alu;
1814
1815 memset(&alu, 0, sizeof(alu));
1816
1817 alu.op = ALU_OP2_ADD_INT;
1818 alu.src[0].sel = ctx->bc->ar_reg;
1819 alu.src[0].chan = ar_chan;
1820
1821 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1822 alu.src[1].value = offset;
1823
1824 alu.dst.sel = dst_reg;
1825 alu.dst.chan = ar_chan;
1826 alu.dst.write = 1;
1827 alu.last = 1;
1828
1829 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1830 return r;
1831
1832 ar_reg = dst_reg;
1833 } else {
1834 ar_reg = ctx->bc->ar_reg;
1835 }
1836
1837 memset(&vtx, 0, sizeof(vtx));
1838 vtx.buffer_id = cb_idx;
1839 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1840 vtx.src_gpr = ar_reg;
1841 vtx.src_sel_x = ar_chan;
1842 vtx.mega_fetch_count = 16;
1843 vtx.dst_gpr = dst_reg;
1844 vtx.dst_sel_x = 0; /* SEL_X */
1845 vtx.dst_sel_y = 1; /* SEL_Y */
1846 vtx.dst_sel_z = 2; /* SEL_Z */
1847 vtx.dst_sel_w = 3; /* SEL_W */
1848 vtx.data_format = FMT_32_32_32_32_FLOAT;
1849 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1850 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1851 vtx.endian = r600_endian_swap(32);
1852 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1853
1854 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1855 return r;
1856
1857 return 0;
1858 }
1859
1860 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1861 {
1862 struct r600_bytecode_vtx vtx;
1863 int r;
1864 unsigned index = src->Register.Index;
1865 unsigned vtx_id = src->Dimension.Index;
1866 int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
1867 int offset_chan = vtx_id % 3;
1868 int t2 = 0;
1869
1870 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1871 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1872
1873 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
1874 offset_chan = 3;
1875
1876 if (src->Dimension.Indirect || src->Register.Indirect)
1877 t2 = r600_get_temp(ctx);
1878
1879 if (src->Dimension.Indirect) {
1880 int treg[3];
1881 struct r600_bytecode_alu alu;
1882 int r, i;
1883 unsigned addr_reg;
1884 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1885 if (src->DimIndirect.Index > 0) {
1886 r = single_alu_op2(ctx, ALU_OP1_MOV,
1887 ctx->bc->ar_reg, 0,
1888 addr_reg, 0,
1889 0, 0);
1890 if (r)
1891 return r;
1892 }
1893 /*
1894 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1895 at least this is what fglrx seems to do. */
1896 for (i = 0; i < 3; i++) {
1897 treg[i] = r600_get_temp(ctx);
1898 }
1899 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1900
1901 for (i = 0; i < 3; i++) {
1902 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1903 alu.op = ALU_OP1_MOV;
1904 alu.src[0].sel = ctx->gs_rotated_input[0];
1905 alu.src[0].chan = i == 2 ? 3 : i;
1906 alu.dst.sel = treg[i];
1907 alu.dst.chan = 0;
1908 alu.dst.write = 1;
1909 alu.last = 1;
1910 r = r600_bytecode_add_alu(ctx->bc, &alu);
1911 if (r)
1912 return r;
1913 }
1914 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1915 alu.op = ALU_OP1_MOV;
1916 alu.src[0].sel = treg[0];
1917 alu.src[0].rel = 1;
1918 alu.dst.sel = t2;
1919 alu.dst.write = 1;
1920 alu.last = 1;
1921 r = r600_bytecode_add_alu(ctx->bc, &alu);
1922 if (r)
1923 return r;
1924 offset_reg = t2;
1925 offset_chan = 0;
1926 }
1927
1928 if (src->Register.Indirect) {
1929 int addr_reg;
1930 unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1931
1932 addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1933
1934 /* pull the value from index_reg */
1935 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1936 t2, 1,
1937 addr_reg, 0,
1938 V_SQ_ALU_SRC_LITERAL, first);
1939 if (r)
1940 return r;
1941 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1942 t2, 0,
1943 t2, 1,
1944 V_SQ_ALU_SRC_LITERAL, 4,
1945 offset_reg, offset_chan);
1946 if (r)
1947 return r;
1948 offset_reg = t2;
1949 offset_chan = 0;
1950 index = src->Register.Index - first;
1951 }
1952
1953 memset(&vtx, 0, sizeof(vtx));
1954 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1955 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1956 vtx.src_gpr = offset_reg;
1957 vtx.src_sel_x = offset_chan;
1958 vtx.offset = index * 16; /*bytes*/
1959 vtx.mega_fetch_count = 16;
1960 vtx.dst_gpr = dst_reg;
1961 vtx.dst_sel_x = 0; /* SEL_X */
1962 vtx.dst_sel_y = 1; /* SEL_Y */
1963 vtx.dst_sel_z = 2; /* SEL_Z */
1964 vtx.dst_sel_w = 3; /* SEL_W */
1965 if (ctx->bc->chip_class >= EVERGREEN) {
1966 vtx.use_const_fields = 1;
1967 } else {
1968 vtx.data_format = FMT_32_32_32_32_FLOAT;
1969 }
1970
1971 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1972 return r;
1973
1974 return 0;
1975 }
1976
1977 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1978 {
1979 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1980 unsigned i;
1981
1982 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1983 struct tgsi_full_src_register *src = &inst->Src[i];
1984
1985 if (src->Register.File == TGSI_FILE_INPUT) {
1986 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1987 /* primitive id is in R0.z */
1988 ctx->src[i].sel = 0;
1989 ctx->src[i].swizzle[0] = 2;
1990 }
1991 }
1992 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1993 int treg = r600_get_temp(ctx);
1994
1995 fetch_gs_input(ctx, src, treg);
1996 ctx->src[i].sel = treg;
1997 ctx->src[i].rel = 0;
1998 }
1999 }
2000 return 0;
2001 }
2002
2003
2004 /* Tessellation shaders pass outputs to the next shader using LDS.
2005 *
2006 * LS outputs = TCS(HS) inputs
2007 * TCS(HS) outputs = TES(DS) inputs
2008 *
2009 * The LDS layout is:
2010 * - TCS inputs for patch 0
2011 * - TCS inputs for patch 1
2012 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
2013 * - ...
2014 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
2015 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
2016 * - TCS outputs for patch 1
2017 * - Per-patch TCS outputs for patch 1
2018 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
2019 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
2020 * - ...
2021 *
2022 * All three shaders VS(LS), TCS, TES share the same LDS space.
2023 */
2024 /* this will return with the dw address in temp_reg.x */
2025 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
2026 const struct tgsi_full_dst_register *dst,
2027 const struct tgsi_full_src_register *src,
2028 int stride_bytes_reg, int stride_bytes_chan)
2029 {
2030 struct tgsi_full_dst_register reg;
2031 ubyte *name, *index, *array_first;
2032 int r;
2033 int param;
2034 struct tgsi_shader_info *info = &ctx->info;
2035 /* Set the register description. The address computation is the same
2036 * for sources and destinations. */
2037 if (src) {
2038 reg.Register.File = src->Register.File;
2039 reg.Register.Index = src->Register.Index;
2040 reg.Register.Indirect = src->Register.Indirect;
2041 reg.Register.Dimension = src->Register.Dimension;
2042 reg.Indirect = src->Indirect;
2043 reg.Dimension = src->Dimension;
2044 reg.DimIndirect = src->DimIndirect;
2045 } else
2046 reg = *dst;
2047
2048 /* If the register is 2-dimensional (e.g. an array of vertices
2049 * in a primitive), calculate the base address of the vertex. */
2050 if (reg.Register.Dimension) {
2051 int sel, chan;
2052 if (reg.Dimension.Indirect) {
2053 unsigned addr_reg;
2054 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
2055
2056 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
2057 /* pull the value from index_reg */
2058 sel = addr_reg;
2059 chan = 0;
2060 } else {
2061 sel = V_SQ_ALU_SRC_LITERAL;
2062 chan = reg.Dimension.Index;
2063 }
2064
2065 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2066 temp_reg, 0,
2067 stride_bytes_reg, stride_bytes_chan,
2068 sel, chan,
2069 temp_reg, 0);
2070 if (r)
2071 return r;
2072 }
2073
2074 if (reg.Register.File == TGSI_FILE_INPUT) {
2075 name = info->input_semantic_name;
2076 index = info->input_semantic_index;
2077 array_first = info->input_array_first;
2078 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
2079 name = info->output_semantic_name;
2080 index = info->output_semantic_index;
2081 array_first = info->output_array_first;
2082 } else {
2083 assert(0);
2084 return -1;
2085 }
2086 if (reg.Register.Indirect) {
2087 int addr_reg;
2088 int first;
2089 /* Add the relative address of the element. */
2090 if (reg.Indirect.ArrayID)
2091 first = array_first[reg.Indirect.ArrayID];
2092 else
2093 first = reg.Register.Index;
2094
2095 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
2096
2097 /* pull the value from index_reg */
2098 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2099 temp_reg, 0,
2100 V_SQ_ALU_SRC_LITERAL, 16,
2101 addr_reg, 0,
2102 temp_reg, 0);
2103 if (r)
2104 return r;
2105
2106 param = r600_get_lds_unique_index(name[first],
2107 index[first]);
2108
2109 } else {
2110 param = r600_get_lds_unique_index(name[reg.Register.Index],
2111 index[reg.Register.Index]);
2112 }
2113
2114 /* add to base_addr - passed in temp_reg.x */
2115 if (param) {
2116 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2117 temp_reg, 0,
2118 temp_reg, 0,
2119 V_SQ_ALU_SRC_LITERAL, param * 16);
2120 if (r)
2121 return r;
2122
2123 }
2124 return 0;
2125 }
2126
2127 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
2128 unsigned dst_reg, unsigned mask)
2129 {
2130 struct r600_bytecode_alu alu;
2131 int r, i, lasti;
2132
2133 if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
2134 ctx->bc->force_add_cf = 1;
2135
2136 lasti = tgsi_last_instruction(mask);
2137 for (i = 1; i <= lasti; i++) {
2138 if (!(mask & (1 << i)))
2139 continue;
2140
2141 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2142 temp_reg, i,
2143 temp_reg, 0,
2144 V_SQ_ALU_SRC_LITERAL, 4 * i);
2145 if (r)
2146 return r;
2147 }
2148 for (i = 0; i <= lasti; i++) {
2149 if (!(mask & (1 << i)))
2150 continue;
2151
2152 /* emit an LDS_READ_RET */
2153 memset(&alu, 0, sizeof(alu));
2154 alu.op = LDS_OP1_LDS_READ_RET;
2155 alu.src[0].sel = temp_reg;
2156 alu.src[0].chan = i;
2157 alu.src[1].sel = V_SQ_ALU_SRC_0;
2158 alu.src[2].sel = V_SQ_ALU_SRC_0;
2159 alu.dst.chan = 0;
2160 alu.is_lds_idx_op = true;
2161 alu.last = 1;
2162 r = r600_bytecode_add_alu(ctx->bc, &alu);
2163 if (r)
2164 return r;
2165 }
2166 for (i = 0; i <= lasti; i++) {
2167 if (!(mask & (1 << i)))
2168 continue;
2169
2170 /* then read from LDS_OQ_A_POP */
2171 memset(&alu, 0, sizeof(alu));
2172
2173 alu.op = ALU_OP1_MOV;
2174 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
2175 alu.src[0].chan = 0;
2176 alu.dst.sel = dst_reg;
2177 alu.dst.chan = i;
2178 alu.dst.write = 1;
2179 alu.last = 1;
2180 r = r600_bytecode_add_alu(ctx->bc, &alu);
2181 if (r)
2182 return r;
2183 }
2184 return 0;
2185 }
2186
2187 static int fetch_mask(struct tgsi_src_register *reg)
2188 {
2189 int mask = 0;
2190 mask |= 1 << reg->SwizzleX;
2191 mask |= 1 << reg->SwizzleY;
2192 mask |= 1 << reg->SwizzleZ;
2193 mask |= 1 << reg->SwizzleW;
2194 return mask;
2195 }
2196
2197 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2198 {
2199 int r;
2200 unsigned temp_reg = r600_get_temp(ctx);
2201
2202 r = get_lds_offset0(ctx, 2, temp_reg,
2203 src->Register.Dimension ? false : true);
2204 if (r)
2205 return r;
2206
2207 /* the base address is now in temp.x */
2208 r = r600_get_byte_address(ctx, temp_reg,
2209 NULL, src, ctx->tess_output_info, 1);
2210 if (r)
2211 return r;
2212
2213 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2214 if (r)
2215 return r;
2216 return 0;
2217 }
2218
2219 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2220 {
2221 int r;
2222 unsigned temp_reg = r600_get_temp(ctx);
2223
2224 /* t.x = ips * r0.y */
2225 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2226 temp_reg, 0,
2227 ctx->tess_input_info, 0,
2228 0, 1);
2229
2230 if (r)
2231 return r;
2232
2233 /* the base address is now in temp.x */
2234 r = r600_get_byte_address(ctx, temp_reg,
2235 NULL, src, ctx->tess_input_info, 1);
2236 if (r)
2237 return r;
2238
2239 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2240 if (r)
2241 return r;
2242 return 0;
2243 }
2244
2245 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2246 {
2247 int r;
2248 unsigned temp_reg = r600_get_temp(ctx);
2249
2250 r = get_lds_offset0(ctx, 1, temp_reg,
2251 src->Register.Dimension ? false : true);
2252 if (r)
2253 return r;
2254 /* the base address is now in temp.x */
2255 r = r600_get_byte_address(ctx, temp_reg,
2256 NULL, src,
2257 ctx->tess_output_info, 1);
2258 if (r)
2259 return r;
2260
2261 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2262 if (r)
2263 return r;
2264 return 0;
2265 }
2266
2267 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2268 {
2269 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2270 unsigned i;
2271
2272 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2273 struct tgsi_full_src_register *src = &inst->Src[i];
2274
2275 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2276 int treg = r600_get_temp(ctx);
2277 fetch_tes_input(ctx, src, treg);
2278 ctx->src[i].sel = treg;
2279 ctx->src[i].rel = 0;
2280 }
2281 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2282 int treg = r600_get_temp(ctx);
2283 fetch_tcs_input(ctx, src, treg);
2284 ctx->src[i].sel = treg;
2285 ctx->src[i].rel = 0;
2286 }
2287 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2288 int treg = r600_get_temp(ctx);
2289 fetch_tcs_output(ctx, src, treg);
2290 ctx->src[i].sel = treg;
2291 ctx->src[i].rel = 0;
2292 }
2293 }
2294 return 0;
2295 }
2296
2297 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
2298 {
2299 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2300 struct r600_bytecode_alu alu;
2301 int i, j, k, nconst, r;
2302
2303 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
2304 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
2305 nconst++;
2306 }
2307 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
2308 }
2309 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
2310 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
2311 continue;
2312 }
2313
2314 if (ctx->src[i].rel) {
2315 int chan = inst->Src[i].Indirect.Swizzle;
2316 int treg = r600_get_temp(ctx);
2317 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
2318 return r;
2319
2320 ctx->src[i].kc_bank = 0;
2321 ctx->src[i].kc_rel = 0;
2322 ctx->src[i].sel = treg;
2323 ctx->src[i].rel = 0;
2324 j--;
2325 } else if (j > 0) {
2326 int treg = r600_get_temp(ctx);
2327 for (k = 0; k < 4; k++) {
2328 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2329 alu.op = ALU_OP1_MOV;
2330 alu.src[0].sel = ctx->src[i].sel;
2331 alu.src[0].chan = k;
2332 alu.src[0].rel = ctx->src[i].rel;
2333 alu.src[0].kc_bank = ctx->src[i].kc_bank;
2334 alu.src[0].kc_rel = ctx->src[i].kc_rel;
2335 alu.dst.sel = treg;
2336 alu.dst.chan = k;
2337 alu.dst.write = 1;
2338 if (k == 3)
2339 alu.last = 1;
2340 r = r600_bytecode_add_alu(ctx->bc, &alu);
2341 if (r)
2342 return r;
2343 }
2344 ctx->src[i].sel = treg;
2345 ctx->src[i].rel =0;
2346 j--;
2347 }
2348 }
2349 return 0;
2350 }
2351
2352 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
2353 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2354 {
2355 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2356 struct r600_bytecode_alu alu;
2357 int i, j, k, nliteral, r;
2358
2359 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2360 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2361 nliteral++;
2362 }
2363 }
2364 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2365 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2366 int treg = r600_get_temp(ctx);
2367 for (k = 0; k < 4; k++) {
2368 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2369 alu.op = ALU_OP1_MOV;
2370 alu.src[0].sel = ctx->src[i].sel;
2371 alu.src[0].chan = k;
2372 alu.src[0].value = ctx->src[i].value[k];
2373 alu.dst.sel = treg;
2374 alu.dst.chan = k;
2375 alu.dst.write = 1;
2376 if (k == 3)
2377 alu.last = 1;
2378 r = r600_bytecode_add_alu(ctx->bc, &alu);
2379 if (r)
2380 return r;
2381 }
2382 ctx->src[i].sel = treg;
2383 j--;
2384 }
2385 }
2386 return 0;
2387 }
2388
2389 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2390 {
2391 int i, r, count = ctx->shader->ninput;
2392
2393 for (i = 0; i < count; i++) {
2394 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2395 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2396 if (r)
2397 return r;
2398 }
2399 }
2400 return 0;
2401 }
2402
2403 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2404 int stream, unsigned *stream_item_size UNUSED)
2405 {
2406 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2407 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2408 int j, r;
2409 unsigned i;
2410
2411 /* Sanity checking. */
2412 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2413 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2414 r = -EINVAL;
2415 goto out_err;
2416 }
2417 for (i = 0; i < so->num_outputs; i++) {
2418 if (so->output[i].output_buffer >= 4) {
2419 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2420 so->output[i].output_buffer);
2421 r = -EINVAL;
2422 goto out_err;
2423 }
2424 }
2425
2426 /* Initialize locations where the outputs are stored. */
2427 for (i = 0; i < so->num_outputs; i++) {
2428
2429 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2430 start_comp[i] = so->output[i].start_component;
2431 /* Lower outputs with dst_offset < start_component.
2432 *
2433 * We can only output 4D vectors with a write mask, e.g. we can
2434 * only output the W component at offset 3, etc. If we want
2435 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2436 * to move it to X and output X. */
2437 if (so->output[i].dst_offset < so->output[i].start_component) {
2438 unsigned tmp = r600_get_temp(ctx);
2439
2440 for (j = 0; j < so->output[i].num_components; j++) {
2441 struct r600_bytecode_alu alu;
2442 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2443 alu.op = ALU_OP1_MOV;
2444 alu.src[0].sel = so_gpr[i];
2445 alu.src[0].chan = so->output[i].start_component + j;
2446
2447 alu.dst.sel = tmp;
2448 alu.dst.chan = j;
2449 alu.dst.write = 1;
2450 if (j == so->output[i].num_components - 1)
2451 alu.last = 1;
2452 r = r600_bytecode_add_alu(ctx->bc, &alu);
2453 if (r)
2454 return r;
2455 }
2456 start_comp[i] = 0;
2457 so_gpr[i] = tmp;
2458 }
2459 }
2460
2461 /* Write outputs to buffers. */
2462 for (i = 0; i < so->num_outputs; i++) {
2463 struct r600_bytecode_output output;
2464
2465 if (stream != -1 && stream != so->output[i].stream)
2466 continue;
2467
2468 memset(&output, 0, sizeof(struct r600_bytecode_output));
2469 output.gpr = so_gpr[i];
2470 output.elem_size = so->output[i].num_components - 1;
2471 if (output.elem_size == 2)
2472 output.elem_size = 3; // 3 not supported, write 4 with junk at end
2473 output.array_base = so->output[i].dst_offset - start_comp[i];
2474 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2475 output.burst_count = 1;
2476 /* array_size is an upper limit for the burst_count
2477 * with MEM_STREAM instructions */
2478 output.array_size = 0xFFF;
2479 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2480
2481 if (ctx->bc->chip_class >= EVERGREEN) {
2482 switch (so->output[i].output_buffer) {
2483 case 0:
2484 output.op = CF_OP_MEM_STREAM0_BUF0;
2485 break;
2486 case 1:
2487 output.op = CF_OP_MEM_STREAM0_BUF1;
2488 break;
2489 case 2:
2490 output.op = CF_OP_MEM_STREAM0_BUF2;
2491 break;
2492 case 3:
2493 output.op = CF_OP_MEM_STREAM0_BUF3;
2494 break;
2495 }
2496 output.op += so->output[i].stream * 4;
2497 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2498 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2499 } else {
2500 switch (so->output[i].output_buffer) {
2501 case 0:
2502 output.op = CF_OP_MEM_STREAM0;
2503 break;
2504 case 1:
2505 output.op = CF_OP_MEM_STREAM1;
2506 break;
2507 case 2:
2508 output.op = CF_OP_MEM_STREAM2;
2509 break;
2510 case 3:
2511 output.op = CF_OP_MEM_STREAM3;
2512 break;
2513 }
2514 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2515 }
2516 r = r600_bytecode_add_output(ctx->bc, &output);
2517 if (r)
2518 goto out_err;
2519 }
2520 return 0;
2521 out_err:
2522 return r;
2523 }
2524
2525 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2526 {
2527 struct r600_bytecode_alu alu;
2528 unsigned reg;
2529
2530 if (!ctx->shader->vs_out_edgeflag)
2531 return;
2532
2533 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2534
2535 /* clamp(x, 0, 1) */
2536 memset(&alu, 0, sizeof(alu));
2537 alu.op = ALU_OP1_MOV;
2538 alu.src[0].sel = reg;
2539 alu.dst.sel = reg;
2540 alu.dst.write = 1;
2541 alu.dst.clamp = 1;
2542 alu.last = 1;
2543 r600_bytecode_add_alu(ctx->bc, &alu);
2544
2545 memset(&alu, 0, sizeof(alu));
2546 alu.op = ALU_OP1_FLT_TO_INT;
2547 alu.src[0].sel = reg;
2548 alu.dst.sel = reg;
2549 alu.dst.write = 1;
2550 alu.last = 1;
2551 r600_bytecode_add_alu(ctx->bc, &alu);
2552 }
2553
2554 int generate_gs_copy_shader(struct r600_context *rctx,
2555 struct r600_pipe_shader *gs,
2556 struct pipe_stream_output_info *so)
2557 {
2558 struct r600_shader_ctx ctx = {};
2559 struct r600_shader *gs_shader = &gs->shader;
2560 struct r600_pipe_shader *cshader;
2561 unsigned ocnt = gs_shader->noutput;
2562 struct r600_bytecode_alu alu;
2563 struct r600_bytecode_vtx vtx;
2564 struct r600_bytecode_output output;
2565 struct r600_bytecode_cf *cf_jump, *cf_pop,
2566 *last_exp_pos = NULL, *last_exp_param = NULL;
2567 int next_clip_pos = 61, next_param = 0;
2568 unsigned i, j;
2569 int ring;
2570 bool only_ring_0 = true;
2571 cshader = calloc(1, sizeof(struct r600_pipe_shader));
2572 if (!cshader)
2573 return 0;
2574
2575 memcpy(cshader->shader.output, gs_shader->output, ocnt *
2576 sizeof(struct r600_shader_io));
2577
2578 cshader->shader.noutput = ocnt;
2579
2580 ctx.shader = &cshader->shader;
2581 ctx.bc = &ctx.shader->bc;
2582 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2583
2584 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2585 rctx->screen->has_compressed_msaa_texturing);
2586
2587 ctx.bc->isa = rctx->isa;
2588
2589 cf_jump = NULL;
2590 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2591
2592 /* R0.x = R0.x & 0x3fffffff */
2593 memset(&alu, 0, sizeof(alu));
2594 alu.op = ALU_OP2_AND_INT;
2595 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2596 alu.src[1].value = 0x3fffffff;
2597 alu.dst.write = 1;
2598 r600_bytecode_add_alu(ctx.bc, &alu);
2599
2600 /* R0.y = R0.x >> 30 */
2601 memset(&alu, 0, sizeof(alu));
2602 alu.op = ALU_OP2_LSHR_INT;
2603 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2604 alu.src[1].value = 0x1e;
2605 alu.dst.chan = 1;
2606 alu.dst.write = 1;
2607 alu.last = 1;
2608 r600_bytecode_add_alu(ctx.bc, &alu);
2609
2610 /* fetch vertex data from GSVS ring */
2611 for (i = 0; i < ocnt; ++i) {
2612 struct r600_shader_io *out = &ctx.shader->output[i];
2613
2614 out->gpr = i + 1;
2615 out->ring_offset = i * 16;
2616
2617 memset(&vtx, 0, sizeof(vtx));
2618 vtx.op = FETCH_OP_VFETCH;
2619 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2620 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2621 vtx.mega_fetch_count = 16;
2622 vtx.offset = out->ring_offset;
2623 vtx.dst_gpr = out->gpr;
2624 vtx.src_gpr = 0;
2625 vtx.dst_sel_x = 0;
2626 vtx.dst_sel_y = 1;
2627 vtx.dst_sel_z = 2;
2628 vtx.dst_sel_w = 3;
2629 if (rctx->b.chip_class >= EVERGREEN) {
2630 vtx.use_const_fields = 1;
2631 } else {
2632 vtx.data_format = FMT_32_32_32_32_FLOAT;
2633 }
2634
2635 r600_bytecode_add_vtx(ctx.bc, &vtx);
2636 }
2637 ctx.temp_reg = i + 1;
2638 for (ring = 3; ring >= 0; --ring) {
2639 bool enabled = false;
2640 for (i = 0; i < so->num_outputs; i++) {
2641 if (so->output[i].stream == ring) {
2642 enabled = true;
2643 if (ring > 0)
2644 only_ring_0 = false;
2645 break;
2646 }
2647 }
2648 if (ring != 0 && !enabled) {
2649 cshader->shader.ring_item_sizes[ring] = 0;
2650 continue;
2651 }
2652
2653 if (cf_jump) {
2654 // Patch up jump label
2655 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2656 cf_pop = ctx.bc->cf_last;
2657
2658 cf_jump->cf_addr = cf_pop->id + 2;
2659 cf_jump->pop_count = 1;
2660 cf_pop->cf_addr = cf_pop->id + 2;
2661 cf_pop->pop_count = 1;
2662 }
2663
2664 /* PRED_SETE_INT __, R0.y, ring */
2665 memset(&alu, 0, sizeof(alu));
2666 alu.op = ALU_OP2_PRED_SETE_INT;
2667 alu.src[0].chan = 1;
2668 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2669 alu.src[1].value = ring;
2670 alu.execute_mask = 1;
2671 alu.update_pred = 1;
2672 alu.last = 1;
2673 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2674
2675 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2676 cf_jump = ctx.bc->cf_last;
2677
2678 if (enabled)
2679 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2680 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2681 }
2682
2683 /* bc adds nops - copy it */
2684 if (ctx.bc->chip_class == R600) {
2685 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2686 alu.op = ALU_OP0_NOP;
2687 alu.last = 1;
2688 r600_bytecode_add_alu(ctx.bc, &alu);
2689
2690 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2691 }
2692
2693 /* export vertex data */
2694 /* XXX factor out common code with r600_shader_from_tgsi ? */
2695 for (i = 0; i < ocnt; ++i) {
2696 struct r600_shader_io *out = &ctx.shader->output[i];
2697 bool instream0 = true;
2698 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2699 continue;
2700
2701 for (j = 0; j < so->num_outputs; j++) {
2702 if (so->output[j].register_index == i) {
2703 if (so->output[j].stream == 0)
2704 break;
2705 if (so->output[j].stream > 0)
2706 instream0 = false;
2707 }
2708 }
2709 if (!instream0)
2710 continue;
2711 memset(&output, 0, sizeof(output));
2712 output.gpr = out->gpr;
2713 output.elem_size = 3;
2714 output.swizzle_x = 0;
2715 output.swizzle_y = 1;
2716 output.swizzle_z = 2;
2717 output.swizzle_w = 3;
2718 output.burst_count = 1;
2719 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2720 output.op = CF_OP_EXPORT;
2721 switch (out->name) {
2722 case TGSI_SEMANTIC_POSITION:
2723 output.array_base = 60;
2724 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2725 break;
2726
2727 case TGSI_SEMANTIC_PSIZE:
2728 output.array_base = 61;
2729 if (next_clip_pos == 61)
2730 next_clip_pos = 62;
2731 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2732 output.swizzle_y = 7;
2733 output.swizzle_z = 7;
2734 output.swizzle_w = 7;
2735 ctx.shader->vs_out_misc_write = 1;
2736 ctx.shader->vs_out_point_size = 1;
2737 break;
2738 case TGSI_SEMANTIC_LAYER:
2739 if (out->spi_sid) {
2740 /* duplicate it as PARAM to pass to the pixel shader */
2741 output.array_base = next_param++;
2742 r600_bytecode_add_output(ctx.bc, &output);
2743 last_exp_param = ctx.bc->cf_last;
2744 }
2745 output.array_base = 61;
2746 if (next_clip_pos == 61)
2747 next_clip_pos = 62;
2748 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2749 output.swizzle_x = 7;
2750 output.swizzle_y = 7;
2751 output.swizzle_z = 0;
2752 output.swizzle_w = 7;
2753 ctx.shader->vs_out_misc_write = 1;
2754 ctx.shader->vs_out_layer = 1;
2755 break;
2756 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2757 if (out->spi_sid) {
2758 /* duplicate it as PARAM to pass to the pixel shader */
2759 output.array_base = next_param++;
2760 r600_bytecode_add_output(ctx.bc, &output);
2761 last_exp_param = ctx.bc->cf_last;
2762 }
2763 output.array_base = 61;
2764 if (next_clip_pos == 61)
2765 next_clip_pos = 62;
2766 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2767 ctx.shader->vs_out_misc_write = 1;
2768 ctx.shader->vs_out_viewport = 1;
2769 output.swizzle_x = 7;
2770 output.swizzle_y = 7;
2771 output.swizzle_z = 7;
2772 output.swizzle_w = 0;
2773 break;
2774 case TGSI_SEMANTIC_CLIPDIST:
2775 /* spi_sid is 0 for clipdistance outputs that were generated
2776 * for clipvertex - we don't need to pass them to PS */
2777 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2778 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2779 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2780 if (out->spi_sid) {
2781 /* duplicate it as PARAM to pass to the pixel shader */
2782 output.array_base = next_param++;
2783 r600_bytecode_add_output(ctx.bc, &output);
2784 last_exp_param = ctx.bc->cf_last;
2785 }
2786 output.array_base = next_clip_pos++;
2787 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2788 break;
2789 case TGSI_SEMANTIC_FOG:
2790 output.swizzle_y = 4; /* 0 */
2791 output.swizzle_z = 4; /* 0 */
2792 output.swizzle_w = 5; /* 1 */
2793 break;
2794 default:
2795 output.array_base = next_param++;
2796 break;
2797 }
2798 r600_bytecode_add_output(ctx.bc, &output);
2799 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2800 last_exp_param = ctx.bc->cf_last;
2801 else
2802 last_exp_pos = ctx.bc->cf_last;
2803 }
2804
2805 if (!last_exp_pos) {
2806 memset(&output, 0, sizeof(output));
2807 output.gpr = 0;
2808 output.elem_size = 3;
2809 output.swizzle_x = 7;
2810 output.swizzle_y = 7;
2811 output.swizzle_z = 7;
2812 output.swizzle_w = 7;
2813 output.burst_count = 1;
2814 output.type = 2;
2815 output.op = CF_OP_EXPORT;
2816 output.array_base = 60;
2817 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2818 r600_bytecode_add_output(ctx.bc, &output);
2819 last_exp_pos = ctx.bc->cf_last;
2820 }
2821
2822 if (!last_exp_param) {
2823 memset(&output, 0, sizeof(output));
2824 output.gpr = 0;
2825 output.elem_size = 3;
2826 output.swizzle_x = 7;
2827 output.swizzle_y = 7;
2828 output.swizzle_z = 7;
2829 output.swizzle_w = 7;
2830 output.burst_count = 1;
2831 output.type = 2;
2832 output.op = CF_OP_EXPORT;
2833 output.array_base = next_param++;
2834 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2835 r600_bytecode_add_output(ctx.bc, &output);
2836 last_exp_param = ctx.bc->cf_last;
2837 }
2838
2839 last_exp_pos->op = CF_OP_EXPORT_DONE;
2840 last_exp_param->op = CF_OP_EXPORT_DONE;
2841
2842 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2843 cf_pop = ctx.bc->cf_last;
2844
2845 cf_jump->cf_addr = cf_pop->id + 2;
2846 cf_jump->pop_count = 1;
2847 cf_pop->cf_addr = cf_pop->id + 2;
2848 cf_pop->pop_count = 1;
2849
2850 if (ctx.bc->chip_class == CAYMAN)
2851 cm_bytecode_add_cf_end(ctx.bc);
2852 else {
2853 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2854 ctx.bc->cf_last->end_of_program = 1;
2855 }
2856
2857 gs->gs_copy_shader = cshader;
2858 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2859
2860 ctx.bc->nstack = 1;
2861
2862 return r600_bytecode_build(ctx.bc);
2863 }
2864
2865 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2866 {
2867 if (ind) {
2868 struct r600_bytecode_alu alu;
2869 int r;
2870
2871 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2872 alu.op = ALU_OP2_ADD_INT;
2873 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2874 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2875 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2876 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2877 alu.dst.write = 1;
2878 alu.last = 1;
2879 r = r600_bytecode_add_alu(ctx->bc, &alu);
2880 if (r)
2881 return r;
2882 }
2883 return 0;
2884 }
2885
2886 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
2887 {
2888 struct r600_bytecode_output output;
2889 int ring_offset;
2890 unsigned i, k;
2891 int effective_stream = stream == -1 ? 0 : stream;
2892 int idx = 0;
2893
2894 for (i = 0; i < ctx->shader->noutput; i++) {
2895 if (ctx->gs_for_vs) {
2896 /* for ES we need to lookup corresponding ring offset expected by GS
2897 * (map this output to GS input by name and sid) */
2898 /* FIXME precompute offsets */
2899 ring_offset = -1;
2900 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2901 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2902 struct r600_shader_io *out = &ctx->shader->output[i];
2903 if (in->name == out->name && in->sid == out->sid)
2904 ring_offset = in->ring_offset;
2905 }
2906
2907 if (ring_offset == -1)
2908 continue;
2909 } else {
2910 ring_offset = idx * 16;
2911 idx++;
2912 }
2913
2914 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2915 continue;
2916 /* next_ring_offset after parsing input decls contains total size of
2917 * single vertex data, gs_next_vertex - current vertex index */
2918 if (!ind)
2919 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2920
2921 memset(&output, 0, sizeof(struct r600_bytecode_output));
2922 output.gpr = ctx->shader->output[i].gpr;
2923 output.elem_size = 3;
2924 output.comp_mask = 0xF;
2925 output.burst_count = 1;
2926
2927 if (ind)
2928 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2929 else
2930 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2931
2932 switch (stream) {
2933 default:
2934 case 0:
2935 output.op = CF_OP_MEM_RING; break;
2936 case 1:
2937 output.op = CF_OP_MEM_RING1; break;
2938 case 2:
2939 output.op = CF_OP_MEM_RING2; break;
2940 case 3:
2941 output.op = CF_OP_MEM_RING3; break;
2942 }
2943
2944 if (ind) {
2945 output.array_base = ring_offset >> 2; /* in dwords */
2946 output.array_size = 0xfff;
2947 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2948 } else
2949 output.array_base = ring_offset >> 2; /* in dwords */
2950 r600_bytecode_add_output(ctx->bc, &output);
2951 }
2952
2953 ++ctx->gs_next_vertex;
2954 return 0;
2955 }
2956
2957
2958 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2959 {
2960 int r;
2961 struct r600_bytecode_vtx vtx;
2962 int temp_val = ctx->temp_reg;
2963 /* need to store the TCS output somewhere */
2964 r = single_alu_op2(ctx, ALU_OP1_MOV,
2965 temp_val, 0,
2966 V_SQ_ALU_SRC_LITERAL, 0,
2967 0, 0);
2968 if (r)
2969 return r;
2970
2971 /* used by VS/TCS */
2972 if (ctx->tess_input_info) {
2973 /* fetch tcs input values into resv space */
2974 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2975 vtx.op = FETCH_OP_VFETCH;
2976 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2977 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2978 vtx.mega_fetch_count = 16;
2979 vtx.data_format = FMT_32_32_32_32;
2980 vtx.num_format_all = 2;
2981 vtx.format_comp_all = 1;
2982 vtx.use_const_fields = 0;
2983 vtx.endian = r600_endian_swap(32);
2984 vtx.srf_mode_all = 1;
2985 vtx.offset = 0;
2986 vtx.dst_gpr = ctx->tess_input_info;
2987 vtx.dst_sel_x = 0;
2988 vtx.dst_sel_y = 1;
2989 vtx.dst_sel_z = 2;
2990 vtx.dst_sel_w = 3;
2991 vtx.src_gpr = temp_val;
2992 vtx.src_sel_x = 0;
2993
2994 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2995 if (r)
2996 return r;
2997 }
2998
2999 /* used by TCS/TES */
3000 if (ctx->tess_output_info) {
3001 /* fetch tcs output values into resv space */
3002 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
3003 vtx.op = FETCH_OP_VFETCH;
3004 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
3005 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
3006 vtx.mega_fetch_count = 16;
3007 vtx.data_format = FMT_32_32_32_32;
3008 vtx.num_format_all = 2;
3009 vtx.format_comp_all = 1;
3010 vtx.use_const_fields = 0;
3011 vtx.endian = r600_endian_swap(32);
3012 vtx.srf_mode_all = 1;
3013 vtx.offset = 16;
3014 vtx.dst_gpr = ctx->tess_output_info;
3015 vtx.dst_sel_x = 0;
3016 vtx.dst_sel_y = 1;
3017 vtx.dst_sel_z = 2;
3018 vtx.dst_sel_w = 3;
3019 vtx.src_gpr = temp_val;
3020 vtx.src_sel_x = 0;
3021
3022 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3023 if (r)
3024 return r;
3025 }
3026 return 0;
3027 }
3028
3029 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
3030 {
3031 int j, r;
3032 int temp_reg;
3033 unsigned i;
3034
3035 /* fetch tcs input values into input_vals */
3036 ctx->tess_input_info = r600_get_temp(ctx);
3037 ctx->tess_output_info = 0;
3038 r = r600_fetch_tess_io_info(ctx);
3039 if (r)
3040 return r;
3041
3042 temp_reg = r600_get_temp(ctx);
3043 /* dst reg contains LDS address stride * idx */
3044 /* MUL vertexID, vertex_dw_stride */
3045 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
3046 temp_reg, 0,
3047 ctx->tess_input_info, 1,
3048 0, 1); /* rel id in r0.y? */
3049 if (r)
3050 return r;
3051
3052 for (i = 0; i < ctx->shader->noutput; i++) {
3053 struct r600_bytecode_alu alu;
3054 int param = r600_get_lds_unique_index(ctx->shader->output[i].name,
3055 ctx->shader->output[i].sid);
3056
3057 if (param) {
3058 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3059 temp_reg, 1,
3060 temp_reg, 0,
3061 V_SQ_ALU_SRC_LITERAL, param * 16);
3062 if (r)
3063 return r;
3064 }
3065
3066 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3067 temp_reg, 2,
3068 temp_reg, param ? 1 : 0,
3069 V_SQ_ALU_SRC_LITERAL, 8);
3070 if (r)
3071 return r;
3072
3073
3074 for (j = 0; j < 2; j++) {
3075 int chan = (j == 1) ? 2 : (param ? 1 : 0);
3076 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3077 alu.op = LDS_OP3_LDS_WRITE_REL;
3078 alu.src[0].sel = temp_reg;
3079 alu.src[0].chan = chan;
3080 alu.src[1].sel = ctx->shader->output[i].gpr;
3081 alu.src[1].chan = j * 2;
3082 alu.src[2].sel = ctx->shader->output[i].gpr;
3083 alu.src[2].chan = (j * 2) + 1;
3084 alu.last = 1;
3085 alu.dst.chan = 0;
3086 alu.lds_idx = 1;
3087 alu.is_lds_idx_op = true;
3088 r = r600_bytecode_add_alu(ctx->bc, &alu);
3089 if (r)
3090 return r;
3091 }
3092 }
3093 return 0;
3094 }
3095
3096 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
3097 {
3098 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3099 const struct tgsi_full_dst_register *dst = &inst->Dst[0];
3100 int i, r, lasti;
3101 int temp_reg = r600_get_temp(ctx);
3102 struct r600_bytecode_alu alu;
3103 unsigned write_mask = dst->Register.WriteMask;
3104
3105 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
3106 return 0;
3107
3108 r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
3109 if (r)
3110 return r;
3111
3112 /* the base address is now in temp.x */
3113 r = r600_get_byte_address(ctx, temp_reg,
3114 &inst->Dst[0], NULL, ctx->tess_output_info, 1);
3115 if (r)
3116 return r;
3117
3118 /* LDS write */
3119 lasti = tgsi_last_instruction(write_mask);
3120 for (i = 1; i <= lasti; i++) {
3121
3122 if (!(write_mask & (1 << i)))
3123 continue;
3124 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3125 temp_reg, i,
3126 temp_reg, 0,
3127 V_SQ_ALU_SRC_LITERAL, 4 * i);
3128 if (r)
3129 return r;
3130 }
3131
3132 for (i = 0; i <= lasti; i++) {
3133 if (!(write_mask & (1 << i)))
3134 continue;
3135
3136 if ((i == 0 && ((write_mask & 3) == 3)) ||
3137 (i == 2 && ((write_mask & 0xc) == 0xc))) {
3138 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3139 alu.op = LDS_OP3_LDS_WRITE_REL;
3140 alu.src[0].sel = temp_reg;
3141 alu.src[0].chan = i;
3142
3143 alu.src[1].sel = dst->Register.Index;
3144 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3145 alu.src[1].chan = i;
3146
3147 alu.src[2].sel = dst->Register.Index;
3148 alu.src[2].sel += ctx->file_offset[dst->Register.File];
3149 alu.src[2].chan = i + 1;
3150 alu.lds_idx = 1;
3151 alu.dst.chan = 0;
3152 alu.last = 1;
3153 alu.is_lds_idx_op = true;
3154 r = r600_bytecode_add_alu(ctx->bc, &alu);
3155 if (r)
3156 return r;
3157 i += 1;
3158 continue;
3159 }
3160 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3161 alu.op = LDS_OP2_LDS_WRITE;
3162 alu.src[0].sel = temp_reg;
3163 alu.src[0].chan = i;
3164
3165 alu.src[1].sel = dst->Register.Index;
3166 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3167 alu.src[1].chan = i;
3168
3169 alu.src[2].sel = V_SQ_ALU_SRC_0;
3170 alu.dst.chan = 0;
3171 alu.last = 1;
3172 alu.is_lds_idx_op = true;
3173 r = r600_bytecode_add_alu(ctx->bc, &alu);
3174 if (r)
3175 return r;
3176 }
3177 return 0;
3178 }
3179
3180 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3181 int output_idx, int nc)
3182 {
3183 int param;
3184 unsigned temp_reg = r600_get_temp(ctx);
3185 unsigned name = ctx->shader->output[output_idx].name;
3186 int dreg = ctx->shader->output[output_idx].gpr;
3187 int r;
3188
3189 param = r600_get_lds_unique_index(name, 0);
3190 r = get_lds_offset0(ctx, 1, temp_reg, true);
3191 if (r)
3192 return r;
3193
3194 if (param) {
3195 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3196 temp_reg, 0,
3197 temp_reg, 0,
3198 V_SQ_ALU_SRC_LITERAL, param * 16);
3199 if (r)
3200 return r;
3201 }
3202
3203 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3204 return 0;
3205 }
3206
3207 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
3208 {
3209 int stride, outer_comps, inner_comps;
3210 int tessinner_idx = -1, tessouter_idx = -1;
3211 int i, r;
3212 unsigned j;
3213 int temp_reg = r600_get_temp(ctx);
3214 int treg[3] = {-1, -1, -1};
3215 struct r600_bytecode_alu alu;
3216 struct r600_bytecode_cf *cf_jump, *cf_pop;
3217
3218 /* only execute factor emission for invocation 0 */
3219 /* PRED_SETE_INT __, R0.x, 0 */
3220 memset(&alu, 0, sizeof(alu));
3221 alu.op = ALU_OP2_PRED_SETE_INT;
3222 alu.src[0].chan = 2;
3223 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3224 alu.execute_mask = 1;
3225 alu.update_pred = 1;
3226 alu.last = 1;
3227 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
3228
3229 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
3230 cf_jump = ctx->bc->cf_last;
3231
3232 treg[0] = r600_get_temp(ctx);
3233 switch (ctx->shader->tcs_prim_mode) {
3234 case PIPE_PRIM_LINES:
3235 stride = 8; /* 2 dwords, 1 vec2 store */
3236 outer_comps = 2;
3237 inner_comps = 0;
3238 break;
3239 case PIPE_PRIM_TRIANGLES:
3240 stride = 16; /* 4 dwords, 1 vec4 store */
3241 outer_comps = 3;
3242 inner_comps = 1;
3243 treg[1] = r600_get_temp(ctx);
3244 break;
3245 case PIPE_PRIM_QUADS:
3246 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
3247 outer_comps = 4;
3248 inner_comps = 2;
3249 treg[1] = r600_get_temp(ctx);
3250 treg[2] = r600_get_temp(ctx);
3251 break;
3252 default:
3253 assert(0);
3254 return -1;
3255 }
3256
3257 /* R0 is InvocationID, RelPatchID, PatchID, tf_base */
3258 /* TF_WRITE takes index in R.x, value in R.y */
3259 for (j = 0; j < ctx->shader->noutput; j++) {
3260 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
3261 tessinner_idx = j;
3262 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
3263 tessouter_idx = j;
3264 }
3265
3266 if (tessouter_idx == -1)
3267 return -1;
3268
3269 if (tessinner_idx == -1 && inner_comps)
3270 return -1;
3271
3272 if (tessouter_idx != -1) {
3273 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
3274 if (r)
3275 return r;
3276 }
3277
3278 if (tessinner_idx != -1) {
3279 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
3280 if (r)
3281 return r;
3282 }
3283
3284 /* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
3285 /* r.x = relpatchid(r0.y) * tf_stride */
3286
3287 /* multiply incoming r0.y * stride - t.x = r0.y * stride */
3288 /* add incoming r0.w to it: t.x = t.x + r0.w */
3289 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3290 temp_reg, 0,
3291 0, 1,
3292 V_SQ_ALU_SRC_LITERAL, stride,
3293 0, 3);
3294 if (r)
3295 return r;
3296
3297 for (i = 0; i < outer_comps + inner_comps; i++) {
3298 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
3299 int out_comp = i >= outer_comps ? i - outer_comps : i;
3300
3301 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
3302 if (out_comp == 1)
3303 out_comp = 0;
3304 else if (out_comp == 0)
3305 out_comp = 1;
3306 }
3307
3308 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3309 treg[i / 2], (2 * (i % 2)),
3310 temp_reg, 0,
3311 V_SQ_ALU_SRC_LITERAL, 4 * i);
3312 if (r)
3313 return r;
3314 r = single_alu_op2(ctx, ALU_OP1_MOV,
3315 treg[i / 2], 1 + (2 * (i%2)),
3316 ctx->shader->output[out_idx].gpr, out_comp,
3317 0, 0);
3318 if (r)
3319 return r;
3320 }
3321 for (i = 0; i < outer_comps + inner_comps; i++) {
3322 struct r600_bytecode_gds gds;
3323
3324 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
3325 gds.src_gpr = treg[i / 2];
3326 gds.src_sel_x = 2 * (i % 2);
3327 gds.src_sel_y = 1 + (2 * (i % 2));
3328 gds.src_sel_z = 4;
3329 gds.dst_sel_x = 7;
3330 gds.dst_sel_y = 7;
3331 gds.dst_sel_z = 7;
3332 gds.dst_sel_w = 7;
3333 gds.op = FETCH_OP_TF_WRITE;
3334 r = r600_bytecode_add_gds(ctx->bc, &gds);
3335 if (r)
3336 return r;
3337 }
3338
3339 // Patch up jump label
3340 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3341 cf_pop = ctx->bc->cf_last;
3342
3343 cf_jump->cf_addr = cf_pop->id + 2;
3344 cf_jump->pop_count = 1;
3345 cf_pop->cf_addr = cf_pop->id + 2;
3346 cf_pop->pop_count = 1;
3347
3348 return 0;
3349 }
3350
3351 /*
3352 * We have to work out the thread ID for load and atomic
3353 * operations, which store the returned value to an index
3354 * in an intermediate buffer.
3355 * The index is calculated by taking the thread id,
3356 * calculated from the MBCNT instructions.
3357 * Then the shader engine ID is multiplied by 256,
3358 * and the wave id is added.
3359 * Then the result is multipled by 64 and thread id is
3360 * added.
3361 */
3362 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3363 {
3364 struct r600_bytecode_alu alu;
3365 int r;
3366
3367 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3368 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3369 alu.dst.sel = ctx->temp_reg;
3370 alu.dst.chan = 0;
3371 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3372 alu.src[0].value = 0xffffffff;
3373 alu.dst.write = 1;
3374 r = r600_bytecode_add_alu(ctx->bc, &alu);
3375 if (r)
3376 return r;
3377
3378 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3379 alu.op = ALU_OP1_MBCNT_32HI_INT;
3380 alu.dst.sel = ctx->temp_reg;
3381 alu.dst.chan = 1;
3382 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3383 alu.src[0].value = 0xffffffff;
3384 alu.dst.write = 1;
3385 r = r600_bytecode_add_alu(ctx->bc, &alu);
3386 if (r)
3387 return r;
3388
3389 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3390 alu.op = ALU_OP3_MULADD_UINT24;
3391 alu.dst.sel = ctx->temp_reg;
3392 alu.dst.chan = 2;
3393 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3394 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3395 alu.src[1].value = 256;
3396 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3397 alu.dst.write = 1;
3398 alu.is_op3 = 1;
3399 alu.last = 1;
3400 r = r600_bytecode_add_alu(ctx->bc, &alu);
3401 if (r)
3402 return r;
3403
3404 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3405 ctx->thread_id_gpr, 1,
3406 ctx->temp_reg, 2,
3407 V_SQ_ALU_SRC_LITERAL, 0x40,
3408 ctx->temp_reg, 0);
3409 if (r)
3410 return r;
3411 return 0;
3412 }
3413
3414 static int r600_shader_from_tgsi(struct r600_context *rctx,
3415 struct r600_pipe_shader *pipeshader,
3416 union r600_shader_key key)
3417 {
3418 struct r600_screen *rscreen = rctx->screen;
3419 struct r600_shader *shader = &pipeshader->shader;
3420 struct tgsi_token *tokens = pipeshader->selector->tokens;
3421 struct pipe_stream_output_info so = pipeshader->selector->so;
3422 struct tgsi_full_immediate *immediate;
3423 struct r600_shader_ctx ctx;
3424 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3425 unsigned output_done, noutput;
3426 unsigned opcode;
3427 int j, k, r = 0;
3428 unsigned i;
3429 int next_param_base = 0, next_clip_base;
3430 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3431 bool indirect_gprs;
3432 bool ring_outputs = false;
3433 bool lds_outputs = false;
3434 bool lds_inputs = false;
3435 bool pos_emitted = false;
3436
3437 ctx.bc = &shader->bc;
3438 ctx.shader = shader;
3439
3440 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3441 rscreen->has_compressed_msaa_texturing);
3442 ctx.tokens = tokens;
3443 tgsi_scan_shader(tokens, &ctx.info);
3444 shader->indirect_files = ctx.info.indirect_files;
3445
3446 int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3447 ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3448 ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3449 tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3450
3451 shader->uses_helper_invocation = false;
3452 shader->uses_doubles = ctx.info.uses_doubles;
3453 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3454 shader->nsys_inputs = 0;
3455
3456 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3457 ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3458 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3459 tgsi_parse_init(&ctx.parse, tokens);
3460 ctx.type = ctx.info.processor;
3461 shader->processor_type = ctx.type;
3462 ctx.bc->type = shader->processor_type;
3463
3464 switch (ctx.type) {
3465 case PIPE_SHADER_VERTEX:
3466 shader->vs_as_gs_a = key.vs.as_gs_a;
3467 shader->vs_as_es = key.vs.as_es;
3468 shader->vs_as_ls = key.vs.as_ls;
3469 shader->atomic_base = key.vs.first_atomic_counter;
3470 if (shader->vs_as_es)
3471 ring_outputs = true;
3472 if (shader->vs_as_ls)
3473 lds_outputs = true;
3474 break;
3475 case PIPE_SHADER_GEOMETRY:
3476 ring_outputs = true;
3477 shader->atomic_base = key.gs.first_atomic_counter;
3478 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3479 break;
3480 case PIPE_SHADER_TESS_CTRL:
3481 shader->tcs_prim_mode = key.tcs.prim_mode;
3482 shader->atomic_base = key.tcs.first_atomic_counter;
3483 lds_outputs = true;
3484 lds_inputs = true;
3485 break;
3486 case PIPE_SHADER_TESS_EVAL:
3487 shader->tes_as_es = key.tes.as_es;
3488 shader->atomic_base = key.tes.first_atomic_counter;
3489 lds_inputs = true;
3490 if (shader->tes_as_es)
3491 ring_outputs = true;
3492 break;
3493 case PIPE_SHADER_FRAGMENT:
3494 shader->two_side = key.ps.color_two_side;
3495 shader->atomic_base = key.ps.first_atomic_counter;
3496 shader->rat_base = key.ps.nr_cbufs;
3497 shader->image_size_const_offset = key.ps.image_size_const_offset;
3498 break;
3499 case PIPE_SHADER_COMPUTE:
3500 shader->rat_base = 0;
3501 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3502 break;
3503 default:
3504 break;
3505 }
3506
3507 if (shader->vs_as_es || shader->tes_as_es) {
3508 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3509 } else {
3510 ctx.gs_for_vs = NULL;
3511 }
3512
3513 ctx.next_ring_offset = 0;
3514 ctx.gs_out_ring_offset = 0;
3515 ctx.gs_next_vertex = 0;
3516 ctx.gs_stream_output_info = &so;
3517
3518 ctx.thread_id_gpr = -1;
3519 ctx.face_gpr = -1;
3520 ctx.fixed_pt_position_gpr = -1;
3521 ctx.fragcoord_input = -1;
3522 ctx.colors_used = 0;
3523 ctx.clip_vertex_write = 0;
3524
3525 ctx.helper_invoc_reg = -1;
3526 ctx.cs_block_size_reg = -1;
3527 ctx.cs_grid_size_reg = -1;
3528 ctx.cs_block_size_loaded = false;
3529 ctx.cs_grid_size_loaded = false;
3530
3531 shader->nr_ps_color_exports = 0;
3532 shader->nr_ps_max_color_exports = 0;
3533
3534
3535 /* register allocations */
3536 /* Values [0,127] correspond to GPR[0..127].
3537 * Values [128,159] correspond to constant buffer bank 0
3538 * Values [160,191] correspond to constant buffer bank 1
3539 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3540 * Values [256,287] correspond to constant buffer bank 2 (EG)
3541 * Values [288,319] correspond to constant buffer bank 3 (EG)
3542 * Other special values are shown in the list below.
3543 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3544 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3545 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3546 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3547 * 248 SQ_ALU_SRC_0: special constant 0.0.
3548 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3549 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3550 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3551 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3552 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3553 * 254 SQ_ALU_SRC_PV: previous vector result.
3554 * 255 SQ_ALU_SRC_PS: previous scalar result.
3555 */
3556 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3557 ctx.file_offset[i] = 0;
3558 }
3559
3560 if (ctx.type == PIPE_SHADER_VERTEX) {
3561
3562 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3563 if (ctx.info.num_inputs)
3564 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3565 }
3566 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3567 if (ctx.bc->chip_class >= EVERGREEN)
3568 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3569 else
3570 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3571
3572 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3573 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3574 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3575 shader->uses_helper_invocation = true;
3576 }
3577 }
3578 }
3579 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3580 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
3581 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3582 }
3583 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3584 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3585 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3586 bool add_tesscoord = false, add_tess_inout = false;
3587 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3588 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3589 /* if we have tesscoord save one reg */
3590 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3591 add_tesscoord = true;
3592 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3593 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3594 add_tess_inout = true;
3595 }
3596 if (add_tesscoord || add_tess_inout)
3597 ctx.file_offset[TGSI_FILE_INPUT]++;
3598 if (add_tess_inout)
3599 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3600 }
3601 if (ctx.type == PIPE_SHADER_COMPUTE) {
3602 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3603 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3604 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3605 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3606 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3607 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3608 }
3609 }
3610
3611 ctx.file_offset[TGSI_FILE_OUTPUT] =
3612 ctx.file_offset[TGSI_FILE_INPUT] +
3613 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3614 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3615 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3616
3617 /* Outside the GPR range. This will be translated to one of the
3618 * kcache banks later. */
3619 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3620 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3621
3622 pipeshader->scratch_space_needed = 0;
3623 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3624 ctx.info.file_max[TGSI_FILE_TEMPORARY];
3625 if (regno > 124) {
3626 choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3627 shader->indirect_files = ctx.info.indirect_files;
3628 }
3629 shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3630
3631 ctx.bc->ar_reg = ++regno;
3632 ctx.bc->index_reg[0] = ++regno;
3633 ctx.bc->index_reg[1] = ++regno;
3634
3635 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3636 ctx.tess_input_info = ++regno;
3637 ctx.tess_output_info = ++regno;
3638 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3639 ctx.tess_input_info = ++regno;
3640 ctx.tess_output_info = ++regno;
3641 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3642 ctx.gs_export_gpr_tregs[0] = ++regno;
3643 ctx.gs_export_gpr_tregs[1] = ++regno;
3644 ctx.gs_export_gpr_tregs[2] = ++regno;
3645 ctx.gs_export_gpr_tregs[3] = ++regno;
3646 if (ctx.shader->gs_tri_strip_adj_fix) {
3647 ctx.gs_rotated_input[0] = ++regno;
3648 ctx.gs_rotated_input[1] = ++regno;
3649 } else {
3650 ctx.gs_rotated_input[0] = 0;
3651 ctx.gs_rotated_input[1] = 1;
3652 }
3653 }
3654
3655 if (shader->uses_images) {
3656 ctx.thread_id_gpr = ++regno;
3657 }
3658 ctx.temp_reg = ++regno;
3659
3660 shader->max_arrays = 0;
3661 shader->num_arrays = 0;
3662 if (indirect_gprs) {
3663
3664 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3665 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3666 ctx.file_offset[TGSI_FILE_OUTPUT] -
3667 ctx.file_offset[TGSI_FILE_INPUT],
3668 0x0F);
3669 }
3670 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3671 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3672 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3673 ctx.file_offset[TGSI_FILE_OUTPUT],
3674 0x0F);
3675 }
3676 }
3677
3678 ctx.nliterals = 0;
3679 ctx.literals = NULL;
3680 ctx.max_driver_temp_used = 0;
3681
3682 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3683 ctx.info.colors_written == 1;
3684 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3685 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3686
3687 if (ctx.type == PIPE_SHADER_VERTEX ||
3688 ctx.type == PIPE_SHADER_GEOMETRY ||
3689 ctx.type == PIPE_SHADER_TESS_EVAL) {
3690 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3691 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3692 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3693 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3694 }
3695
3696 if (shader->vs_as_gs_a)
3697 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3698
3699 if (ctx.thread_id_gpr != -1) {
3700 r = load_thread_id_gpr(&ctx);
3701 if (r)
3702 return r;
3703 }
3704
3705 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3706 r600_fetch_tess_io_info(&ctx);
3707
3708 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3709 tgsi_parse_token(&ctx.parse);
3710 switch (ctx.parse.FullToken.Token.Type) {
3711 case TGSI_TOKEN_TYPE_IMMEDIATE:
3712 immediate = &ctx.parse.FullToken.FullImmediate;
3713 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3714 if(ctx.literals == NULL) {
3715 r = -ENOMEM;
3716 goto out_err;
3717 }
3718 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3719 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3720 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3721 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3722 ctx.nliterals++;
3723 break;
3724 case TGSI_TOKEN_TYPE_DECLARATION:
3725 r = tgsi_declaration(&ctx);
3726 if (r)
3727 goto out_err;
3728 break;
3729 case TGSI_TOKEN_TYPE_INSTRUCTION:
3730 case TGSI_TOKEN_TYPE_PROPERTY:
3731 break;
3732 default:
3733 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3734 r = -EINVAL;
3735 goto out_err;
3736 }
3737 }
3738
3739 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3740 shader->ring_item_sizes[1] = 0;
3741 shader->ring_item_sizes[2] = 0;
3742 shader->ring_item_sizes[3] = 0;
3743
3744 /* Process two side if needed */
3745 if (shader->two_side && ctx.colors_used) {
3746 int i, count = ctx.shader->ninput;
3747 unsigned next_lds_loc = ctx.shader->nlds;
3748
3749 /* additional inputs will be allocated right after the existing inputs,
3750 * we won't need them after the color selection, so we don't need to
3751 * reserve these gprs for the rest of the shader code and to adjust
3752 * output offsets etc. */
3753 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3754 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3755
3756 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3757 if (ctx.face_gpr == -1) {
3758 i = ctx.shader->ninput++;
3759 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3760 ctx.shader->input[i].spi_sid = 0;
3761 ctx.shader->input[i].gpr = gpr++;
3762 ctx.face_gpr = ctx.shader->input[i].gpr;
3763 }
3764
3765 for (i = 0; i < count; i++) {
3766 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3767 int ni = ctx.shader->ninput++;
3768 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3769 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3770 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3771 ctx.shader->input[ni].gpr = gpr++;
3772 // TGSI to LLVM needs to know the lds position of inputs.
3773 // Non LLVM path computes it later (in process_twoside_color)
3774 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3775 ctx.shader->input[i].back_color_input = ni;
3776 if (ctx.bc->chip_class >= EVERGREEN) {
3777 if ((r = evergreen_interp_input(&ctx, ni)))
3778 return r;
3779 }
3780 }
3781 }
3782 }
3783
3784 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3785 shader->nr_ps_max_color_exports = 8;
3786
3787 if (ctx.shader->uses_helper_invocation) {
3788 if (ctx.bc->chip_class == CAYMAN)
3789 r = cm_load_helper_invocation(&ctx);
3790 else
3791 r = eg_load_helper_invocation(&ctx);
3792 if (r)
3793 return r;
3794 }
3795
3796 /*
3797 * XXX this relies on fixed_pt_position_gpr only being present when
3798 * this shader should be executed per sample. Should be the case for now...
3799 */
3800 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3801 /*
3802 * Fix up sample mask. The hw always gives us coverage mask for
3803 * the pixel. However, for per-sample shading, we need the
3804 * coverage for the shader invocation only.
3805 * Also, with disabled msaa, only the first bit should be set
3806 * (luckily the same fixup works for both problems).
3807 * For now, we can only do it if we know this shader is always
3808 * executed per sample (due to usage of bits in the shader
3809 * forcing per-sample execution).
3810 * If the fb is not multisampled, we'd do unnecessary work but
3811 * it should still be correct.
3812 * It will however do nothing for sample shading according
3813 * to MinSampleShading.
3814 */
3815 struct r600_bytecode_alu alu;
3816 int tmp = r600_get_temp(&ctx);
3817 assert(ctx.face_gpr != -1);
3818 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3819
3820 alu.op = ALU_OP2_LSHL_INT;
3821 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3822 alu.src[0].value = 0x1;
3823 alu.src[1].sel = ctx.fixed_pt_position_gpr;
3824 alu.src[1].chan = 3;
3825 alu.dst.sel = tmp;
3826 alu.dst.chan = 0;
3827 alu.dst.write = 1;
3828 alu.last = 1;
3829 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3830 return r;
3831
3832 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3833 alu.op = ALU_OP2_AND_INT;
3834 alu.src[0].sel = tmp;
3835 alu.src[1].sel = ctx.face_gpr;
3836 alu.src[1].chan = 2;
3837 alu.dst.sel = ctx.face_gpr;
3838 alu.dst.chan = 2;
3839 alu.dst.write = 1;
3840 alu.last = 1;
3841 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3842 return r;
3843 }
3844
3845 if (ctx.fragcoord_input >= 0) {
3846 if (ctx.bc->chip_class == CAYMAN) {
3847 for (j = 0 ; j < 4; j++) {
3848 struct r600_bytecode_alu alu;
3849 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3850 alu.op = ALU_OP1_RECIP_IEEE;
3851 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3852 alu.src[0].chan = 3;
3853
3854 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3855 alu.dst.chan = j;
3856 alu.dst.write = (j == 3);
3857 alu.last = (j == 3);
3858 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3859 return r;
3860 }
3861 } else {
3862 struct r600_bytecode_alu alu;
3863 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3864 alu.op = ALU_OP1_RECIP_IEEE;
3865 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3866 alu.src[0].chan = 3;
3867
3868 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3869 alu.dst.chan = 3;
3870 alu.dst.write = 1;
3871 alu.last = 1;
3872 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3873 return r;
3874 }
3875 }
3876
3877 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3878 struct r600_bytecode_alu alu;
3879 int r;
3880
3881 /* GS thread with no output workaround - emit a cut at start of GS */
3882 if (ctx.bc->chip_class == R600)
3883 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3884
3885 for (j = 0; j < 4; j++) {
3886 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3887 alu.op = ALU_OP1_MOV;
3888 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3889 alu.src[0].value = 0;
3890 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3891 alu.dst.write = 1;
3892 alu.last = 1;
3893 r = r600_bytecode_add_alu(ctx.bc, &alu);
3894 if (r)
3895 return r;
3896 }
3897
3898 if (ctx.shader->gs_tri_strip_adj_fix) {
3899 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3900 ctx.gs_rotated_input[0], 2,
3901 0, 2,
3902 V_SQ_ALU_SRC_LITERAL, 1);
3903 if (r)
3904 return r;
3905
3906 for (i = 0; i < 6; i++) {
3907 int rotated = (i + 4) % 6;
3908 int offset_reg = i / 3;
3909 int offset_chan = i % 3;
3910 int rotated_offset_reg = rotated / 3;
3911 int rotated_offset_chan = rotated % 3;
3912
3913 if (offset_reg == 0 && offset_chan == 2)
3914 offset_chan = 3;
3915 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3916 rotated_offset_chan = 3;
3917
3918 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3919 ctx.gs_rotated_input[offset_reg], offset_chan,
3920 ctx.gs_rotated_input[0], 2,
3921 offset_reg, offset_chan,
3922 rotated_offset_reg, rotated_offset_chan);
3923 if (r)
3924 return r;
3925 }
3926 }
3927 }
3928
3929 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3930 r600_fetch_tess_io_info(&ctx);
3931
3932 if (shader->two_side && ctx.colors_used) {
3933 if ((r = process_twoside_color_inputs(&ctx)))
3934 return r;
3935 }
3936
3937 tgsi_parse_init(&ctx.parse, tokens);
3938 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3939 tgsi_parse_token(&ctx.parse);
3940 switch (ctx.parse.FullToken.Token.Type) {
3941 case TGSI_TOKEN_TYPE_INSTRUCTION:
3942 r = tgsi_is_supported(&ctx);
3943 if (r)
3944 goto out_err;
3945 ctx.max_driver_temp_used = 0;
3946 /* reserve first tmp for everyone */
3947 r600_get_temp(&ctx);
3948
3949 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3950 if ((r = tgsi_split_constant(&ctx)))
3951 goto out_err;
3952 if ((r = tgsi_split_literal_constant(&ctx)))
3953 goto out_err;
3954 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3955 if ((r = tgsi_split_gs_inputs(&ctx)))
3956 goto out_err;
3957 } else if (lds_inputs) {
3958 if ((r = tgsi_split_lds_inputs(&ctx)))
3959 goto out_err;
3960 }
3961 if (ctx.bc->chip_class == CAYMAN)
3962 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3963 else if (ctx.bc->chip_class >= EVERGREEN)
3964 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3965 else
3966 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3967
3968 ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;
3969
3970 r = ctx.inst_info->process(&ctx);
3971 if (r)
3972 goto out_err;
3973
3974 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3975 r = r600_store_tcs_output(&ctx);
3976 if (r)
3977 goto out_err;
3978 }
3979 break;
3980 default:
3981 break;
3982 }
3983 }
3984
3985 /* Reset the temporary register counter. */
3986 ctx.max_driver_temp_used = 0;
3987
3988 noutput = shader->noutput;
3989
3990 if (!ring_outputs && ctx.clip_vertex_write) {
3991 unsigned clipdist_temp[2];
3992
3993 clipdist_temp[0] = r600_get_temp(&ctx);
3994 clipdist_temp[1] = r600_get_temp(&ctx);
3995
3996 /* need to convert a clipvertex write into clipdistance writes and not export
3997 the clip vertex anymore */
3998
3999 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
4000 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4001 shader->output[noutput].gpr = clipdist_temp[0];
4002 noutput++;
4003 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4004 shader->output[noutput].gpr = clipdist_temp[1];
4005 noutput++;
4006
4007 /* reset spi_sid for clipvertex output to avoid confusing spi */
4008 shader->output[ctx.cv_output].spi_sid = 0;
4009
4010 shader->clip_dist_write = 0xFF;
4011 shader->cc_dist_mask = 0xFF;
4012
4013 for (i = 0; i < 8; i++) {
4014 int oreg = i >> 2;
4015 int ochan = i & 3;
4016
4017 for (j = 0; j < 4; j++) {
4018 struct r600_bytecode_alu alu;
4019 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4020 alu.op = ALU_OP2_DOT4;
4021 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
4022 alu.src[0].chan = j;
4023
4024 alu.src[1].sel = 512 + i;
4025 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4026 alu.src[1].chan = j;
4027
4028 alu.dst.sel = clipdist_temp[oreg];
4029 alu.dst.chan = j;
4030 alu.dst.write = (j == ochan);
4031 if (j == 3)
4032 alu.last = 1;
4033 r = r600_bytecode_add_alu(ctx.bc, &alu);
4034 if (r)
4035 return r;
4036 }
4037 }
4038 }
4039
4040 /* Add stream outputs. */
4041 if (so.num_outputs) {
4042 bool emit = false;
4043 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
4044 emit = true;
4045 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
4046 emit = true;
4047 if (emit)
4048 emit_streamout(&ctx, &so, -1, NULL);
4049 }
4050 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
4051 convert_edgeflag_to_int(&ctx);
4052
4053 if (ctx.type == PIPE_SHADER_TESS_CTRL)
4054 r600_emit_tess_factor(&ctx);
4055
4056 if (lds_outputs) {
4057 if (ctx.type == PIPE_SHADER_VERTEX) {
4058 if (ctx.shader->noutput)
4059 emit_lds_vs_writes(&ctx);
4060 }
4061 } else if (ring_outputs) {
4062 if (shader->vs_as_es || shader->tes_as_es) {
4063 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
4064 ctx.gs_export_gpr_tregs[1] = -1;
4065 ctx.gs_export_gpr_tregs[2] = -1;
4066 ctx.gs_export_gpr_tregs[3] = -1;
4067
4068 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
4069 }
4070 } else {
4071 /* Export output */
4072 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
4073
4074 for (i = 0, j = 0; i < noutput; i++, j++) {
4075 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4076 output[j].gpr = shader->output[i].gpr;
4077 output[j].elem_size = 3;
4078 output[j].swizzle_x = 0;
4079 output[j].swizzle_y = 1;
4080 output[j].swizzle_z = 2;
4081 output[j].swizzle_w = 3;
4082 output[j].burst_count = 1;
4083 output[j].type = 0xffffffff;
4084 output[j].op = CF_OP_EXPORT;
4085 switch (ctx.type) {
4086 case PIPE_SHADER_VERTEX:
4087 case PIPE_SHADER_TESS_EVAL:
4088 switch (shader->output[i].name) {
4089 case TGSI_SEMANTIC_POSITION:
4090 output[j].array_base = 60;
4091 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4092 pos_emitted = true;
4093 break;
4094
4095 case TGSI_SEMANTIC_PSIZE:
4096 output[j].array_base = 61;
4097 output[j].swizzle_y = 7;
4098 output[j].swizzle_z = 7;
4099 output[j].swizzle_w = 7;
4100 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4101 pos_emitted = true;
4102 break;
4103 case TGSI_SEMANTIC_EDGEFLAG:
4104 output[j].array_base = 61;
4105 output[j].swizzle_x = 7;
4106 output[j].swizzle_y = 0;
4107 output[j].swizzle_z = 7;
4108 output[j].swizzle_w = 7;
4109 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4110 pos_emitted = true;
4111 break;
4112 case TGSI_SEMANTIC_LAYER:
4113 /* spi_sid is 0 for outputs that are
4114 * not consumed by PS */
4115 if (shader->output[i].spi_sid) {
4116 output[j].array_base = next_param_base++;
4117 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4118 j++;
4119 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4120 }
4121 output[j].array_base = 61;
4122 output[j].swizzle_x = 7;
4123 output[j].swizzle_y = 7;
4124 output[j].swizzle_z = 0;
4125 output[j].swizzle_w = 7;
4126 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4127 pos_emitted = true;
4128 break;
4129 case TGSI_SEMANTIC_VIEWPORT_INDEX:
4130 /* spi_sid is 0 for outputs that are
4131 * not consumed by PS */
4132 if (shader->output[i].spi_sid) {
4133 output[j].array_base = next_param_base++;
4134 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4135 j++;
4136 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4137 }
4138 output[j].array_base = 61;
4139 output[j].swizzle_x = 7;
4140 output[j].swizzle_y = 7;
4141 output[j].swizzle_z = 7;
4142 output[j].swizzle_w = 0;
4143 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4144 pos_emitted = true;
4145 break;
4146 case TGSI_SEMANTIC_CLIPVERTEX:
4147 j--;
4148 break;
4149 case TGSI_SEMANTIC_CLIPDIST:
4150 output[j].array_base = next_clip_base++;
4151 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4152 pos_emitted = true;
4153 /* spi_sid is 0 for clipdistance outputs that were generated
4154 * for clipvertex - we don't need to pass them to PS */
4155 if (shader->output[i].spi_sid) {
4156 j++;
4157 /* duplicate it as PARAM to pass to the pixel shader */
4158 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4159 output[j].array_base = next_param_base++;
4160 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4161 }
4162 break;
4163 case TGSI_SEMANTIC_FOG:
4164 output[j].swizzle_y = 4; /* 0 */
4165 output[j].swizzle_z = 4; /* 0 */
4166 output[j].swizzle_w = 5; /* 1 */
4167 break;
4168 case TGSI_SEMANTIC_PRIMID:
4169 output[j].swizzle_x = 2;
4170 output[j].swizzle_y = 4; /* 0 */
4171 output[j].swizzle_z = 4; /* 0 */
4172 output[j].swizzle_w = 4; /* 0 */
4173 break;
4174 }
4175
4176 break;
4177 case PIPE_SHADER_FRAGMENT:
4178 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4179 /* never export more colors than the number of CBs */
4180 if (shader->output[i].sid >= max_color_exports) {
4181 /* skip export */
4182 j--;
4183 continue;
4184 }
4185 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4186 output[j].array_base = shader->output[i].sid;
4187 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4188 shader->nr_ps_color_exports++;
4189 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4190
4191 /* If the i-th target format is set, all previous target formats must
4192 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
4193 */
4194 if (shader->output[i].sid > 0)
4195 for (unsigned x = 0; x < shader->output[i].sid; x++)
4196 shader->ps_color_export_mask |= (1 << (x*4));
4197
4198 if (shader->output[i].sid > shader->ps_export_highest)
4199 shader->ps_export_highest = shader->output[i].sid;
4200 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
4201 for (k = 1; k < max_color_exports; k++) {
4202 j++;
4203 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4204 output[j].gpr = shader->output[i].gpr;
4205 output[j].elem_size = 3;
4206 output[j].swizzle_x = 0;
4207 output[j].swizzle_y = 1;
4208 output[j].swizzle_z = 2;
4209 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4210 output[j].burst_count = 1;
4211 output[j].array_base = k;
4212 output[j].op = CF_OP_EXPORT;
4213 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4214 shader->nr_ps_color_exports++;
4215 if (k > shader->ps_export_highest)
4216 shader->ps_export_highest = k;
4217 shader->ps_color_export_mask |= (0xf << (j * 4));
4218 }
4219 }
4220 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4221 output[j].array_base = 61;
4222 output[j].swizzle_x = 2;
4223 output[j].swizzle_y = 7;
4224 output[j].swizzle_z = output[j].swizzle_w = 7;
4225 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4226 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4227 output[j].array_base = 61;
4228 output[j].swizzle_x = 7;
4229 output[j].swizzle_y = 1;
4230 output[j].swizzle_z = output[j].swizzle_w = 7;
4231 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4232 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4233 output[j].array_base = 61;
4234 output[j].swizzle_x = 7;
4235 output[j].swizzle_y = 7;
4236 output[j].swizzle_z = 0;
4237 output[j].swizzle_w = 7;
4238 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4239 } else {
4240 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4241 r = -EINVAL;
4242 goto out_err;
4243 }
4244 break;
4245 case PIPE_SHADER_TESS_CTRL:
4246 break;
4247 default:
4248 R600_ERR("unsupported processor type %d\n", ctx.type);
4249 r = -EINVAL;
4250 goto out_err;
4251 }
4252
4253 if (output[j].type == 0xffffffff) {
4254 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4255 output[j].array_base = next_param_base++;
4256 }
4257 }
4258
4259 /* add fake position export */
4260 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4261 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4262 output[j].gpr = 0;
4263 output[j].elem_size = 3;
4264 output[j].swizzle_x = 7;
4265 output[j].swizzle_y = 7;
4266 output[j].swizzle_z = 7;
4267 output[j].swizzle_w = 7;
4268 output[j].burst_count = 1;
4269 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4270 output[j].array_base = 60;
4271 output[j].op = CF_OP_EXPORT;
4272 j++;
4273 }
4274
4275 /* add fake param output for vertex shader if no param is exported */
4276 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4277 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4278 output[j].gpr = 0;
4279 output[j].elem_size = 3;
4280 output[j].swizzle_x = 7;
4281 output[j].swizzle_y = 7;
4282 output[j].swizzle_z = 7;
4283 output[j].swizzle_w = 7;
4284 output[j].burst_count = 1;
4285 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4286 output[j].array_base = 0;
4287 output[j].op = CF_OP_EXPORT;
4288 j++;
4289 }
4290
4291 /* add fake pixel export */
4292 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4293 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4294 output[j].gpr = 0;
4295 output[j].elem_size = 3;
4296 output[j].swizzle_x = 7;
4297 output[j].swizzle_y = 7;
4298 output[j].swizzle_z = 7;
4299 output[j].swizzle_w = 7;
4300 output[j].burst_count = 1;
4301 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4302 output[j].array_base = 0;
4303 output[j].op = CF_OP_EXPORT;
4304 j++;
4305 shader->nr_ps_color_exports++;
4306 shader->ps_color_export_mask = 0xf;
4307 }
4308
4309 noutput = j;
4310
4311 /* set export done on last export of each type */
4312 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4313 if (!(output_done & (1 << output[k].type))) {
4314 output_done |= (1 << output[k].type);
4315 output[k].op = CF_OP_EXPORT_DONE;
4316 }
4317 }
4318 /* add output to bytecode */
4319 for (i = 0; i < noutput; i++) {
4320 r = r600_bytecode_add_output(ctx.bc, &output[i]);
4321 if (r)
4322 goto out_err;
4323 }
4324 }
4325
4326 /* add program end */
4327 if (ctx.bc->chip_class == CAYMAN)
4328 cm_bytecode_add_cf_end(ctx.bc);
4329 else {
4330 const struct cf_op_info *last = NULL;
4331
4332 if (ctx.bc->cf_last)
4333 last = r600_isa_cf(ctx.bc->cf_last->op);
4334
4335 /* alu clause instructions don't have EOP bit, so add NOP */
4336 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4337 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4338
4339 ctx.bc->cf_last->end_of_program = 1;
4340 }
4341
4342 /* check GPR limit - we have 124 = 128 - 4
4343 * (4 are reserved as alu clause temporary registers) */
4344 if (ctx.bc->ngpr > 124) {
4345 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4346 r = -ENOMEM;
4347 goto out_err;
4348 }
4349
4350 if (ctx.type == PIPE_SHADER_GEOMETRY) {
4351 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4352 return r;
4353 }
4354
4355 free(ctx.spilled_arrays);
4356 free(ctx.array_infos);
4357 free(ctx.literals);
4358 tgsi_parse_free(&ctx.parse);
4359 return 0;
4360 out_err:
4361 free(ctx.spilled_arrays);
4362 free(ctx.array_infos);
4363 free(ctx.literals);
4364 tgsi_parse_free(&ctx.parse);
4365 return r;
4366 }
4367
4368 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4369 {
4370 const unsigned tgsi_opcode =
4371 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4372 R600_ERR("%s tgsi opcode unsupported\n",
4373 tgsi_get_opcode_name(tgsi_opcode));
4374 return -EINVAL;
4375 }
4376
4377 static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
4378 {
4379 return 0;
4380 }
4381
4382 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4383 const struct r600_shader_src *shader_src,
4384 unsigned chan)
4385 {
4386 bc_src->sel = shader_src->sel;
4387 bc_src->chan = shader_src->swizzle[chan];
4388 bc_src->neg = shader_src->neg;
4389 bc_src->abs = shader_src->abs;
4390 bc_src->rel = shader_src->rel;
4391 bc_src->value = shader_src->value[bc_src->chan];
4392 bc_src->kc_bank = shader_src->kc_bank;
4393 bc_src->kc_rel = shader_src->kc_rel;
4394 }
4395
4396 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
4397 {
4398 bc_src->abs = 1;
4399 bc_src->neg = 0;
4400 }
4401
4402 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
4403 {
4404 bc_src->neg = !bc_src->neg;
4405 }
4406
4407 static void tgsi_dst(struct r600_shader_ctx *ctx,
4408 const struct tgsi_full_dst_register *tgsi_dst,
4409 unsigned swizzle,
4410 struct r600_bytecode_alu_dst *r600_dst)
4411 {
4412 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4413
4414 if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
4415 bool spilled;
4416 unsigned idx;
4417
4418 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);
4419
4420 if (spilled) {
4421 struct r600_bytecode_output cf;
4422 int reg = 0;
4423 int r;
4424 bool add_pending_output = true;
4425
4426 memset(&cf, 0, sizeof(struct r600_bytecode_output));
4427 get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
4428 &cf.array_base, &cf.array_size);
4429
4430 /* If no component has spilled, reserve a register and add the spill code
4431 * ctx->bc->n_pending_outputs is cleared after each instruction group */
4432 if (ctx->bc->n_pending_outputs == 0) {
4433 reg = r600_get_temp(ctx);
4434 } else {
4435 /* If we are already spilling and the output address is the same like
4436 * before then just reuse the same slot */
4437 struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
4438 if ((cf.array_base + idx == tmpl->array_base) ||
4439 (cf.array_base == tmpl->array_base &&
4440 tmpl->index_gpr == ctx->bc->ar_reg &&
4441 tgsi_dst->Register.Indirect)) {
4442 reg = ctx->bc->pending_outputs[0].gpr;
4443 add_pending_output = false;
4444 } else {
4445 reg = r600_get_temp(ctx);
4446 }
4447 }
4448
4449 r600_dst->sel = reg;
4450 r600_dst->chan = swizzle;
4451 r600_dst->write = 1;
4452 if (inst->Instruction.Saturate) {
4453 r600_dst->clamp = 1;
4454 }
4455
4456 /* Add new outputs as pending */
4457 if (add_pending_output) {
4458 cf.op = CF_OP_MEM_SCRATCH;
4459 cf.elem_size = 3;
4460 cf.gpr = reg;
4461 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
4462 cf.mark = 1;
4463 cf.comp_mask = inst->Dst[0].Register.WriteMask;
4464 cf.swizzle_x = 0;
4465 cf.swizzle_y = 1;
4466 cf.swizzle_z = 2;
4467 cf.swizzle_w = 3;
4468 cf.burst_count = 1;
4469
4470 if (tgsi_dst->Register.Indirect) {
4471 if (ctx->bc->chip_class < R700)
4472 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
4473 else
4474 cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
4475 cf.index_gpr = ctx->bc->ar_reg;
4476 }
4477 else {
4478 cf.array_base += idx;
4479 cf.array_size = 0;
4480 }
4481
4482 r = r600_bytecode_add_pending_output(ctx->bc, &cf);
4483 if (r)
4484 return;
4485
4486 if (ctx->bc->chip_class >= R700)
4487 r600_bytecode_need_wait_ack(ctx->bc, true);
4488 }
4489 return;
4490 }
4491 else {
4492 r600_dst->sel = idx;
4493 }
4494 }
4495 else {
4496 r600_dst->sel = tgsi_dst->Register.Index;
4497 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
4498 }
4499 r600_dst->chan = swizzle;
4500 r600_dst->write = 1;
4501 if (inst->Instruction.Saturate) {
4502 r600_dst->clamp = 1;
4503 }
4504 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
4505 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
4506 return;
4507 }
4508 }
4509 if (tgsi_dst->Register.Indirect)
4510 r600_dst->rel = V_SQ_REL_RELATIVE;
4511
4512 }
4513
4514 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
4515 {
4516 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4517 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4518 struct r600_bytecode_alu alu;
4519 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4520 int use_tmp = 0;
4521 int swizzle_x = inst->Src[0].Register.SwizzleX;
4522
4523 if (singledest) {
4524 switch (write_mask) {
4525 case 0x1:
4526 if (swizzle_x == 2) {
4527 write_mask = 0xc;
4528 use_tmp = 3;
4529 } else
4530 write_mask = 0x3;
4531 break;
4532 case 0x2:
4533 if (swizzle_x == 2) {
4534 write_mask = 0xc;
4535 use_tmp = 3;
4536 } else {
4537 write_mask = 0x3;
4538 use_tmp = 1;
4539 }
4540 break;
4541 case 0x4:
4542 if (swizzle_x == 0) {
4543 write_mask = 0x3;
4544 use_tmp = 1;
4545 } else
4546 write_mask = 0xc;
4547 break;
4548 case 0x8:
4549 if (swizzle_x == 0) {
4550 write_mask = 0x3;
4551 use_tmp = 1;
4552 } else {
4553 write_mask = 0xc;
4554 use_tmp = 3;
4555 }
4556 break;
4557 }
4558 }
4559
4560 lasti = tgsi_last_instruction(write_mask);
4561 for (i = 0; i <= lasti; i++) {
4562
4563 if (!(write_mask & (1 << i)))
4564 continue;
4565
4566 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4567
4568 if (singledest) {
4569 if (use_tmp || dest_temp) {
4570 alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
4571 alu.dst.chan = i;
4572 alu.dst.write = 1;
4573 } else {
4574 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4575 }
4576 if (i == 1 || i == 3)
4577 alu.dst.write = 0;
4578 } else
4579 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4580
4581 alu.op = op_override ? op_override : ctx->inst_info->op;
4582 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
4583 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4584 } else if (!swap) {
4585 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4586 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4587 }
4588 } else {
4589 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
4590 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
4591 }
4592
4593 /* handle some special cases */
4594 if (i == 1 || i == 3) {
4595 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
4596 case TGSI_OPCODE_DABS:
4597 r600_bytecode_src_set_abs(&alu.src[0]);
4598 break;
4599 default:
4600 break;
4601 }
4602 }
4603 if (i == lasti) {
4604 alu.last = 1;
4605 }
4606 r = r600_bytecode_add_alu(ctx->bc, &alu);
4607 if (r)
4608 return r;
4609 }
4610
4611 if (use_tmp) {
4612 write_mask = inst->Dst[0].Register.WriteMask;
4613
4614 lasti = tgsi_last_instruction(write_mask);
4615 /* move result from temp to dst */
4616 for (i = 0; i <= lasti; i++) {
4617 if (!(write_mask & (1 << i)))
4618 continue;
4619
4620 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4621 alu.op = ALU_OP1_MOV;
4622
4623 if (dest_temp) {
4624 alu.dst.sel = dest_temp;
4625 alu.dst.chan = i;
4626 alu.dst.write = 1;
4627 } else
4628 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4629 alu.src[0].sel = ctx->temp_reg;
4630 alu.src[0].chan = use_tmp - 1;
4631 alu.last = (i == lasti);
4632
4633 r = r600_bytecode_add_alu(ctx->bc, &alu);
4634 if (r)
4635 return r;
4636 }
4637 }
4638 return 0;
4639 }
4640
4641 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4642 {
4643 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4644 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4645 /* confirm writemasking */
4646 if ((write_mask & 0x3) != 0x3 &&
4647 (write_mask & 0xc) != 0xc) {
4648 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4649 return -1;
4650 }
4651 return tgsi_op2_64_params(ctx, false, false, 0, 0);
4652 }
4653
/* 64-bit operation producing a single 32-bit result, operands in their
 * natural order, written straight to the instruction's destination. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	const bool singledest = true;
	const bool swap = false;

	return tgsi_op2_64_params(ctx, singledest, swap, 0, 0);
}
4658
/* Same as tgsi_op2_64_single_dest(), but with source operands 0 and 1
 * exchanged. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	const bool singledest = true;
	const bool swap = true;

	return tgsi_op2_64_params(ctx, singledest, swap, 0, 0);
}
4663
4664 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4665 {
4666 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4667 struct r600_bytecode_alu alu;
4668 int i, j, r;
4669 int lasti = 3;
4670 int tmp = r600_get_temp(ctx);
4671
4672 for (i = 0; i < lasti + 1; i++) {
4673
4674 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4675 alu.op = ctx->inst_info->op;
4676 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4677 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4678 }
4679
4680 if (inst->Dst[0].Register.WriteMask & (1 << i))
4681 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4682 else
4683 alu.dst.sel = tmp;
4684
4685 alu.dst.chan = i;
4686 alu.is_op3 = 1;
4687 if (i == lasti) {
4688 alu.last = 1;
4689 }
4690 r = r600_bytecode_add_alu(ctx->bc, &alu);
4691 if (r)
4692 return r;
4693 }
4694 return 0;
4695 }
4696
4697 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
4698 {
4699 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4700 struct r600_bytecode_alu alu;
4701 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4702 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4703 /* use temp register if trans_only and more than one dst component */
4704 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
4705 unsigned op = ctx->inst_info->op;
4706
4707 if (op == ALU_OP2_MUL_IEEE &&
4708 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
4709 op = ALU_OP2_MUL;
4710
4711 for (i = 0; i <= lasti; i++) {
4712 if (!(write_mask & (1 << i)))
4713 continue;
4714
4715 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4716 if (use_tmp) {
4717 alu.dst.sel = ctx->temp_reg;
4718 alu.dst.chan = i;
4719 alu.dst.write = 1;
4720 } else
4721 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4722
4723 alu.op = op;
4724 if (!swap) {
4725 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4726 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4727 }
4728 } else {
4729 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4730 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4731 }
4732 if (i == lasti || trans_only) {
4733 alu.last = 1;
4734 }
4735 r = r600_bytecode_add_alu(ctx->bc, &alu);
4736 if (r)
4737 return r;
4738 }
4739
4740 if (use_tmp) {
4741 /* move result from temp to dst */
4742 for (i = 0; i <= lasti; i++) {
4743 if (!(write_mask & (1 << i)))
4744 continue;
4745
4746 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4747 alu.op = ALU_OP1_MOV;
4748 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4749 alu.src[0].sel = ctx->temp_reg;
4750 alu.src[0].chan = i;
4751 alu.last = (i == lasti);
4752
4753 r = r600_bytecode_add_alu(ctx->bc, &alu);
4754 if (r)
4755 return r;
4756 }
4757 }
4758 return 0;
4759 }
4760
/* Per-channel ALU operation: operands in natural order, slots grouped
 * normally. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4765
/* Per-channel ALU operation with source operands 0 and 1 exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4770
/* Per-channel ALU operation emitted in trans_only mode: every slot closes
 * its own instruction group (see tgsi_op2_s()). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4775
4776 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4777 {
4778 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4779 struct r600_bytecode_alu alu;
4780 int i, r;
4781 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4782
4783 for (i = 0; i < lasti + 1; i++) {
4784
4785 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4786 continue;
4787 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4788 alu.op = ctx->inst_info->op;
4789
4790 alu.src[0].sel = V_SQ_ALU_SRC_0;
4791
4792 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4793
4794 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4795
4796 if (i == lasti) {
4797 alu.last = 1;
4798 }
4799 r = r600_bytecode_add_alu(ctx->bc, &alu);
4800 if (r)
4801 return r;
4802 }
4803 return 0;
4804
4805 }
4806
4807 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4808 {
4809 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4810 struct r600_bytecode_alu alu;
4811 int i, r;
4812 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4813
4814 for (i = 0; i < lasti + 1; i++) {
4815
4816 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4817 continue;
4818 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4819 alu.op = ALU_OP1_MOV;
4820
4821 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4822
4823 if (i == 1 || i == 3)
4824 r600_bytecode_src_toggle_neg(&alu.src[0]);
4825 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4826
4827 if (i == lasti) {
4828 alu.last = 1;
4829 }
4830 r = r600_bytecode_add_alu(ctx->bc, &alu);
4831 if (r)
4832 return r;
4833 }
4834 return 0;
4835
4836 }
4837
4838 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
4839 {
4840 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4841 struct r600_bytecode_alu alu;
4842 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4843 int i, j, r;
4844
4845 for (i = 0; i <= 3; i++) {
4846 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4847 alu.op = ctx->inst_info->op;
4848
4849 alu.dst.sel = ctx->temp_reg;
4850 alu.dst.chan = i;
4851 alu.dst.write = 1;
4852 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4853 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4854 }
4855
4856 if (i == 3)
4857 alu.last = 1;
4858
4859 r = r600_bytecode_add_alu(ctx->bc, &alu);
4860 if (r)
4861 return r;
4862 }
4863
4864 /* Replicate significand result across channels. */
4865 for (i = 0; i <= 3; i++) {
4866 if (!(write_mask & (1 << i)))
4867 continue;
4868
4869 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4870 alu.op = ALU_OP1_MOV;
4871 alu.src[0].chan = (i & 1) + 2;
4872 alu.src[0].sel = ctx->temp_reg;
4873
4874 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4875 alu.dst.write = 1;
4876 alu.last = 1;
4877 r = r600_bytecode_add_alu(ctx->bc, &alu);
4878 if (r)
4879 return r;
4880 }
4881
4882 for (i = 0; i <= 3; i++) {
4883 if (inst->Dst[1].Register.WriteMask & (1 << i)) {
4884 /* MOV third channels to writemask dst1 */
4885 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4886 alu.op = ALU_OP1_MOV;
4887 alu.src[0].chan = 1;
4888 alu.src[0].sel = ctx->temp_reg;
4889
4890 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
4891 alu.last = 1;
4892 r = r600_bytecode_add_alu(ctx->bc, &alu);
4893 if (r)
4894 return r;
4895 break;
4896 }
4897 }
4898 return 0;
4899 }
4900
4901
4902 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
4903 {
4904 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4905 struct r600_bytecode_alu alu;
4906 int i, c, r;
4907 int write_mask = inst->Dst[0].Register.WriteMask;
4908 int temp_reg = r600_get_temp(ctx);
4909
4910 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
4911 inst->Instruction.Opcode == TGSI_OPCODE_U2D);
4912
4913 for (c = 0; c < 2; c++) {
4914 int dchan = c * 2;
4915 if (write_mask & (0x3 << dchan)) {
4916 /* split into 24-bit int and 8-bit int */
4917 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4918 alu.op = ALU_OP2_AND_INT;
4919 alu.dst.sel = temp_reg;
4920 alu.dst.chan = dchan;
4921 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4922 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4923 alu.src[1].value = 0xffffff00;
4924 alu.dst.write = 1;
4925 r = r600_bytecode_add_alu(ctx->bc, &alu);
4926 if (r)
4927 return r;
4928
4929 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4930 alu.op = ALU_OP2_AND_INT;
4931 alu.dst.sel = temp_reg;
4932 alu.dst.chan = dchan + 1;
4933 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4934 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4935 alu.src[1].value = 0xff;
4936 alu.dst.write = 1;
4937 alu.last = 1;
4938 r = r600_bytecode_add_alu(ctx->bc, &alu);
4939 if (r)
4940 return r;
4941 }
4942 }
4943
4944 for (c = 0; c < 2; c++) {
4945 int dchan = c * 2;
4946 if (write_mask & (0x3 << dchan)) {
4947 for (i = dchan; i <= dchan + 1; i++) {
4948 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4949 alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;
4950
4951 alu.src[0].sel = temp_reg;
4952 alu.src[0].chan = i;
4953 alu.dst.sel = temp_reg;
4954 alu.dst.chan = i;
4955 alu.dst.write = 1;
4956 if (ctx->bc->chip_class == CAYMAN)
4957 alu.last = i == dchan + 1;
4958 else
4959 alu.last = 1; /* trans only ops on evergreen */
4960
4961 r = r600_bytecode_add_alu(ctx->bc, &alu);
4962 if (r)
4963 return r;
4964 }
4965 }
4966 }
4967
4968 for (c = 0; c < 2; c++) {
4969 int dchan = c * 2;
4970 if (write_mask & (0x3 << dchan)) {
4971 for (i = 0; i < 4; i++) {
4972 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4973 alu.op = ALU_OP1_FLT32_TO_FLT64;
4974
4975 alu.src[0].chan = dchan + (i / 2);
4976 if (i == 0 || i == 2)
4977 alu.src[0].sel = temp_reg;
4978 else {
4979 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4980 alu.src[0].value = 0x0;
4981 }
4982 alu.dst.sel = ctx->temp_reg;
4983 alu.dst.chan = i;
4984 alu.last = i == 3;
4985 alu.dst.write = 1;
4986
4987 r = r600_bytecode_add_alu(ctx->bc, &alu);
4988 if (r)
4989 return r;
4990 }
4991
4992 for (i = 0; i <= 1; i++) {
4993 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4994 alu.op = ALU_OP2_ADD_64;
4995
4996 alu.src[0].chan = fp64_switch(i);
4997 alu.src[0].sel = ctx->temp_reg;
4998
4999 alu.src[1].chan = fp64_switch(i + 2);
5000 alu.src[1].sel = ctx->temp_reg;
5001 tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
5002 alu.last = i == 1;
5003
5004 r = r600_bytecode_add_alu(ctx->bc, &alu);
5005 if (r)
5006 return r;
5007 }
5008 }
5009 }
5010
5011 return 0;
5012 }
5013
5014 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
5015 {
5016 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5017 struct r600_bytecode_alu alu;
5018 int i, r;
5019 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5020 int treg = r600_get_temp(ctx);
5021 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
5022 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
5023
5024 /* do a 64->32 into a temp register */
5025 r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
5026 if (r)
5027 return r;
5028
5029 for (i = 0; i <= lasti; i++) {
5030 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5031 continue;
5032 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5033 alu.op = ctx->inst_info->op;
5034
5035 alu.src[0].chan = i;
5036 alu.src[0].sel = treg;
5037 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5038 alu.last = (i == lasti);
5039
5040 r = r600_bytecode_add_alu(ctx->bc, &alu);
5041 if (r)
5042 return r;
5043 }
5044
5045 return 0;
5046 }
5047
5048 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
5049 unsigned op,
5050 int dst_reg,
5051 struct r600_shader_src *src,
5052 bool abs)
5053 {
5054 struct r600_bytecode_alu alu;
5055 const int last_slot = 3;
5056 int r;
5057
5058 /* these have to write the result to X/Y by the looks of it */
5059 for (int i = 0 ; i < last_slot; i++) {
5060 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5061 alu.op = op;
5062
5063 r600_bytecode_src(&alu.src[0], src, 1);
5064 r600_bytecode_src(&alu.src[1], src, 0);
5065
5066 if (abs)
5067 r600_bytecode_src_set_abs(&alu.src[1]);
5068
5069 alu.dst.sel = dst_reg;
5070 alu.dst.chan = i;
5071 alu.dst.write = (i == 0 || i == 1);
5072
5073 if (bc->chip_class != CAYMAN || i == last_slot - 1)
5074 alu.last = 1;
5075 r = r600_bytecode_add_alu(bc, &alu);
5076 if (r)
5077 return r;
5078 }
5079
5080 return 0;
5081 }
5082
5083 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
5084 {
5085 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5086 int i, r;
5087 struct r600_bytecode_alu alu;
5088 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5089 int t1 = ctx->temp_reg;
5090
5091 /* should only be one src regs */
5092 assert(inst->Instruction.NumSrcRegs == 1);
5093
5094 /* only support one double at a time */
5095 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5096 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5097
5098 r = cayman_emit_unary_double_raw(
5099 ctx->bc, ctx->inst_info->op, t1,
5100 &ctx->src[0],
5101 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
5102 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
5103 if (r)
5104 return r;
5105
5106 for (i = 0 ; i <= lasti; i++) {
5107 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5108 continue;
5109 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5110 alu.op = ALU_OP1_MOV;
5111 alu.src[0].sel = t1;
5112 alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
5113 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5114 alu.dst.write = 1;
5115 if (i == lasti)
5116 alu.last = 1;
5117 r = r600_bytecode_add_alu(ctx->bc, &alu);
5118 if (r)
5119 return r;
5120 }
5121 return 0;
5122 }
5123
5124 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5125 {
5126 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5127 int i, j, r;
5128 struct r600_bytecode_alu alu;
5129 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5130
5131 for (i = 0 ; i < last_slot; i++) {
5132 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5133 alu.op = ctx->inst_info->op;
5134 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5135 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5136
5137 /* RSQ should take the absolute value of src */
5138 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5139 r600_bytecode_src_set_abs(&alu.src[j]);
5140 }
5141 }
5142 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5143 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5144
5145 if (i == last_slot - 1)
5146 alu.last = 1;
5147 r = r600_bytecode_add_alu(ctx->bc, &alu);
5148 if (r)
5149 return r;
5150 }
5151 return 0;
5152 }
5153
5154 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
5155 {
5156 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5157 int i, j, k, r;
5158 struct r600_bytecode_alu alu;
5159 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5160 int t1 = ctx->temp_reg;
5161
5162 for (k = 0; k <= lasti; k++) {
5163 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
5164 continue;
5165
5166 for (i = 0 ; i < 4; i++) {
5167 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5168 alu.op = ctx->inst_info->op;
5169 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5170 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
5171 }
5172 alu.dst.sel = t1;
5173 alu.dst.chan = i;
5174 alu.dst.write = (i == k);
5175 if (i == 3)
5176 alu.last = 1;
5177 r = r600_bytecode_add_alu(ctx->bc, &alu);
5178 if (r)
5179 return r;
5180 }
5181 }
5182
5183 for (i = 0 ; i <= lasti; i++) {
5184 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5185 continue;
5186 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5187 alu.op = ALU_OP1_MOV;
5188 alu.src[0].sel = t1;
5189 alu.src[0].chan = i;
5190 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5191 alu.dst.write = 1;
5192 if (i == lasti)
5193 alu.last = 1;
5194 r = r600_bytecode_add_alu(ctx->bc, &alu);
5195 if (r)
5196 return r;
5197 }
5198
5199 return 0;
5200 }
5201
5202
5203 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
5204 {
5205 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5206 int i, j, k, r;
5207 struct r600_bytecode_alu alu;
5208 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5209 int t1 = ctx->temp_reg;
5210
5211 /* t1 would get overwritten below if we actually tried to
5212 * multiply two pairs of doubles at a time. */
5213 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5214 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5215
5216 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5217
5218 for (i = 0; i < 4; i++) {
5219 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5220 alu.op = ctx->inst_info->op;
5221 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5222 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
5223 }
5224 alu.dst.sel = t1;
5225 alu.dst.chan = i;
5226 alu.dst.write = 1;
5227 if (i == 3)
5228 alu.last = 1;
5229 r = r600_bytecode_add_alu(ctx->bc, &alu);
5230 if (r)
5231 return r;
5232 }
5233
5234 for (i = 0; i <= lasti; i++) {
5235 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5236 continue;
5237 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5238 alu.op = ALU_OP1_MOV;
5239 alu.src[0].sel = t1;
5240 alu.src[0].chan = i;
5241 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5242 alu.dst.write = 1;
5243 if (i == lasti)
5244 alu.last = 1;
5245 r = r600_bytecode_add_alu(ctx->bc, &alu);
5246 if (r)
5247 return r;
5248 }
5249
5250 return 0;
5251 }
5252
5253 /*
5254 * Emit RECIP_64 + MUL_64 to implement division.
5255 */
5256 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
5257 {
5258 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5259 int r;
5260 struct r600_bytecode_alu alu;
5261 int t1 = ctx->temp_reg;
5262 int k;
5263
5264 /* Only support one double at a time. This is the same constraint as
5265 * in DMUL lowering. */
5266 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5267 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5268
5269 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5270
5271 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
5272 if (r)
5273 return r;
5274
5275 for (int i = 0; i < 4; i++) {
5276 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5277 alu.op = ALU_OP2_MUL_64;
5278
5279 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
5280
5281 alu.src[1].sel = t1;
5282 alu.src[1].chan = (i == 3) ? 0 : 1;
5283
5284 alu.dst.sel = t1;
5285 alu.dst.chan = i;
5286 alu.dst.write = 1;
5287 if (i == 3)
5288 alu.last = 1;
5289 r = r600_bytecode_add_alu(ctx->bc, &alu);
5290 if (r)
5291 return r;
5292 }
5293
5294 for (int i = 0; i < 2; i++) {
5295 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5296 alu.op = ALU_OP1_MOV;
5297 alu.src[0].sel = t1;
5298 alu.src[0].chan = i;
5299 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
5300 alu.dst.write = 1;
5301 if (i == 1)
5302 alu.last = 1;
5303 r = r600_bytecode_add_alu(ctx->bc, &alu);
5304 if (r)
5305 return r;
5306 }
5307 return 0;
5308 }
5309
5310 /*
5311 * r600 - trunc to -PI..PI range
5312 * r700 - normalize by dividing by 2PI
5313 * see fdo bug 27901
5314 */
5315 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
5316 {
5317 int r;
5318 struct r600_bytecode_alu alu;
5319
5320 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5321 alu.op = ALU_OP3_MULADD;
5322 alu.is_op3 = 1;
5323
5324 alu.dst.chan = 0;
5325 alu.dst.sel = ctx->temp_reg;
5326 alu.dst.write = 1;
5327
5328 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5329
5330 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5331 alu.src[1].chan = 0;
5332 alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
5333 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5334 alu.src[2].chan = 0;
5335 alu.last = 1;
5336 r = r600_bytecode_add_alu(ctx->bc, &alu);
5337 if (r)
5338 return r;
5339
5340 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5341 alu.op = ALU_OP1_FRACT;
5342
5343 alu.dst.chan = 0;
5344 alu.dst.sel = ctx->temp_reg;
5345 alu.dst.write = 1;
5346
5347 alu.src[0].sel = ctx->temp_reg;
5348 alu.src[0].chan = 0;
5349 alu.last = 1;
5350 r = r600_bytecode_add_alu(ctx->bc, &alu);
5351 if (r)
5352 return r;
5353
5354 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5355 alu.op = ALU_OP3_MULADD;
5356 alu.is_op3 = 1;
5357
5358 alu.dst.chan = 0;
5359 alu.dst.sel = ctx->temp_reg;
5360 alu.dst.write = 1;
5361
5362 alu.src[0].sel = ctx->temp_reg;
5363 alu.src[0].chan = 0;
5364
5365 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5366 alu.src[1].chan = 0;
5367 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5368 alu.src[2].chan = 0;
5369
5370 if (ctx->bc->chip_class == R600) {
5371 alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
5372 alu.src[2].value = u_bitcast_f2u(-M_PI);
5373 } else {
5374 alu.src[1].sel = V_SQ_ALU_SRC_1;
5375 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5376 alu.src[2].neg = 1;
5377 }
5378
5379 alu.last = 1;
5380 r = r600_bytecode_add_alu(ctx->bc, &alu);
5381 if (r)
5382 return r;
5383 return 0;
5384 }
5385
5386 static int cayman_trig(struct r600_shader_ctx *ctx)
5387 {
5388 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5389 struct r600_bytecode_alu alu;
5390 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5391 int i, r;
5392
5393 r = tgsi_setup_trig(ctx);
5394 if (r)
5395 return r;
5396
5397
5398 for (i = 0; i < last_slot; i++) {
5399 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5400 alu.op = ctx->inst_info->op;
5401 alu.dst.chan = i;
5402
5403 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5404 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5405
5406 alu.src[0].sel = ctx->temp_reg;
5407 alu.src[0].chan = 0;
5408 if (i == last_slot - 1)
5409 alu.last = 1;
5410 r = r600_bytecode_add_alu(ctx->bc, &alu);
5411 if (r)
5412 return r;
5413 }
5414 return 0;
5415 }
5416
5417 static int tgsi_trig(struct r600_shader_ctx *ctx)
5418 {
5419 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5420 struct r600_bytecode_alu alu;
5421 int i, r;
5422 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5423
5424 r = tgsi_setup_trig(ctx);
5425 if (r)
5426 return r;
5427
5428 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5429 alu.op = ctx->inst_info->op;
5430 alu.dst.chan = 0;
5431 alu.dst.sel = ctx->temp_reg;
5432 alu.dst.write = 1;
5433
5434 alu.src[0].sel = ctx->temp_reg;
5435 alu.src[0].chan = 0;
5436 alu.last = 1;
5437 r = r600_bytecode_add_alu(ctx->bc, &alu);
5438 if (r)
5439 return r;
5440
5441 /* replicate result */
5442 for (i = 0; i < lasti + 1; i++) {
5443 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5444 continue;
5445
5446 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5447 alu.op = ALU_OP1_MOV;
5448
5449 alu.src[0].sel = ctx->temp_reg;
5450 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5451 if (i == lasti)
5452 alu.last = 1;
5453 r = r600_bytecode_add_alu(ctx->bc, &alu);
5454 if (r)
5455 return r;
5456 }
5457 return 0;
5458 }
5459
5460 static int tgsi_kill(struct r600_shader_ctx *ctx)
5461 {
5462 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5463 struct r600_bytecode_alu alu;
5464 int i, r;
5465
5466 for (i = 0; i < 4; i++) {
5467 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5468 alu.op = ctx->inst_info->op;
5469
5470 alu.dst.chan = i;
5471
5472 alu.src[0].sel = V_SQ_ALU_SRC_0;
5473
5474 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5475 alu.src[1].sel = V_SQ_ALU_SRC_1;
5476 alu.src[1].neg = 1;
5477 } else {
5478 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5479 }
5480 if (i == 3) {
5481 alu.last = 1;
5482 }
5483 r = r600_bytecode_add_alu(ctx->bc, &alu);
5484 if (r)
5485 return r;
5486 }
5487
5488 /* kill must be last in ALU */
5489 ctx->bc->force_add_cf = 1;
5490 ctx->shader->uses_kill = TRUE;
5491 return 0;
5492 }
5493
5494 static int tgsi_lit(struct r600_shader_ctx *ctx)
5495 {
5496 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5497 struct r600_bytecode_alu alu;
5498 int r;
5499
5500 /* tmp.x = max(src.y, 0.0) */
5501 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5502 alu.op = ALU_OP2_MAX;
5503 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
5504 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5505 alu.src[1].chan = 1;
5506
5507 alu.dst.sel = ctx->temp_reg;
5508 alu.dst.chan = 0;
5509 alu.dst.write = 1;
5510
5511 alu.last = 1;
5512 r = r600_bytecode_add_alu(ctx->bc, &alu);
5513 if (r)
5514 return r;
5515
5516 if (inst->Dst[0].Register.WriteMask & (1 << 2))
5517 {
5518 int chan;
5519 int sel;
5520 unsigned i;
5521
5522 if (ctx->bc->chip_class == CAYMAN) {
5523 for (i = 0; i < 3; i++) {
5524 /* tmp.z = log(tmp.x) */
5525 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5526 alu.op = ALU_OP1_LOG_CLAMPED;
5527 alu.src[0].sel = ctx->temp_reg;
5528 alu.src[0].chan = 0;
5529 alu.dst.sel = ctx->temp_reg;
5530 alu.dst.chan = i;
5531 if (i == 2) {
5532 alu.dst.write = 1;
5533 alu.last = 1;
5534 } else
5535 alu.dst.write = 0;
5536
5537 r = r600_bytecode_add_alu(ctx->bc, &alu);
5538 if (r)
5539 return r;
5540 }
5541 } else {
5542 /* tmp.z = log(tmp.x) */
5543 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5544 alu.op = ALU_OP1_LOG_CLAMPED;
5545 alu.src[0].sel = ctx->temp_reg;
5546 alu.src[0].chan = 0;
5547 alu.dst.sel = ctx->temp_reg;
5548 alu.dst.chan = 2;
5549 alu.dst.write = 1;
5550 alu.last = 1;
5551 r = r600_bytecode_add_alu(ctx->bc, &alu);
5552 if (r)
5553 return r;
5554 }
5555
5556 chan = alu.dst.chan;
5557 sel = alu.dst.sel;
5558
5559 /* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
5560 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5561 alu.op = ALU_OP3_MUL_LIT;
5562 alu.src[0].sel = sel;
5563 alu.src[0].chan = chan;
5564 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
5565 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
5566 alu.dst.sel = ctx->temp_reg;
5567 alu.dst.chan = 0;
5568 alu.dst.write = 1;
5569 alu.is_op3 = 1;
5570 alu.last = 1;
5571 r = r600_bytecode_add_alu(ctx->bc, &alu);
5572 if (r)
5573 return r;
5574
5575 if (ctx->bc->chip_class == CAYMAN) {
5576 for (i = 0; i < 3; i++) {
5577 /* dst.z = exp(tmp.x) */
5578 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5579 alu.op = ALU_OP1_EXP_IEEE;
5580 alu.src[0].sel = ctx->temp_reg;
5581 alu.src[0].chan = 0;
5582 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5583 if (i == 2) {
5584 alu.dst.write = 1;
5585 alu.last = 1;
5586 } else
5587 alu.dst.write = 0;
5588 r = r600_bytecode_add_alu(ctx->bc, &alu);
5589 if (r)
5590 return r;
5591 }
5592 } else {
5593 /* dst.z = exp(tmp.x) */
5594 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5595 alu.op = ALU_OP1_EXP_IEEE;
5596 alu.src[0].sel = ctx->temp_reg;
5597 alu.src[0].chan = 0;
5598 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5599 alu.last = 1;
5600 r = r600_bytecode_add_alu(ctx->bc, &alu);
5601 if (r)
5602 return r;
5603 }
5604 }
5605
5606 /* dst.x, <- 1.0 */
5607 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5608 alu.op = ALU_OP1_MOV;
5609 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
5610 alu.src[0].chan = 0;
5611 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5612 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
5613 r = r600_bytecode_add_alu(ctx->bc, &alu);
5614 if (r)
5615 return r;
5616
5617 /* dst.y = max(src.x, 0.0) */
5618 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5619 alu.op = ALU_OP2_MAX;
5620 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5621 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5622 alu.src[1].chan = 0;
5623 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
5624 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
5625 r = r600_bytecode_add_alu(ctx->bc, &alu);
5626 if (r)
5627 return r;
5628
5629 /* dst.w, <- 1.0 */
5630 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5631 alu.op = ALU_OP1_MOV;
5632 alu.src[0].sel = V_SQ_ALU_SRC_1;
5633 alu.src[0].chan = 0;
5634 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
5635 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
5636 alu.last = 1;
5637 r = r600_bytecode_add_alu(ctx->bc, &alu);
5638 if (r)
5639 return r;
5640
5641 return 0;
5642 }
5643
5644 static int tgsi_rsq(struct r600_shader_ctx *ctx)
5645 {
5646 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5647 struct r600_bytecode_alu alu;
5648 int i, r;
5649
5650 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5651
5652 alu.op = ALU_OP1_RECIPSQRT_IEEE;
5653
5654 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5655 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5656 r600_bytecode_src_set_abs(&alu.src[i]);
5657 }
5658 alu.dst.sel = ctx->temp_reg;
5659 alu.dst.write = 1;
5660 alu.last = 1;
5661 r = r600_bytecode_add_alu(ctx->bc, &alu);
5662 if (r)
5663 return r;
5664 /* replicate result */
5665 return tgsi_helper_tempx_replicate(ctx);
5666 }
5667
5668 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5669 {
5670 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5671 struct r600_bytecode_alu alu;
5672 int i, r;
5673
5674 for (i = 0; i < 4; i++) {
5675 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5676 alu.src[0].sel = ctx->temp_reg;
5677 alu.op = ALU_OP1_MOV;
5678 alu.dst.chan = i;
5679 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5680 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5681 if (i == 3)
5682 alu.last = 1;
5683 r = r600_bytecode_add_alu(ctx->bc, &alu);
5684 if (r)
5685 return r;
5686 }
5687 return 0;
5688 }
5689
5690 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5691 {
5692 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5693 struct r600_bytecode_alu alu;
5694 int i, r;
5695
5696 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5697 alu.op = ctx->inst_info->op;
5698 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5699 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5700 }
5701 alu.dst.sel = ctx->temp_reg;
5702 alu.dst.write = 1;
5703 alu.last = 1;
5704 r = r600_bytecode_add_alu(ctx->bc, &alu);
5705 if (r)
5706 return r;
5707 /* replicate result */
5708 return tgsi_helper_tempx_replicate(ctx);
5709 }
5710
5711 static int cayman_pow(struct r600_shader_ctx *ctx)
5712 {
5713 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5714 int i, r;
5715 struct r600_bytecode_alu alu;
5716 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5717
5718 for (i = 0; i < 3; i++) {
5719 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5720 alu.op = ALU_OP1_LOG_IEEE;
5721 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5722 alu.dst.sel = ctx->temp_reg;
5723 alu.dst.chan = i;
5724 alu.dst.write = 1;
5725 if (i == 2)
5726 alu.last = 1;
5727 r = r600_bytecode_add_alu(ctx->bc, &alu);
5728 if (r)
5729 return r;
5730 }
5731
5732 /* b * LOG2(a) */
5733 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5734 alu.op = ALU_OP2_MUL;
5735 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5736 alu.src[1].sel = ctx->temp_reg;
5737 alu.dst.sel = ctx->temp_reg;
5738 alu.dst.write = 1;
5739 alu.last = 1;
5740 r = r600_bytecode_add_alu(ctx->bc, &alu);
5741 if (r)
5742 return r;
5743
5744 for (i = 0; i < last_slot; i++) {
5745 /* POW(a,b) = EXP2(b * LOG2(a))*/
5746 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5747 alu.op = ALU_OP1_EXP_IEEE;
5748 alu.src[0].sel = ctx->temp_reg;
5749
5750 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5751 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5752 if (i == last_slot - 1)
5753 alu.last = 1;
5754 r = r600_bytecode_add_alu(ctx->bc, &alu);
5755 if (r)
5756 return r;
5757 }
5758 return 0;
5759 }
5760
5761 static int tgsi_pow(struct r600_shader_ctx *ctx)
5762 {
5763 struct r600_bytecode_alu alu;
5764 int r;
5765
5766 /* LOG2(a) */
5767 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5768 alu.op = ALU_OP1_LOG_IEEE;
5769 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5770 alu.dst.sel = ctx->temp_reg;
5771 alu.dst.write = 1;
5772 alu.last = 1;
5773 r = r600_bytecode_add_alu(ctx->bc, &alu);
5774 if (r)
5775 return r;
5776 /* b * LOG2(a) */
5777 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5778 alu.op = ALU_OP2_MUL;
5779 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5780 alu.src[1].sel = ctx->temp_reg;
5781 alu.dst.sel = ctx->temp_reg;
5782 alu.dst.write = 1;
5783 alu.last = 1;
5784 r = r600_bytecode_add_alu(ctx->bc, &alu);
5785 if (r)
5786 return r;
5787 /* POW(a,b) = EXP2(b * LOG2(a))*/
5788 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5789 alu.op = ALU_OP1_EXP_IEEE;
5790 alu.src[0].sel = ctx->temp_reg;
5791 alu.dst.sel = ctx->temp_reg;
5792 alu.dst.write = 1;
5793 alu.last = 1;
5794 r = r600_bytecode_add_alu(ctx->bc, &alu);
5795 if (r)
5796 return r;
5797 return tgsi_helper_tempx_replicate(ctx);
5798 }
5799
5800 static int emit_mul_int_op(struct r600_bytecode *bc,
5801 struct r600_bytecode_alu *alu_src)
5802 {
5803 struct r600_bytecode_alu alu;
5804 int i, r;
5805 alu = *alu_src;
5806 if (bc->chip_class == CAYMAN) {
5807 for (i = 0; i < 4; i++) {
5808 alu.dst.chan = i;
5809 alu.dst.write = (i == alu_src->dst.chan);
5810 alu.last = (i == 3);
5811
5812 r = r600_bytecode_add_alu(bc, &alu);
5813 if (r)
5814 return r;
5815 }
5816 } else {
5817 alu.last = 1;
5818 r = r600_bytecode_add_alu(bc, &alu);
5819 if (r)
5820 return r;
5821 }
5822 return 0;
5823 }
5824
5825 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5826 {
5827 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5828 struct r600_bytecode_alu alu;
5829 int i, r, j;
5830 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5831 int lasti = tgsi_last_instruction(write_mask);
5832 int tmp0 = ctx->temp_reg;
5833 int tmp1 = r600_get_temp(ctx);
5834 int tmp2 = r600_get_temp(ctx);
5835 int tmp3 = r600_get_temp(ctx);
5836 int tmp4 = 0;
5837
5838 /* Use additional temp if dst register and src register are the same */
5839 if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
5840 inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
5841 tmp4 = r600_get_temp(ctx);
5842 }
5843
5844 /* Unsigned path:
5845 *
5846 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5847 *
5848 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
5849 * 2. tmp0.z = lo (tmp0.x * src2)
5850 * 3. tmp0.w = -tmp0.z
5851 * 4. tmp0.y = hi (tmp0.x * src2)
5852 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
5853 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
5854 * 7. tmp1.x = tmp0.x - tmp0.w
5855 * 8. tmp1.y = tmp0.x + tmp0.w
5856 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5857 * 10. tmp0.z = hi(tmp0.x * src1) = q
5858 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
5859 *
5860 * 12. tmp0.w = src1 - tmp0.y = r
5861 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
5862 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
5863 *
5864 * if DIV
5865 *
5866 * 15. tmp1.z = tmp0.z + 1 = q + 1
5867 * 16. tmp1.w = tmp0.z - 1 = q - 1
5868 *
5869 * else MOD
5870 *
5871 * 15. tmp1.z = tmp0.w - src2 = r - src2
5872 * 16. tmp1.w = tmp0.w + src2 = r + src2
5873 *
5874 * endif
5875 *
5876 * 17. tmp1.x = tmp1.x & tmp1.y
5877 *
5878 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5879 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5880 *
5881 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5882 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5883 *
5884 * Signed path:
5885 *
5886 * Same as unsigned, using abs values of the operands,
5887 * and fixing the sign of the result in the end.
5888 */
5889
5890 for (i = 0; i < 4; i++) {
5891 if (!(write_mask & (1<<i)))
5892 continue;
5893
5894 if (signed_op) {
5895
5896 /* tmp2.x = -src0 */
5897 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5898 alu.op = ALU_OP2_SUB_INT;
5899
5900 alu.dst.sel = tmp2;
5901 alu.dst.chan = 0;
5902 alu.dst.write = 1;
5903
5904 alu.src[0].sel = V_SQ_ALU_SRC_0;
5905
5906 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5907
5908 alu.last = 1;
5909 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5910 return r;
5911
5912 /* tmp2.y = -src1 */
5913 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5914 alu.op = ALU_OP2_SUB_INT;
5915
5916 alu.dst.sel = tmp2;
5917 alu.dst.chan = 1;
5918 alu.dst.write = 1;
5919
5920 alu.src[0].sel = V_SQ_ALU_SRC_0;
5921
5922 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5923
5924 alu.last = 1;
5925 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5926 return r;
5927
5928 /* tmp2.z sign bit is set if src0 and src2 signs are different */
5929 /* it will be a sign of the quotient */
5930 if (!mod) {
5931
5932 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5933 alu.op = ALU_OP2_XOR_INT;
5934
5935 alu.dst.sel = tmp2;
5936 alu.dst.chan = 2;
5937 alu.dst.write = 1;
5938
5939 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5940 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5941
5942 alu.last = 1;
5943 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5944 return r;
5945 }
5946
5947 /* tmp2.x = |src0| */
5948 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5949 alu.op = ALU_OP3_CNDGE_INT;
5950 alu.is_op3 = 1;
5951
5952 alu.dst.sel = tmp2;
5953 alu.dst.chan = 0;
5954 alu.dst.write = 1;
5955
5956 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5957 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5958 alu.src[2].sel = tmp2;
5959 alu.src[2].chan = 0;
5960
5961 alu.last = 1;
5962 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5963 return r;
5964
5965 /* tmp2.y = |src1| */
5966 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5967 alu.op = ALU_OP3_CNDGE_INT;
5968 alu.is_op3 = 1;
5969
5970 alu.dst.sel = tmp2;
5971 alu.dst.chan = 1;
5972 alu.dst.write = 1;
5973
5974 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5975 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5976 alu.src[2].sel = tmp2;
5977 alu.src[2].chan = 1;
5978
5979 alu.last = 1;
5980 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5981 return r;
5982
5983 }
5984
5985 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
5986 if (ctx->bc->chip_class == CAYMAN) {
5987 /* tmp3.x = u2f(src2) */
5988 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5989 alu.op = ALU_OP1_UINT_TO_FLT;
5990
5991 alu.dst.sel = tmp3;
5992 alu.dst.chan = 0;
5993 alu.dst.write = 1;
5994
5995 if (signed_op) {
5996 alu.src[0].sel = tmp2;
5997 alu.src[0].chan = 1;
5998 } else {
5999 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6000 }
6001
6002 alu.last = 1;
6003 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6004 return r;
6005
6006 /* tmp0.x = recip(tmp3.x) */
6007 for (j = 0 ; j < 3; j++) {
6008 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6009 alu.op = ALU_OP1_RECIP_IEEE;
6010
6011 alu.dst.sel = tmp0;
6012 alu.dst.chan = j;
6013 alu.dst.write = (j == 0);
6014
6015 alu.src[0].sel = tmp3;
6016 alu.src[0].chan = 0;
6017
6018 if (j == 2)
6019 alu.last = 1;
6020 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6021 return r;
6022 }
6023
6024 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6025 alu.op = ALU_OP2_MUL;
6026
6027 alu.src[0].sel = tmp0;
6028 alu.src[0].chan = 0;
6029
6030 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6031 alu.src[1].value = 0x4f800000;
6032
6033 alu.dst.sel = tmp3;
6034 alu.dst.write = 1;
6035 alu.last = 1;
6036 r = r600_bytecode_add_alu(ctx->bc, &alu);
6037 if (r)
6038 return r;
6039
6040 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6041 alu.op = ALU_OP1_FLT_TO_UINT;
6042
6043 alu.dst.sel = tmp0;
6044 alu.dst.chan = 0;
6045 alu.dst.write = 1;
6046
6047 alu.src[0].sel = tmp3;
6048 alu.src[0].chan = 0;
6049
6050 alu.last = 1;
6051 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6052 return r;
6053
6054 } else {
6055 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6056 alu.op = ALU_OP1_RECIP_UINT;
6057
6058 alu.dst.sel = tmp0;
6059 alu.dst.chan = 0;
6060 alu.dst.write = 1;
6061
6062 if (signed_op) {
6063 alu.src[0].sel = tmp2;
6064 alu.src[0].chan = 1;
6065 } else {
6066 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6067 }
6068
6069 alu.last = 1;
6070 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6071 return r;
6072 }
6073
6074 /* 2. tmp0.z = lo (tmp0.x * src2) */
6075 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6076 alu.op = ALU_OP2_MULLO_UINT;
6077
6078 alu.dst.sel = tmp0;
6079 alu.dst.chan = 2;
6080 alu.dst.write = 1;
6081
6082 alu.src[0].sel = tmp0;
6083 alu.src[0].chan = 0;
6084 if (signed_op) {
6085 alu.src[1].sel = tmp2;
6086 alu.src[1].chan = 1;
6087 } else {
6088 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6089 }
6090
6091 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6092 return r;
6093
6094 /* 3. tmp0.w = -tmp0.z */
6095 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6096 alu.op = ALU_OP2_SUB_INT;
6097
6098 alu.dst.sel = tmp0;
6099 alu.dst.chan = 3;
6100 alu.dst.write = 1;
6101
6102 alu.src[0].sel = V_SQ_ALU_SRC_0;
6103 alu.src[1].sel = tmp0;
6104 alu.src[1].chan = 2;
6105
6106 alu.last = 1;
6107 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6108 return r;
6109
6110 /* 4. tmp0.y = hi (tmp0.x * src2) */
6111 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6112 alu.op = ALU_OP2_MULHI_UINT;
6113
6114 alu.dst.sel = tmp0;
6115 alu.dst.chan = 1;
6116 alu.dst.write = 1;
6117
6118 alu.src[0].sel = tmp0;
6119 alu.src[0].chan = 0;
6120
6121 if (signed_op) {
6122 alu.src[1].sel = tmp2;
6123 alu.src[1].chan = 1;
6124 } else {
6125 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6126 }
6127
6128 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6129 return r;
6130
6131 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
6132 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6133 alu.op = ALU_OP3_CNDE_INT;
6134 alu.is_op3 = 1;
6135
6136 alu.dst.sel = tmp0;
6137 alu.dst.chan = 2;
6138 alu.dst.write = 1;
6139
6140 alu.src[0].sel = tmp0;
6141 alu.src[0].chan = 1;
6142 alu.src[1].sel = tmp0;
6143 alu.src[1].chan = 3;
6144 alu.src[2].sel = tmp0;
6145 alu.src[2].chan = 2;
6146
6147 alu.last = 1;
6148 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6149 return r;
6150
6151 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
6152 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6153 alu.op = ALU_OP2_MULHI_UINT;
6154
6155 alu.dst.sel = tmp0;
6156 alu.dst.chan = 3;
6157 alu.dst.write = 1;
6158
6159 alu.src[0].sel = tmp0;
6160 alu.src[0].chan = 2;
6161
6162 alu.src[1].sel = tmp0;
6163 alu.src[1].chan = 0;
6164
6165 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6166 return r;
6167
6168 /* 7. tmp1.x = tmp0.x - tmp0.w */
6169 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6170 alu.op = ALU_OP2_SUB_INT;
6171
6172 alu.dst.sel = tmp1;
6173 alu.dst.chan = 0;
6174 alu.dst.write = 1;
6175
6176 alu.src[0].sel = tmp0;
6177 alu.src[0].chan = 0;
6178 alu.src[1].sel = tmp0;
6179 alu.src[1].chan = 3;
6180
6181 alu.last = 1;
6182 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6183 return r;
6184
6185 /* 8. tmp1.y = tmp0.x + tmp0.w */
6186 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6187 alu.op = ALU_OP2_ADD_INT;
6188
6189 alu.dst.sel = tmp1;
6190 alu.dst.chan = 1;
6191 alu.dst.write = 1;
6192
6193 alu.src[0].sel = tmp0;
6194 alu.src[0].chan = 0;
6195 alu.src[1].sel = tmp0;
6196 alu.src[1].chan = 3;
6197
6198 alu.last = 1;
6199 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6200 return r;
6201
6202 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6203 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6204 alu.op = ALU_OP3_CNDE_INT;
6205 alu.is_op3 = 1;
6206
6207 alu.dst.sel = tmp0;
6208 alu.dst.chan = 0;
6209 alu.dst.write = 1;
6210
6211 alu.src[0].sel = tmp0;
6212 alu.src[0].chan = 1;
6213 alu.src[1].sel = tmp1;
6214 alu.src[1].chan = 1;
6215 alu.src[2].sel = tmp1;
6216 alu.src[2].chan = 0;
6217
6218 alu.last = 1;
6219 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6220 return r;
6221
6222 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
6223 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6224 alu.op = ALU_OP2_MULHI_UINT;
6225
6226 alu.dst.sel = tmp0;
6227 alu.dst.chan = 2;
6228 alu.dst.write = 1;
6229
6230 alu.src[0].sel = tmp0;
6231 alu.src[0].chan = 0;
6232
6233 if (signed_op) {
6234 alu.src[1].sel = tmp2;
6235 alu.src[1].chan = 0;
6236 } else {
6237 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6238 }
6239
6240 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6241 return r;
6242
6243 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
6244 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6245 alu.op = ALU_OP2_MULLO_UINT;
6246
6247 alu.dst.sel = tmp0;
6248 alu.dst.chan = 1;
6249 alu.dst.write = 1;
6250
6251 if (signed_op) {
6252 alu.src[0].sel = tmp2;
6253 alu.src[0].chan = 1;
6254 } else {
6255 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6256 }
6257
6258 alu.src[1].sel = tmp0;
6259 alu.src[1].chan = 2;
6260
6261 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6262 return r;
6263
6264 /* 12. tmp0.w = src1 - tmp0.y = r */
6265 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6266 alu.op = ALU_OP2_SUB_INT;
6267
6268 alu.dst.sel = tmp0;
6269 alu.dst.chan = 3;
6270 alu.dst.write = 1;
6271
6272 if (signed_op) {
6273 alu.src[0].sel = tmp2;
6274 alu.src[0].chan = 0;
6275 } else {
6276 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6277 }
6278
6279 alu.src[1].sel = tmp0;
6280 alu.src[1].chan = 1;
6281
6282 alu.last = 1;
6283 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6284 return r;
6285
6286 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
6287 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6288 alu.op = ALU_OP2_SETGE_UINT;
6289
6290 alu.dst.sel = tmp1;
6291 alu.dst.chan = 0;
6292 alu.dst.write = 1;
6293
6294 alu.src[0].sel = tmp0;
6295 alu.src[0].chan = 3;
6296 if (signed_op) {
6297 alu.src[1].sel = tmp2;
6298 alu.src[1].chan = 1;
6299 } else {
6300 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6301 }
6302
6303 alu.last = 1;
6304 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6305 return r;
6306
6307 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
6308 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6309 alu.op = ALU_OP2_SETGE_UINT;
6310
6311 alu.dst.sel = tmp1;
6312 alu.dst.chan = 1;
6313 alu.dst.write = 1;
6314
6315 if (signed_op) {
6316 alu.src[0].sel = tmp2;
6317 alu.src[0].chan = 0;
6318 } else {
6319 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6320 }
6321
6322 alu.src[1].sel = tmp0;
6323 alu.src[1].chan = 1;
6324
6325 alu.last = 1;
6326 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6327 return r;
6328
6329 if (mod) { /* UMOD */
6330
6331 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
6332 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6333 alu.op = ALU_OP2_SUB_INT;
6334
6335 alu.dst.sel = tmp1;
6336 alu.dst.chan = 2;
6337 alu.dst.write = 1;
6338
6339 alu.src[0].sel = tmp0;
6340 alu.src[0].chan = 3;
6341
6342 if (signed_op) {
6343 alu.src[1].sel = tmp2;
6344 alu.src[1].chan = 1;
6345 } else {
6346 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6347 }
6348
6349 alu.last = 1;
6350 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6351 return r;
6352
6353 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
6354 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6355 alu.op = ALU_OP2_ADD_INT;
6356
6357 alu.dst.sel = tmp1;
6358 alu.dst.chan = 3;
6359 alu.dst.write = 1;
6360
6361 alu.src[0].sel = tmp0;
6362 alu.src[0].chan = 3;
6363 if (signed_op) {
6364 alu.src[1].sel = tmp2;
6365 alu.src[1].chan = 1;
6366 } else {
6367 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6368 }
6369
6370 alu.last = 1;
6371 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6372 return r;
6373
6374 } else { /* UDIV */
6375
6376 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
6377 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6378 alu.op = ALU_OP2_ADD_INT;
6379
6380 alu.dst.sel = tmp1;
6381 alu.dst.chan = 2;
6382 alu.dst.write = 1;
6383
6384 alu.src[0].sel = tmp0;
6385 alu.src[0].chan = 2;
6386 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6387
6388 alu.last = 1;
6389 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6390 return r;
6391
6392 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
6393 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6394 alu.op = ALU_OP2_ADD_INT;
6395
6396 alu.dst.sel = tmp1;
6397 alu.dst.chan = 3;
6398 alu.dst.write = 1;
6399
6400 alu.src[0].sel = tmp0;
6401 alu.src[0].chan = 2;
6402 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6403
6404 alu.last = 1;
6405 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6406 return r;
6407
6408 }
6409
6410 /* 17. tmp1.x = tmp1.x & tmp1.y */
6411 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6412 alu.op = ALU_OP2_AND_INT;
6413
6414 alu.dst.sel = tmp1;
6415 alu.dst.chan = 0;
6416 alu.dst.write = 1;
6417
6418 alu.src[0].sel = tmp1;
6419 alu.src[0].chan = 0;
6420 alu.src[1].sel = tmp1;
6421 alu.src[1].chan = 1;
6422
6423 alu.last = 1;
6424 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6425 return r;
6426
6427 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
6428 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
6429 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6430 alu.op = ALU_OP3_CNDE_INT;
6431 alu.is_op3 = 1;
6432
6433 alu.dst.sel = tmp0;
6434 alu.dst.chan = 2;
6435 alu.dst.write = 1;
6436
6437 alu.src[0].sel = tmp1;
6438 alu.src[0].chan = 0;
6439 alu.src[1].sel = tmp0;
6440 alu.src[1].chan = mod ? 3 : 2;
6441 alu.src[2].sel = tmp1;
6442 alu.src[2].chan = 2;
6443
6444 alu.last = 1;
6445 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6446 return r;
6447
6448 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6449 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6450 alu.op = ALU_OP3_CNDE_INT;
6451 alu.is_op3 = 1;
6452
6453 if (signed_op) {
6454 alu.dst.sel = tmp0;
6455 alu.dst.chan = 2;
6456 alu.dst.write = 1;
6457 } else {
6458 if (tmp4 > 0) {
6459 alu.dst.sel = tmp4;
6460 alu.dst.chan = i;
6461 alu.dst.write = 1;
6462 } else {
6463 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6464 }
6465 }
6466
6467 alu.src[0].sel = tmp1;
6468 alu.src[0].chan = 1;
6469 alu.src[1].sel = tmp1;
6470 alu.src[1].chan = 3;
6471 alu.src[2].sel = tmp0;
6472 alu.src[2].chan = 2;
6473
6474 alu.last = 1;
6475 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6476 return r;
6477
6478 if (signed_op) {
6479
6480 /* fix the sign of the result */
6481
6482 if (mod) {
6483
6484 /* tmp0.x = -tmp0.z */
6485 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6486 alu.op = ALU_OP2_SUB_INT;
6487
6488 alu.dst.sel = tmp0;
6489 alu.dst.chan = 0;
6490 alu.dst.write = 1;
6491
6492 alu.src[0].sel = V_SQ_ALU_SRC_0;
6493 alu.src[1].sel = tmp0;
6494 alu.src[1].chan = 2;
6495
6496 alu.last = 1;
6497 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6498 return r;
6499
6500 /* sign of the remainder is the same as the sign of src0 */
6501 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6502 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6503 alu.op = ALU_OP3_CNDGE_INT;
6504 alu.is_op3 = 1;
6505
6506 if (tmp4 > 0) {
6507 alu.dst.sel = tmp4;
6508 alu.dst.chan = i;
6509 alu.dst.write = 1;
6510 } else {
6511 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6512 }
6513
6514 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6515 alu.src[1].sel = tmp0;
6516 alu.src[1].chan = 2;
6517 alu.src[2].sel = tmp0;
6518 alu.src[2].chan = 0;
6519
6520 alu.last = 1;
6521 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6522 return r;
6523
6524 } else {
6525
6526 /* tmp0.x = -tmp0.z */
6527 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6528 alu.op = ALU_OP2_SUB_INT;
6529
6530 alu.dst.sel = tmp0;
6531 alu.dst.chan = 0;
6532 alu.dst.write = 1;
6533
6534 alu.src[0].sel = V_SQ_ALU_SRC_0;
6535 alu.src[1].sel = tmp0;
6536 alu.src[1].chan = 2;
6537
6538 alu.last = 1;
6539 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6540 return r;
6541
6542 /* fix the quotient sign (same as the sign of src0*src1) */
6543 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6544 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6545 alu.op = ALU_OP3_CNDGE_INT;
6546 alu.is_op3 = 1;
6547
6548 if (tmp4 > 0) {
6549 alu.dst.sel = tmp4;
6550 alu.dst.chan = i;
6551 alu.dst.write = 1;
6552 } else {
6553 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6554 }
6555
6556 alu.src[0].sel = tmp2;
6557 alu.src[0].chan = 2;
6558 alu.src[1].sel = tmp0;
6559 alu.src[1].chan = 2;
6560 alu.src[2].sel = tmp0;
6561 alu.src[2].chan = 0;
6562
6563 alu.last = 1;
6564 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6565 return r;
6566 }
6567 }
6568 }
6569
6570 if (tmp4 > 0) {
6571 for (i = 0; i <= lasti; ++i) {
6572 if (!(write_mask & (1<<i)))
6573 continue;
6574
6575 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6576 alu.op = ALU_OP1_MOV;
6577 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6578 alu.src[0].sel = tmp4;
6579 alu.src[0].chan = i;
6580
6581 if (i == lasti)
6582 alu.last = 1;
6583 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6584 return r;
6585 }
6586 }
6587
6588 return 0;
6589 }
6590
/* TGSI UDIV: forwards to the shared div/mod emitter with both flags cleared
 * (quotient requested, operands treated as unsigned). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int operands_signed = 0;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6595
/* TGSI UMOD: forwards to the shared div/mod emitter asking for the
 * remainder of an unsigned division. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int operands_signed = 0;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6600
/* TGSI IDIV: forwards to the shared div/mod emitter asking for the
 * quotient of a signed division. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int operands_signed = 1;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6605
/* TGSI IMOD: forwards to the shared div/mod emitter asking for the
 * remainder of a signed division. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int operands_signed = 1;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6610
6611
6612 static int tgsi_f2i(struct r600_shader_ctx *ctx)
6613 {
6614 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6615 struct r600_bytecode_alu alu;
6616 int i, r;
6617 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6618 int last_inst = tgsi_last_instruction(write_mask);
6619
6620 for (i = 0; i < 4; i++) {
6621 if (!(write_mask & (1<<i)))
6622 continue;
6623
6624 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6625 alu.op = ALU_OP1_TRUNC;
6626
6627 alu.dst.sel = ctx->temp_reg;
6628 alu.dst.chan = i;
6629 alu.dst.write = 1;
6630
6631 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6632 if (i == last_inst)
6633 alu.last = 1;
6634 r = r600_bytecode_add_alu(ctx->bc, &alu);
6635 if (r)
6636 return r;
6637 }
6638
6639 for (i = 0; i < 4; i++) {
6640 if (!(write_mask & (1<<i)))
6641 continue;
6642
6643 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6644 alu.op = ctx->inst_info->op;
6645
6646 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6647
6648 alu.src[0].sel = ctx->temp_reg;
6649 alu.src[0].chan = i;
6650
6651 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6652 alu.last = 1;
6653 r = r600_bytecode_add_alu(ctx->bc, &alu);
6654 if (r)
6655 return r;
6656 }
6657
6658 return 0;
6659 }
6660
6661 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6662 {
6663 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6664 struct r600_bytecode_alu alu;
6665 int i, r;
6666 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6667 int last_inst = tgsi_last_instruction(write_mask);
6668
6669 /* tmp = -src */
6670 for (i = 0; i < 4; i++) {
6671 if (!(write_mask & (1<<i)))
6672 continue;
6673
6674 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6675 alu.op = ALU_OP2_SUB_INT;
6676
6677 alu.dst.sel = ctx->temp_reg;
6678 alu.dst.chan = i;
6679 alu.dst.write = 1;
6680
6681 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6682 alu.src[0].sel = V_SQ_ALU_SRC_0;
6683
6684 if (i == last_inst)
6685 alu.last = 1;
6686 r = r600_bytecode_add_alu(ctx->bc, &alu);
6687 if (r)
6688 return r;
6689 }
6690
6691 /* dst = (src >= 0 ? src : tmp) */
6692 for (i = 0; i < 4; i++) {
6693 if (!(write_mask & (1<<i)))
6694 continue;
6695
6696 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6697 alu.op = ALU_OP3_CNDGE_INT;
6698 alu.is_op3 = 1;
6699 alu.dst.write = 1;
6700
6701 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6702
6703 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6704 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6705 alu.src[2].sel = ctx->temp_reg;
6706 alu.src[2].chan = i;
6707
6708 if (i == last_inst)
6709 alu.last = 1;
6710 r = r600_bytecode_add_alu(ctx->bc, &alu);
6711 if (r)
6712 return r;
6713 }
6714 return 0;
6715 }
6716
6717 static int tgsi_issg(struct r600_shader_ctx *ctx)
6718 {
6719 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6720 struct r600_bytecode_alu alu;
6721 int i, r;
6722 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6723 int last_inst = tgsi_last_instruction(write_mask);
6724
6725 /* tmp = (src >= 0 ? src : -1) */
6726 for (i = 0; i < 4; i++) {
6727 if (!(write_mask & (1<<i)))
6728 continue;
6729
6730 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6731 alu.op = ALU_OP3_CNDGE_INT;
6732 alu.is_op3 = 1;
6733
6734 alu.dst.sel = ctx->temp_reg;
6735 alu.dst.chan = i;
6736 alu.dst.write = 1;
6737
6738 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6739 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6740 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6741
6742 if (i == last_inst)
6743 alu.last = 1;
6744 r = r600_bytecode_add_alu(ctx->bc, &alu);
6745 if (r)
6746 return r;
6747 }
6748
6749 /* dst = (tmp > 0 ? 1 : tmp) */
6750 for (i = 0; i < 4; i++) {
6751 if (!(write_mask & (1<<i)))
6752 continue;
6753
6754 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6755 alu.op = ALU_OP3_CNDGT_INT;
6756 alu.is_op3 = 1;
6757 alu.dst.write = 1;
6758
6759 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6760
6761 alu.src[0].sel = ctx->temp_reg;
6762 alu.src[0].chan = i;
6763
6764 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6765
6766 alu.src[2].sel = ctx->temp_reg;
6767 alu.src[2].chan = i;
6768
6769 if (i == last_inst)
6770 alu.last = 1;
6771 r = r600_bytecode_add_alu(ctx->bc, &alu);
6772 if (r)
6773 return r;
6774 }
6775 return 0;
6776 }
6777
6778
6779
6780 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6781 {
6782 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6783 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6784 int last_inst = tgsi_last_instruction(write_mask);
6785 struct r600_bytecode_alu alu;
6786 int i, r;
6787
6788 /* tmp = (src > 0 ? 1 : src) */
6789 for (i = 0; i <= last_inst; i++) {
6790 if (!(write_mask & (1 << i)))
6791 continue;
6792 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6793 alu.op = ALU_OP3_CNDGT;
6794 alu.is_op3 = 1;
6795
6796 alu.dst.sel = ctx->temp_reg;
6797 alu.dst.chan = i;
6798
6799 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6800 alu.src[1].sel = V_SQ_ALU_SRC_1;
6801 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6802
6803 if (i == last_inst)
6804 alu.last = 1;
6805 r = r600_bytecode_add_alu(ctx->bc, &alu);
6806 if (r)
6807 return r;
6808 }
6809
6810 /* dst = (-tmp > 0 ? -1 : tmp) */
6811 for (i = 0; i <= last_inst; i++) {
6812 if (!(write_mask & (1 << i)))
6813 continue;
6814 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6815 alu.op = ALU_OP3_CNDGT;
6816 alu.is_op3 = 1;
6817 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6818
6819 alu.src[0].sel = ctx->temp_reg;
6820 alu.src[0].chan = i;
6821 alu.src[0].neg = 1;
6822
6823 alu.src[1].sel = V_SQ_ALU_SRC_1;
6824 alu.src[1].neg = 1;
6825
6826 alu.src[2].sel = ctx->temp_reg;
6827 alu.src[2].chan = i;
6828
6829 if (i == last_inst)
6830 alu.last = 1;
6831 r = r600_bytecode_add_alu(ctx->bc, &alu);
6832 if (r)
6833 return r;
6834 }
6835 return 0;
6836 }
6837
6838 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6839 {
6840 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6841 struct r600_bytecode_alu alu;
6842 int i, r, t1, t2;
6843
6844 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6845 int last_inst = tgsi_last_instruction(write_mask);
6846
6847 t1 = r600_get_temp(ctx);
6848
6849 for (i = 0; i < 4; i++) {
6850 if (!(write_mask & (1<<i)))
6851 continue;
6852
6853 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6854 alu.op = ALU_OP2_SETGE_INT;
6855 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6856 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6857 alu.src[1].value = 32;
6858 alu.dst.sel = ctx->temp_reg;
6859 alu.dst.chan = i;
6860 alu.dst.write = 1;
6861 alu.last = i == last_inst;
6862 r = r600_bytecode_add_alu(ctx->bc, &alu);
6863 if (r)
6864 return r;
6865 }
6866
6867 for (i = 0; i < 4; i++) {
6868 if (!(write_mask & (1<<i)))
6869 continue;
6870
6871 /* create mask tmp */
6872 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6873 alu.op = ALU_OP2_BFM_INT;
6874 alu.dst.sel = t1;
6875 alu.dst.chan = i;
6876 alu.dst.write = 1;
6877 alu.last = i == last_inst;
6878
6879 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6880 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6881
6882 r = r600_bytecode_add_alu(ctx->bc, &alu);
6883 if (r)
6884 return r;
6885 }
6886
6887 t2 = r600_get_temp(ctx);
6888
6889 for (i = 0; i < 4; i++) {
6890 if (!(write_mask & (1<<i)))
6891 continue;
6892
6893 /* shift insert left */
6894 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6895 alu.op = ALU_OP2_LSHL_INT;
6896 alu.dst.sel = t2;
6897 alu.dst.chan = i;
6898 alu.dst.write = 1;
6899 alu.last = i == last_inst;
6900
6901 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6902 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6903
6904 r = r600_bytecode_add_alu(ctx->bc, &alu);
6905 if (r)
6906 return r;
6907 }
6908
6909 for (i = 0; i < 4; i++) {
6910 if (!(write_mask & (1<<i)))
6911 continue;
6912
6913 /* actual bitfield insert */
6914 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6915 alu.op = ALU_OP3_BFI_INT;
6916 alu.is_op3 = 1;
6917 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6918 alu.dst.chan = i;
6919 alu.dst.write = 1;
6920 alu.last = i == last_inst;
6921
6922 alu.src[0].sel = t1;
6923 alu.src[0].chan = i;
6924 alu.src[1].sel = t2;
6925 alu.src[1].chan = i;
6926 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6927
6928 r = r600_bytecode_add_alu(ctx->bc, &alu);
6929 if (r)
6930 return r;
6931 }
6932
6933 for (i = 0; i < 4; i++) {
6934 if (!(write_mask & (1<<i)))
6935 continue;
6936 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6937 alu.op = ALU_OP3_CNDE_INT;
6938 alu.is_op3 = 1;
6939 alu.src[0].sel = ctx->temp_reg;
6940 alu.src[0].chan = i;
6941 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6942
6943 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6944
6945 alu.src[1].sel = alu.dst.sel;
6946 alu.src[1].chan = i;
6947
6948 alu.last = i == last_inst;
6949 r = r600_bytecode_add_alu(ctx->bc, &alu);
6950 if (r)
6951 return r;
6952 }
6953 return 0;
6954 }
6955
6956 static int tgsi_msb(struct r600_shader_ctx *ctx)
6957 {
6958 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6959 struct r600_bytecode_alu alu;
6960 int i, r, t1, t2;
6961
6962 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6963 int last_inst = tgsi_last_instruction(write_mask);
6964
6965 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6966 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6967
6968 t1 = ctx->temp_reg;
6969
6970 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6971 for (i = 0; i < 4; i++) {
6972 if (!(write_mask & (1<<i)))
6973 continue;
6974
6975 /* t1 = FFBH_INT / FFBH_UINT */
6976 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6977 alu.op = ctx->inst_info->op;
6978 alu.dst.sel = t1;
6979 alu.dst.chan = i;
6980 alu.dst.write = 1;
6981 alu.last = i == last_inst;
6982
6983 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6984
6985 r = r600_bytecode_add_alu(ctx->bc, &alu);
6986 if (r)
6987 return r;
6988 }
6989
6990 t2 = r600_get_temp(ctx);
6991
6992 for (i = 0; i < 4; i++) {
6993 if (!(write_mask & (1<<i)))
6994 continue;
6995
6996 /* t2 = 31 - t1 */
6997 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6998 alu.op = ALU_OP2_SUB_INT;
6999 alu.dst.sel = t2;
7000 alu.dst.chan = i;
7001 alu.dst.write = 1;
7002 alu.last = i == last_inst;
7003
7004 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
7005 alu.src[0].value = 31;
7006 alu.src[1].sel = t1;
7007 alu.src[1].chan = i;
7008
7009 r = r600_bytecode_add_alu(ctx->bc, &alu);
7010 if (r)
7011 return r;
7012 }
7013
7014 for (i = 0; i < 4; i++) {
7015 if (!(write_mask & (1<<i)))
7016 continue;
7017
7018 /* result = t1 >= 0 ? t2 : t1 */
7019 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7020 alu.op = ALU_OP3_CNDGE_INT;
7021 alu.is_op3 = 1;
7022 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7023 alu.dst.chan = i;
7024 alu.dst.write = 1;
7025 alu.last = i == last_inst;
7026
7027 alu.src[0].sel = t1;
7028 alu.src[0].chan = i;
7029 alu.src[1].sel = t2;
7030 alu.src[1].chan = i;
7031 alu.src[2].sel = t1;
7032 alu.src[2].chan = i;
7033
7034 r = r600_bytecode_add_alu(ctx->bc, &alu);
7035 if (r)
7036 return r;
7037 }
7038
7039 return 0;
7040 }
7041
7042 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
7043 {
7044 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7045 struct r600_bytecode_alu alu;
7046 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
7047 unsigned location;
7048 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
7049
7050 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
7051
7052 /* Interpolators have been marked for use already by allocate_system_value_inputs */
7053 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7054 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7055 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
7056 }
7057 else {
7058 location = TGSI_INTERPOLATE_LOC_CENTROID;
7059 ctx->shader->input[input].uses_interpolate_at_centroid = 1;
7060 }
7061
7062 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
7063 if (k < 0)
7064 k = 0;
7065 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
7066 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
7067
7068 /* NOTE: currently offset is not perspective correct */
7069 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7070 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7071 int sample_gpr = -1;
7072 int gradientsH, gradientsV;
7073 struct r600_bytecode_tex tex;
7074
7075 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7076 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
7077 }
7078
7079 gradientsH = r600_get_temp(ctx);
7080 gradientsV = r600_get_temp(ctx);
7081 for (i = 0; i < 2; i++) {
7082 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7083 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
7084 tex.src_gpr = interp_gpr;
7085 tex.src_sel_x = interp_base_chan + 0;
7086 tex.src_sel_y = interp_base_chan + 1;
7087 tex.src_sel_z = 0;
7088 tex.src_sel_w = 0;
7089 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
7090 tex.dst_sel_x = 0;
7091 tex.dst_sel_y = 1;
7092 tex.dst_sel_z = 7;
7093 tex.dst_sel_w = 7;
7094 tex.inst_mod = 1; // Use per pixel gradient calculation
7095 tex.sampler_id = 0;
7096 tex.resource_id = tex.sampler_id;
7097 r = r600_bytecode_add_tex(ctx->bc, &tex);
7098 if (r)
7099 return r;
7100 }
7101
7102 for (i = 0; i < 2; i++) {
7103 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7104 alu.op = ALU_OP3_MULADD;
7105 alu.is_op3 = 1;
7106 alu.src[0].sel = gradientsH;
7107 alu.src[0].chan = i;
7108 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7109 alu.src[1].sel = sample_gpr;
7110 alu.src[1].chan = 2;
7111 }
7112 else {
7113 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
7114 }
7115 alu.src[2].sel = interp_gpr;
7116 alu.src[2].chan = interp_base_chan + i;
7117 alu.dst.sel = ctx->temp_reg;
7118 alu.dst.chan = i;
7119 alu.last = i == 1;
7120
7121 r = r600_bytecode_add_alu(ctx->bc, &alu);
7122 if (r)
7123 return r;
7124 }
7125
7126 for (i = 0; i < 2; i++) {
7127 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7128 alu.op = ALU_OP3_MULADD;
7129 alu.is_op3 = 1;
7130 alu.src[0].sel = gradientsV;
7131 alu.src[0].chan = i;
7132 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7133 alu.src[1].sel = sample_gpr;
7134 alu.src[1].chan = 3;
7135 }
7136 else {
7137 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
7138 }
7139 alu.src[2].sel = ctx->temp_reg;
7140 alu.src[2].chan = i;
7141 alu.dst.sel = ctx->temp_reg;
7142 alu.dst.chan = i;
7143 alu.last = i == 1;
7144
7145 r = r600_bytecode_add_alu(ctx->bc, &alu);
7146 if (r)
7147 return r;
7148 }
7149 }
7150
7151 tmp = r600_get_temp(ctx);
7152 for (i = 0; i < 8; i++) {
7153 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7154 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
7155
7156 alu.dst.sel = tmp;
7157 if ((i > 1 && i < 6)) {
7158 alu.dst.write = 1;
7159 }
7160 else {
7161 alu.dst.write = 0;
7162 }
7163 alu.dst.chan = i % 4;
7164
7165 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7166 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7167 alu.src[0].sel = ctx->temp_reg;
7168 alu.src[0].chan = 1 - (i % 2);
7169 } else {
7170 alu.src[0].sel = interp_gpr;
7171 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
7172 }
7173 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
7174 alu.src[1].chan = 0;
7175
7176 alu.last = i % 4 == 3;
7177 alu.bank_swizzle_force = SQ_ALU_VEC_210;
7178
7179 r = r600_bytecode_add_alu(ctx->bc, &alu);
7180 if (r)
7181 return r;
7182 }
7183
7184 // INTERP can't swizzle dst
7185 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7186 for (i = 0; i <= lasti; i++) {
7187 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7188 continue;
7189
7190 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7191 alu.op = ALU_OP1_MOV;
7192 alu.src[0].sel = tmp;
7193 alu.src[0].chan = ctx->src[0].swizzle[i];
7194 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7195 alu.dst.write = 1;
7196 alu.last = i == lasti;
7197 r = r600_bytecode_add_alu(ctx->bc, &alu);
7198 if (r)
7199 return r;
7200 }
7201
7202 return 0;
7203 }
7204
7205
7206 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7207 {
7208 struct r600_bytecode_alu alu;
7209 int i, r;
7210
7211 for (i = 0; i < 4; i++) {
7212 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7213 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7214 alu.op = ALU_OP0_NOP;
7215 alu.dst.chan = i;
7216 } else {
7217 alu.op = ALU_OP1_MOV;
7218 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7219 alu.src[0].sel = ctx->temp_reg;
7220 alu.src[0].chan = i;
7221 }
7222 if (i == 3) {
7223 alu.last = 1;
7224 }
7225 r = r600_bytecode_add_alu(ctx->bc, &alu);
7226 if (r)
7227 return r;
7228 }
7229 return 0;
7230 }
7231
7232 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
7233 unsigned writemask,
7234 struct r600_bytecode_alu_src *bc_src,
7235 const struct r600_shader_src *shader_src)
7236 {
7237 struct r600_bytecode_alu alu;
7238 int i, r;
7239 int lasti = tgsi_last_instruction(writemask);
7240 int temp_reg = 0;
7241
7242 r600_bytecode_src(&bc_src[0], shader_src, 0);
7243 r600_bytecode_src(&bc_src[1], shader_src, 1);
7244 r600_bytecode_src(&bc_src[2], shader_src, 2);
7245 r600_bytecode_src(&bc_src[3], shader_src, 3);
7246
7247 if (bc_src->abs) {
7248 temp_reg = r600_get_temp(ctx);
7249
7250 for (i = 0; i < lasti + 1; i++) {
7251 if (!(writemask & (1 << i)))
7252 continue;
7253 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7254 alu.op = ALU_OP1_MOV;
7255 alu.dst.sel = temp_reg;
7256 alu.dst.chan = i;
7257 alu.dst.write = 1;
7258 alu.src[0] = bc_src[i];
7259 if (i == lasti) {
7260 alu.last = 1;
7261 }
7262 r = r600_bytecode_add_alu(ctx->bc, &alu);
7263 if (r)
7264 return r;
7265 memset(&bc_src[i], 0, sizeof(*bc_src));
7266 bc_src[i].sel = temp_reg;
7267 bc_src[i].chan = i;
7268 }
7269 }
7270 return 0;
7271 }
7272
7273 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
7274 {
7275 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7276 struct r600_bytecode_alu alu;
7277 struct r600_bytecode_alu_src srcs[4][4];
7278 int i, j, r;
7279 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7280 unsigned op = ctx->inst_info->op;
7281
7282 if (op == ALU_OP3_MULADD_IEEE &&
7283 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7284 op = ALU_OP3_MULADD;
7285
7286 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7287 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
7288 srcs[j], &ctx->src[j]);
7289 if (r)
7290 return r;
7291 }
7292
7293 for (i = 0; i < lasti + 1; i++) {
7294 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7295 continue;
7296
7297 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7298 alu.op = op;
7299 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7300 alu.src[j] = srcs[j][i];
7301 }
7302
7303 if (dst == -1) {
7304 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7305 } else {
7306 alu.dst.sel = dst;
7307 }
7308 alu.dst.chan = i;
7309 alu.dst.write = 1;
7310 alu.is_op3 = 1;
7311 if (i == lasti) {
7312 alu.last = 1;
7313 }
7314 r = r600_bytecode_add_alu(ctx->bc, &alu);
7315 if (r)
7316 return r;
7317 }
7318 return 0;
7319 }
7320
/* Generic three-source opcode handler: emit into the instruction's own
 * TGSI destination (signalled to tgsi_op3_dst() by passing -1). */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	const int use_tgsi_destination = -1;

	return tgsi_op3_dst(ctx, use_tgsi_destination);
}
7325
7326 static int tgsi_dp(struct r600_shader_ctx *ctx)
7327 {
7328 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7329 struct r600_bytecode_alu alu;
7330 int i, j, r;
7331 unsigned op = ctx->inst_info->op;
7332 if (op == ALU_OP2_DOT4_IEEE &&
7333 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7334 op = ALU_OP2_DOT4;
7335
7336 for (i = 0; i < 4; i++) {
7337 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7338 alu.op = op;
7339 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7340 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7341 }
7342
7343 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7344 alu.dst.chan = i;
7345 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7346 /* handle some special cases */
7347 switch (inst->Instruction.Opcode) {
7348 case TGSI_OPCODE_DP2:
7349 if (i > 1) {
7350 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7351 alu.src[0].chan = alu.src[1].chan = 0;
7352 }
7353 break;
7354 case TGSI_OPCODE_DP3:
7355 if (i > 2) {
7356 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7357 alu.src[0].chan = alu.src[1].chan = 0;
7358 }
7359 break;
7360 default:
7361 break;
7362 }
7363 if (i == 3) {
7364 alu.last = 1;
7365 }
7366 r = r600_bytecode_add_alu(ctx->bc, &alu);
7367 if (r)
7368 return r;
7369 }
7370 return 0;
7371 }
7372
7373 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7374 unsigned index)
7375 {
7376 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7377 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7378 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7379 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7380 ctx->src[index].neg || ctx->src[index].abs ||
7381 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7382 }
7383
7384 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
7385 unsigned index)
7386 {
7387 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7388 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
7389 }
7390
7391 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
7392 {
7393 struct r600_bytecode_vtx vtx;
7394 struct r600_bytecode_alu alu;
7395 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7396 int src_gpr, r, i;
7397 int id = tgsi_tex_get_src_gpr(ctx, 1);
7398 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7399
7400 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7401 if (src_requires_loading) {
7402 for (i = 0; i < 4; i++) {
7403 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7404 alu.op = ALU_OP1_MOV;
7405 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7406 alu.dst.sel = ctx->temp_reg;
7407 alu.dst.chan = i;
7408 if (i == 3)
7409 alu.last = 1;
7410 alu.dst.write = 1;
7411 r = r600_bytecode_add_alu(ctx->bc, &alu);
7412 if (r)
7413 return r;
7414 }
7415 src_gpr = ctx->temp_reg;
7416 }
7417
7418 memset(&vtx, 0, sizeof(vtx));
7419 vtx.op = FETCH_OP_VFETCH;
7420 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
7421 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7422 vtx.src_gpr = src_gpr;
7423 vtx.mega_fetch_count = 16;
7424 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7425 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7426 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
7427 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
7428 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
7429 vtx.use_const_fields = 1;
7430 vtx.buffer_index_mode = sampler_index_mode;
7431
7432 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
7433 return r;
7434
7435 if (ctx->bc->chip_class >= EVERGREEN)
7436 return 0;
7437
7438 for (i = 0; i < 4; i++) {
7439 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7440 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7441 continue;
7442
7443 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7444 alu.op = ALU_OP2_AND_INT;
7445
7446 alu.dst.chan = i;
7447 alu.dst.sel = vtx.dst_gpr;
7448 alu.dst.write = 1;
7449
7450 alu.src[0].sel = vtx.dst_gpr;
7451 alu.src[0].chan = i;
7452
7453 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
7454 alu.src[1].sel += (id * 2);
7455 alu.src[1].chan = i % 4;
7456 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7457
7458 if (i == lasti)
7459 alu.last = 1;
7460 r = r600_bytecode_add_alu(ctx->bc, &alu);
7461 if (r)
7462 return r;
7463 }
7464
7465 if (inst->Dst[0].Register.WriteMask & 3) {
7466 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7467 alu.op = ALU_OP2_OR_INT;
7468
7469 alu.dst.chan = 3;
7470 alu.dst.sel = vtx.dst_gpr;
7471 alu.dst.write = 1;
7472
7473 alu.src[0].sel = vtx.dst_gpr;
7474 alu.src[0].chan = 3;
7475
7476 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
7477 alu.src[1].chan = 0;
7478 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7479
7480 alu.last = 1;
7481 r = r600_bytecode_add_alu(ctx->bc, &alu);
7482 if (r)
7483 return r;
7484 }
7485 return 0;
7486 }
7487
7488 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
7489 {
7490 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7491 int r;
7492 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
7493 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7494
7495 if (ctx->bc->chip_class < EVERGREEN) {
7496 struct r600_bytecode_alu alu;
7497 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7498 alu.op = ALU_OP1_MOV;
7499 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7500 /* r600 we have them at channel 2 of the second dword */
7501 alu.src[0].sel += (id * 2) + 1;
7502 alu.src[0].chan = 1;
7503 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7504 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
7505 alu.last = 1;
7506 r = r600_bytecode_add_alu(ctx->bc, &alu);
7507 if (r)
7508 return r;
7509 return 0;
7510 } else {
7511 struct r600_bytecode_vtx vtx;
7512 memset(&vtx, 0, sizeof(vtx));
7513 vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
7514 vtx.buffer_id = id + eg_buffer_base;
7515 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7516 vtx.src_gpr = 0;
7517 vtx.mega_fetch_count = 16; /* no idea here really... */
7518 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7519 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7520 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */
7521 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */
7522 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7; /* SEL_W */
7523 vtx.data_format = FMT_32_32_32_32;
7524 vtx.buffer_index_mode = sampler_index_mode;
7525
7526 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7527 return r;
7528 return 0;
7529 }
7530 }
7531
7532
7533 static int tgsi_tex(struct r600_shader_ctx *ctx)
7534 {
7535 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7536 struct r600_bytecode_tex tex;
7537 struct r600_bytecode_tex grad_offs[3];
7538 struct r600_bytecode_alu alu;
7539 unsigned src_gpr;
7540 int r, i, j, n_grad_offs = 0;
7541 int opcode;
7542 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7543 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7544 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7545 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7546
7547 bool txf_add_offsets = inst->Texture.NumOffsets &&
7548 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7549 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7550
7551 /* Texture fetch instructions can only use gprs as source.
7552 * Also they cannot negate the source or take the absolute value */
7553 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7554 tgsi_tex_src_requires_loading(ctx, 0)) ||
7555 read_compressed_msaa || txf_add_offsets;
7556
7557 boolean src_loaded = FALSE;
7558 unsigned sampler_src_reg = 1;
7559 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7560 boolean has_txq_cube_array_z = false;
7561 unsigned sampler_index_mode;
7562 int array_index_offset_channel = -1;
7563
7564 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7565 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7566 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7567 if (inst->Dst[0].Register.WriteMask & 4) {
7568 ctx->shader->has_txq_cube_array_z_comp = true;
7569 has_txq_cube_array_z = true;
7570 }
7571
7572 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7573 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7574 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7575 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7576 sampler_src_reg = 2;
7577
7578 /* TGSI moves the sampler to src reg 3 for TXD */
7579 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7580 sampler_src_reg = 3;
7581
7582 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7583
7584 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7585
7586 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7587 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7588 if (ctx->bc->chip_class < EVERGREEN)
7589 ctx->shader->uses_tex_buffers = true;
7590 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7591 }
7592 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7593 if (ctx->bc->chip_class < EVERGREEN)
7594 ctx->shader->uses_tex_buffers = true;
7595 return do_vtx_fetch_inst(ctx, src_requires_loading);
7596 }
7597 }
7598
7599 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7600 int out_chan;
7601 /* Add perspective divide */
7602 if (ctx->bc->chip_class == CAYMAN) {
7603 out_chan = 2;
7604 for (i = 0; i < 3; i++) {
7605 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7606 alu.op = ALU_OP1_RECIP_IEEE;
7607 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7608
7609 alu.dst.sel = ctx->temp_reg;
7610 alu.dst.chan = i;
7611 if (i == 2)
7612 alu.last = 1;
7613 if (out_chan == i)
7614 alu.dst.write = 1;
7615 r = r600_bytecode_add_alu(ctx->bc, &alu);
7616 if (r)
7617 return r;
7618 }
7619
7620 } else {
7621 out_chan = 3;
7622 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7623 alu.op = ALU_OP1_RECIP_IEEE;
7624 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7625
7626 alu.dst.sel = ctx->temp_reg;
7627 alu.dst.chan = out_chan;
7628 alu.last = 1;
7629 alu.dst.write = 1;
7630 r = r600_bytecode_add_alu(ctx->bc, &alu);
7631 if (r)
7632 return r;
7633 }
7634
7635 for (i = 0; i < 3; i++) {
7636 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7637 alu.op = ALU_OP2_MUL;
7638 alu.src[0].sel = ctx->temp_reg;
7639 alu.src[0].chan = out_chan;
7640 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7641 alu.dst.sel = ctx->temp_reg;
7642 alu.dst.chan = i;
7643 alu.dst.write = 1;
7644 r = r600_bytecode_add_alu(ctx->bc, &alu);
7645 if (r)
7646 return r;
7647 }
7648 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7649 alu.op = ALU_OP1_MOV;
7650 alu.src[0].sel = V_SQ_ALU_SRC_1;
7651 alu.src[0].chan = 0;
7652 alu.dst.sel = ctx->temp_reg;
7653 alu.dst.chan = 3;
7654 alu.last = 1;
7655 alu.dst.write = 1;
7656 r = r600_bytecode_add_alu(ctx->bc, &alu);
7657 if (r)
7658 return r;
7659 src_loaded = TRUE;
7660 src_gpr = ctx->temp_reg;
7661 }
7662
7663
7664 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7665 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7666 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7667 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7668 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7669
7670 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7671 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7672
7673 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7674 for (i = 0; i < 4; i++) {
7675 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7676 alu.op = ALU_OP2_CUBE;
7677 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7678 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7679 alu.dst.sel = ctx->temp_reg;
7680 alu.dst.chan = i;
7681 if (i == 3)
7682 alu.last = 1;
7683 alu.dst.write = 1;
7684 r = r600_bytecode_add_alu(ctx->bc, &alu);
7685 if (r)
7686 return r;
7687 }
7688
7689 /* tmp1.z = RCP_e(|tmp1.z|) */
7690 if (ctx->bc->chip_class == CAYMAN) {
7691 for (i = 0; i < 3; i++) {
7692 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7693 alu.op = ALU_OP1_RECIP_IEEE;
7694 alu.src[0].sel = ctx->temp_reg;
7695 alu.src[0].chan = 2;
7696 alu.src[0].abs = 1;
7697 alu.dst.sel = ctx->temp_reg;
7698 alu.dst.chan = i;
7699 if (i == 2)
7700 alu.dst.write = 1;
7701 if (i == 2)
7702 alu.last = 1;
7703 r = r600_bytecode_add_alu(ctx->bc, &alu);
7704 if (r)
7705 return r;
7706 }
7707 } else {
7708 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7709 alu.op = ALU_OP1_RECIP_IEEE;
7710 alu.src[0].sel = ctx->temp_reg;
7711 alu.src[0].chan = 2;
7712 alu.src[0].abs = 1;
7713 alu.dst.sel = ctx->temp_reg;
7714 alu.dst.chan = 2;
7715 alu.dst.write = 1;
7716 alu.last = 1;
7717 r = r600_bytecode_add_alu(ctx->bc, &alu);
7718 if (r)
7719 return r;
7720 }
7721
7722 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7723 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7724 * muladd has no writemask, have to use another temp
7725 */
7726 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7727 alu.op = ALU_OP3_MULADD;
7728 alu.is_op3 = 1;
7729
7730 alu.src[0].sel = ctx->temp_reg;
7731 alu.src[0].chan = 0;
7732 alu.src[1].sel = ctx->temp_reg;
7733 alu.src[1].chan = 2;
7734
7735 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7736 alu.src[2].chan = 0;
7737 alu.src[2].value = u_bitcast_f2u(1.5f);
7738
7739 alu.dst.sel = ctx->temp_reg;
7740 alu.dst.chan = 0;
7741 alu.dst.write = 1;
7742
7743 r = r600_bytecode_add_alu(ctx->bc, &alu);
7744 if (r)
7745 return r;
7746
7747 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7748 alu.op = ALU_OP3_MULADD;
7749 alu.is_op3 = 1;
7750
7751 alu.src[0].sel = ctx->temp_reg;
7752 alu.src[0].chan = 1;
7753 alu.src[1].sel = ctx->temp_reg;
7754 alu.src[1].chan = 2;
7755
7756 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7757 alu.src[2].chan = 0;
7758 alu.src[2].value = u_bitcast_f2u(1.5f);
7759
7760 alu.dst.sel = ctx->temp_reg;
7761 alu.dst.chan = 1;
7762 alu.dst.write = 1;
7763
7764 alu.last = 1;
7765 r = r600_bytecode_add_alu(ctx->bc, &alu);
7766 if (r)
7767 return r;
7768 /* write initial compare value into Z component
7769 - W src 0 for shadow cube
7770 - X src 1 for shadow cube array */
7771 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7772 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7773 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7774 alu.op = ALU_OP1_MOV;
7775 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7776 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7777 else
7778 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7779 alu.dst.sel = ctx->temp_reg;
7780 alu.dst.chan = 2;
7781 alu.dst.write = 1;
7782 alu.last = 1;
7783 r = r600_bytecode_add_alu(ctx->bc, &alu);
7784 if (r)
7785 return r;
7786 }
7787
7788 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7789 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7790 if (ctx->bc->chip_class >= EVERGREEN) {
7791 int mytmp = r600_get_temp(ctx);
7792 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7793 alu.op = ALU_OP1_MOV;
7794 alu.src[0].sel = ctx->temp_reg;
7795 alu.src[0].chan = 3;
7796 alu.dst.sel = mytmp;
7797 alu.dst.chan = 0;
7798 alu.dst.write = 1;
7799 alu.last = 1;
7800 r = r600_bytecode_add_alu(ctx->bc, &alu);
7801 if (r)
7802 return r;
7803
7804 /* Evaluate the array index according to floor(idx + 0.5). This
7805 * needs to be done before merging the face select value, because
7806 * otherwise the fractional part of the array index will interfere
7807 * with the face select value */
7808 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7809 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7810 alu.op = ALU_OP1_RNDNE;
7811 alu.dst.sel = ctx->temp_reg;
7812 alu.dst.chan = 3;
7813 alu.dst.write = 1;
7814 alu.last = 1;
7815 r = r600_bytecode_add_alu(ctx->bc, &alu);
7816 if (r)
7817 return r;
7818
7819 /* Because the array slice index and the cube face index are merged
7820 * into one value we have to make sure the array slice index is >= 0,
7821 * otherwise the face selection will fail */
7822 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7823 alu.op = ALU_OP2_MAX;
7824 alu.src[0].sel = ctx->temp_reg;
7825 alu.src[0].chan = 3;
7826 alu.src[1].sel = V_SQ_ALU_SRC_0;
7827 alu.dst.sel = ctx->temp_reg;
7828 alu.dst.chan = 3;
7829 alu.dst.write = 1;
7830 alu.last = 1;
7831 r = r600_bytecode_add_alu(ctx->bc, &alu);
7832 if (r)
7833 return r;
7834
7835 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7836 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7837 alu.op = ALU_OP3_MULADD;
7838 alu.is_op3 = 1;
7839 alu.src[0].sel = ctx->temp_reg;
7840 alu.src[0].chan = 3;
7841 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7842 alu.src[1].chan = 0;
7843 alu.src[1].value = u_bitcast_f2u(8.0f);
7844 alu.src[2].sel = mytmp;
7845 alu.src[2].chan = 0;
7846 alu.dst.sel = ctx->temp_reg;
7847 alu.dst.chan = 3;
7848 alu.dst.write = 1;
7849 alu.last = 1;
7850 r = r600_bytecode_add_alu(ctx->bc, &alu);
7851 if (r)
7852 return r;
7853 } else if (ctx->bc->chip_class < EVERGREEN) {
7854 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7855 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7856 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7857 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7858 tex.src_gpr = r600_get_temp(ctx);
7859 tex.src_sel_x = 0;
7860 tex.src_sel_y = 0;
7861 tex.src_sel_z = 0;
7862 tex.src_sel_w = 0;
7863 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7864 tex.coord_type_x = 1;
7865 tex.coord_type_y = 1;
7866 tex.coord_type_z = 1;
7867 tex.coord_type_w = 1;
7868 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7869 alu.op = ALU_OP1_MOV;
7870 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7871 alu.dst.sel = tex.src_gpr;
7872 alu.dst.chan = 0;
7873 alu.last = 1;
7874 alu.dst.write = 1;
7875 r = r600_bytecode_add_alu(ctx->bc, &alu);
7876 if (r)
7877 return r;
7878
7879 r = r600_bytecode_add_tex(ctx->bc, &tex);
7880 if (r)
7881 return r;
7882 }
7883
7884 }
7885
7886 /* for cube forms of lod and bias we need to route things */
7887 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7888 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7889 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7890 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7891 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7892 alu.op = ALU_OP1_MOV;
7893 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7894 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7895 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7896 else
7897 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7898 alu.dst.sel = ctx->temp_reg;
7899 alu.dst.chan = 2;
7900 alu.last = 1;
7901 alu.dst.write = 1;
7902 r = r600_bytecode_add_alu(ctx->bc, &alu);
7903 if (r)
7904 return r;
7905 }
7906
7907 src_loaded = TRUE;
7908 src_gpr = ctx->temp_reg;
7909 }
7910
7911 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7912 int temp_h = 0, temp_v = 0;
7913 int start_val = 0;
7914
7915 /* if we've already loaded the src (i.e. CUBE don't reload it). */
7916 if (src_loaded == TRUE)
7917 start_val = 1;
7918 else
7919 src_loaded = TRUE;
7920 for (i = start_val; i < 3; i++) {
7921 int treg = r600_get_temp(ctx);
7922
7923 if (i == 0)
7924 src_gpr = treg;
7925 else if (i == 1)
7926 temp_h = treg;
7927 else
7928 temp_v = treg;
7929
7930 for (j = 0; j < 4; j++) {
7931 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7932 alu.op = ALU_OP1_MOV;
7933 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7934 alu.dst.sel = treg;
7935 alu.dst.chan = j;
7936 if (j == 3)
7937 alu.last = 1;
7938 alu.dst.write = 1;
7939 r = r600_bytecode_add_alu(ctx->bc, &alu);
7940 if (r)
7941 return r;
7942 }
7943 }
7944 for (i = 1; i < 3; i++) {
7945 /* set gradients h/v */
7946 struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
7947 memset(t, 0, sizeof(struct r600_bytecode_tex));
7948 t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7949 FETCH_OP_SET_GRADIENTS_V;
7950 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7951 t->sampler_index_mode = sampler_index_mode;
7952 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
7953 t->resource_index_mode = sampler_index_mode;
7954
7955 t->src_gpr = (i == 1) ? temp_h : temp_v;
7956 t->src_sel_x = 0;
7957 t->src_sel_y = 1;
7958 t->src_sel_z = 2;
7959 t->src_sel_w = 3;
7960
7961 t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7962 t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
7963 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7964 t->coord_type_x = 1;
7965 t->coord_type_y = 1;
7966 t->coord_type_z = 1;
7967 t->coord_type_w = 1;
7968 }
7969 }
7970 }
7971
7972 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7973 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
7974 * incorrectly forces nearest filtering if the texture format is integer.
7975 * The only effect it has on Gather4, which always returns 4 texels for
7976 * bilinear filtering, is that the final coordinates are off by 0.5 of
7977 * the texel size.
7978 *
7979 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7980 * or (0.5 / size) from the normalized coordinates.
7981 */
7982 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
7983 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
7984 int treg = r600_get_temp(ctx);
7985
7986 /* mov array and comparison oordinate to temp_reg if needed */
7987 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7988 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7989 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
7990 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
7991 for (i = 2; i <= end; i++) {
7992 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7993 alu.op = ALU_OP1_MOV;
7994 alu.dst.sel = ctx->temp_reg;
7995 alu.dst.chan = i;
7996 alu.dst.write = 1;
7997 alu.last = (i == end);
7998 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7999 r = r600_bytecode_add_alu(ctx->bc, &alu);
8000 if (r)
8001 return r;
8002 }
8003 }
8004
8005 if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
8006 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
8007 for (i = 0; i < 2; i++) {
8008 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8009 alu.op = ALU_OP2_ADD;
8010 alu.dst.sel = ctx->temp_reg;
8011 alu.dst.chan = i;
8012 alu.dst.write = 1;
8013 alu.last = i == 1;
8014 if (src_loaded) {
8015 alu.src[0].sel = ctx->temp_reg;
8016 alu.src[0].chan = i;
8017 } else
8018 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8019 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8020 alu.src[1].neg = 1;
8021 r = r600_bytecode_add_alu(ctx->bc, &alu);
8022 if (r)
8023 return r;
8024 }
8025 } else {
8026 /* execute a TXQ */
8027 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8028 tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
8029 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8030 tex.sampler_index_mode = sampler_index_mode;
8031 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8032 tex.resource_index_mode = sampler_index_mode;
8033 tex.dst_gpr = treg;
8034 tex.src_sel_x = 4;
8035 tex.src_sel_y = 4;
8036 tex.src_sel_z = 4;
8037 tex.src_sel_w = 4;
8038 tex.dst_sel_x = 0;
8039 tex.dst_sel_y = 1;
8040 tex.dst_sel_z = 7;
8041 tex.dst_sel_w = 7;
8042 r = r600_bytecode_add_tex(ctx->bc, &tex);
8043 if (r)
8044 return r;
8045
8046 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
8047 if (ctx->bc->chip_class == CAYMAN) {
8048 /* */
8049 for (i = 0; i < 2; i++) {
8050 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8051 alu.op = ALU_OP1_INT_TO_FLT;
8052 alu.dst.sel = treg;
8053 alu.dst.chan = i;
8054 alu.dst.write = 1;
8055 alu.src[0].sel = treg;
8056 alu.src[0].chan = i;
8057 alu.last = (i == 1) ? 1 : 0;
8058 r = r600_bytecode_add_alu(ctx->bc, &alu);
8059 if (r)
8060 return r;
8061 }
8062 for (j = 0; j < 2; j++) {
8063 for (i = 0; i < 3; i++) {
8064 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8065 alu.op = ALU_OP1_RECIP_IEEE;
8066 alu.src[0].sel = treg;
8067 alu.src[0].chan = j;
8068 alu.dst.sel = treg;
8069 alu.dst.chan = i;
8070 if (i == 2)
8071 alu.last = 1;
8072 if (i == j)
8073 alu.dst.write = 1;
8074 r = r600_bytecode_add_alu(ctx->bc, &alu);
8075 if (r)
8076 return r;
8077 }
8078 }
8079 } else {
8080 for (i = 0; i < 2; i++) {
8081 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8082 alu.op = ALU_OP1_INT_TO_FLT;
8083 alu.dst.sel = treg;
8084 alu.dst.chan = i;
8085 alu.dst.write = 1;
8086 alu.src[0].sel = treg;
8087 alu.src[0].chan = i;
8088 alu.last = 1;
8089 r = r600_bytecode_add_alu(ctx->bc, &alu);
8090 if (r)
8091 return r;
8092 }
8093 for (i = 0; i < 2; i++) {
8094 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8095 alu.op = ALU_OP1_RECIP_IEEE;
8096 alu.src[0].sel = treg;
8097 alu.src[0].chan = i;
8098 alu.dst.sel = treg;
8099 alu.dst.chan = i;
8100 alu.last = 1;
8101 alu.dst.write = 1;
8102 r = r600_bytecode_add_alu(ctx->bc, &alu);
8103 if (r)
8104 return r;
8105 }
8106 }
8107 for (i = 0; i < 2; i++) {
8108 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8109 alu.op = ALU_OP3_MULADD;
8110 alu.is_op3 = 1;
8111 alu.dst.sel = ctx->temp_reg;
8112 alu.dst.chan = i;
8113 alu.dst.write = 1;
8114 alu.last = i == 1;
8115 alu.src[0].sel = treg;
8116 alu.src[0].chan = i;
8117 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8118 alu.src[1].neg = 1;
8119 if (src_loaded) {
8120 alu.src[2].sel = ctx->temp_reg;
8121 alu.src[2].chan = i;
8122 } else
8123 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
8124 r = r600_bytecode_add_alu(ctx->bc, &alu);
8125 if (r)
8126 return r;
8127 }
8128 }
8129 src_loaded = TRUE;
8130 src_gpr = ctx->temp_reg;
8131 }
8132 }
8133
8134 if (src_requires_loading && !src_loaded) {
8135 for (i = 0; i < 4; i++) {
8136 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8137 alu.op = ALU_OP1_MOV;
8138 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8139 alu.dst.sel = ctx->temp_reg;
8140 alu.dst.chan = i;
8141 if (i == 3)
8142 alu.last = 1;
8143 alu.dst.write = 1;
8144 r = r600_bytecode_add_alu(ctx->bc, &alu);
8145 if (r)
8146 return r;
8147 }
8148 src_loaded = TRUE;
8149 src_gpr = ctx->temp_reg;
8150 }
8151
8152 /* get offset values */
8153 if (inst->Texture.NumOffsets) {
8154 assert(inst->Texture.NumOffsets == 1);
8155
8156 /* The texture offset feature doesn't work with the TXF instruction
8157 * and must be emulated by adding the offset to the texture coordinates. */
8158 if (txf_add_offsets) {
8159 const struct tgsi_texture_offset *off = inst->TexOffsets;
8160
8161 switch (inst->Texture.Texture) {
8162 case TGSI_TEXTURE_3D:
8163 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8164 alu.op = ALU_OP2_ADD_INT;
8165 alu.src[0].sel = src_gpr;
8166 alu.src[0].chan = 2;
8167 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8168 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
8169 alu.dst.sel = src_gpr;
8170 alu.dst.chan = 2;
8171 alu.dst.write = 1;
8172 alu.last = 1;
8173 r = r600_bytecode_add_alu(ctx->bc, &alu);
8174 if (r)
8175 return r;
8176 /* fall through */
8177
8178 case TGSI_TEXTURE_2D:
8179 case TGSI_TEXTURE_SHADOW2D:
8180 case TGSI_TEXTURE_RECT:
8181 case TGSI_TEXTURE_SHADOWRECT:
8182 case TGSI_TEXTURE_2D_ARRAY:
8183 case TGSI_TEXTURE_SHADOW2D_ARRAY:
8184 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8185 alu.op = ALU_OP2_ADD_INT;
8186 alu.src[0].sel = src_gpr;
8187 alu.src[0].chan = 1;
8188 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8189 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
8190 alu.dst.sel = src_gpr;
8191 alu.dst.chan = 1;
8192 alu.dst.write = 1;
8193 alu.last = 1;
8194 r = r600_bytecode_add_alu(ctx->bc, &alu);
8195 if (r)
8196 return r;
8197 /* fall through */
8198
8199 case TGSI_TEXTURE_1D:
8200 case TGSI_TEXTURE_SHADOW1D:
8201 case TGSI_TEXTURE_1D_ARRAY:
8202 case TGSI_TEXTURE_SHADOW1D_ARRAY:
8203 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8204 alu.op = ALU_OP2_ADD_INT;
8205 alu.src[0].sel = src_gpr;
8206 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8207 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8208 alu.dst.sel = src_gpr;
8209 alu.dst.write = 1;
8210 alu.last = 1;
8211 r = r600_bytecode_add_alu(ctx->bc, &alu);
8212 if (r)
8213 return r;
8214 break;
8215 /* texture offsets do not apply to other texture targets */
8216 }
8217 } else {
8218 switch (inst->Texture.Texture) {
8219 case TGSI_TEXTURE_3D:
8220 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8221 /* fallthrough */
8222 case TGSI_TEXTURE_2D:
8223 case TGSI_TEXTURE_SHADOW2D:
8224 case TGSI_TEXTURE_RECT:
8225 case TGSI_TEXTURE_SHADOWRECT:
8226 case TGSI_TEXTURE_2D_ARRAY:
8227 case TGSI_TEXTURE_SHADOW2D_ARRAY:
8228 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8229 /* fallthrough */
8230 case TGSI_TEXTURE_1D:
8231 case TGSI_TEXTURE_SHADOW1D:
8232 case TGSI_TEXTURE_1D_ARRAY:
8233 case TGSI_TEXTURE_SHADOW1D_ARRAY:
8234 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8235 }
8236 }
8237 }
8238
8239 /* Obtain the sample index for reading a compressed MSAA color texture.
8240 * To read the FMASK, we use the ldfptr instruction, which tells us
8241 * where the samples are stored.
8242 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8243 * which is the identity mapping. Each nibble says which physical sample
8244 * should be fetched to get that sample.
8245 *
8246 * Assume src.z contains the sample index. It should be modified like this:
8247 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
8248 * Then fetch the texel with src.
8249 */
8250 if (read_compressed_msaa) {
8251 unsigned sample_chan = 3;
8252 unsigned temp = r600_get_temp(ctx);
8253 assert(src_loaded);
8254
8255 /* temp.w = ldfptr() */
8256 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8257 tex.op = FETCH_OP_LD;
8258 tex.inst_mod = 1; /* to indicate this is ldfptr */
8259 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8260 tex.sampler_index_mode = sampler_index_mode;
8261 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8262 tex.resource_index_mode = sampler_index_mode;
8263 tex.src_gpr = src_gpr;
8264 tex.dst_gpr = temp;
8265 tex.dst_sel_x = 7; /* mask out these components */
8266 tex.dst_sel_y = 7;
8267 tex.dst_sel_z = 7;
8268 tex.dst_sel_w = 0; /* store X */
8269 tex.src_sel_x = 0;
8270 tex.src_sel_y = 1;
8271 tex.src_sel_z = 2;
8272 tex.src_sel_w = 3;
8273 tex.offset_x = offset_x;
8274 tex.offset_y = offset_y;
8275 tex.offset_z = offset_z;
8276 r = r600_bytecode_add_tex(ctx->bc, &tex);
8277 if (r)
8278 return r;
8279
8280 /* temp.x = sample_index*4 */
8281 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8282 alu.op = ALU_OP2_MULLO_INT;
8283 alu.src[0].sel = src_gpr;
8284 alu.src[0].chan = sample_chan;
8285 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8286 alu.src[1].value = 4;
8287 alu.dst.sel = temp;
8288 alu.dst.chan = 0;
8289 alu.dst.write = 1;
8290 r = emit_mul_int_op(ctx->bc, &alu);
8291 if (r)
8292 return r;
8293
8294 /* sample_index = temp.w >> temp.x */
8295 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8296 alu.op = ALU_OP2_LSHR_INT;
8297 alu.src[0].sel = temp;
8298 alu.src[0].chan = 3;
8299 alu.src[1].sel = temp;
8300 alu.src[1].chan = 0;
8301 alu.dst.sel = src_gpr;
8302 alu.dst.chan = sample_chan;
8303 alu.dst.write = 1;
8304 alu.last = 1;
8305 r = r600_bytecode_add_alu(ctx->bc, &alu);
8306 if (r)
8307 return r;
8308
8309 /* sample_index & 0xF */
8310 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8311 alu.op = ALU_OP2_AND_INT;
8312 alu.src[0].sel = src_gpr;
8313 alu.src[0].chan = sample_chan;
8314 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8315 alu.src[1].value = 0xF;
8316 alu.dst.sel = src_gpr;
8317 alu.dst.chan = sample_chan;
8318 alu.dst.write = 1;
8319 alu.last = 1;
8320 r = r600_bytecode_add_alu(ctx->bc, &alu);
8321 if (r)
8322 return r;
8323 #if 0
8324 /* visualize the FMASK */
8325 for (i = 0; i < 4; i++) {
8326 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8327 alu.op = ALU_OP1_INT_TO_FLT;
8328 alu.src[0].sel = src_gpr;
8329 alu.src[0].chan = sample_chan;
8330 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8331 alu.dst.chan = i;
8332 alu.dst.write = 1;
8333 alu.last = 1;
8334 r = r600_bytecode_add_alu(ctx->bc, &alu);
8335 if (r)
8336 return r;
8337 }
8338 return 0;
8339 #endif
8340 }
8341
8342 /* does this shader want a num layers from TXQ for a cube array? */
8343 if (has_txq_cube_array_z) {
8344 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8345
8346 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8347 alu.op = ALU_OP1_MOV;
8348
8349 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8350 if (ctx->bc->chip_class >= EVERGREEN) {
8351 /* with eg each dword is number of cubes */
8352 alu.src[0].sel += id / 4;
8353 alu.src[0].chan = id % 4;
8354 } else {
8355 /* r600 we have them at channel 2 of the second dword */
8356 alu.src[0].sel += (id * 2) + 1;
8357 alu.src[0].chan = 2;
8358 }
8359 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8360 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8361 alu.last = 1;
8362 r = r600_bytecode_add_alu(ctx->bc, &alu);
8363 if (r)
8364 return r;
8365 /* disable writemask from texture instruction */
8366 inst->Dst[0].Register.WriteMask &= ~4;
8367 }
8368
8369 opcode = ctx->inst_info->op;
8370 if (opcode == FETCH_OP_GATHER4 &&
8371 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8372 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8373 struct r600_bytecode_tex *t;
8374 opcode = FETCH_OP_GATHER4_O;
8375
8376 /* GATHER4_O/GATHER4_C_O use offset values loaded by
8377 SET_TEXTURE_OFFSETS instruction. The immediate offset values
8378 encoded in the instruction are ignored. */
8379 t = &grad_offs[n_grad_offs++];
8380 memset(t, 0, sizeof(struct r600_bytecode_tex));
8381 t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
8382 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8383 t->sampler_index_mode = sampler_index_mode;
8384 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
8385 t->resource_index_mode = sampler_index_mode;
8386
8387 t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8388 t->src_sel_x = inst->TexOffsets[0].SwizzleX;
8389 t->src_sel_y = inst->TexOffsets[0].SwizzleY;
8390 if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8391 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
8392 /* make sure array index selector is 0, this is just a safety
8393 * precausion because TGSI seems to emit something strange here */
8394 t->src_sel_z = 4;
8395 else
8396 t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
8397
8398 t->src_sel_w = 4;
8399
8400 t->dst_sel_x = 7;
8401 t->dst_sel_y = 7;
8402 t->dst_sel_z = 7;
8403 t->dst_sel_w = 7;
8404 }
8405
8406 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8407 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8408 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8409 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8410 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8411 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8412 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8413 switch (opcode) {
8414 case FETCH_OP_SAMPLE:
8415 opcode = FETCH_OP_SAMPLE_C;
8416 break;
8417 case FETCH_OP_SAMPLE_L:
8418 opcode = FETCH_OP_SAMPLE_C_L;
8419 break;
8420 case FETCH_OP_SAMPLE_LB:
8421 opcode = FETCH_OP_SAMPLE_C_LB;
8422 break;
8423 case FETCH_OP_SAMPLE_G:
8424 opcode = FETCH_OP_SAMPLE_C_G;
8425 break;
8426 /* Texture gather variants */
8427 case FETCH_OP_GATHER4:
8428 opcode = FETCH_OP_GATHER4_C;
8429 break;
8430 case FETCH_OP_GATHER4_O:
8431 opcode = FETCH_OP_GATHER4_C_O;
8432 break;
8433 }
8434 }
8435
8436 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8437 tex.op = opcode;
8438
8439 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8440 tex.sampler_index_mode = sampler_index_mode;
8441 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8442 tex.resource_index_mode = sampler_index_mode;
8443 tex.src_gpr = src_gpr;
8444 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8445
8446 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8447 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8448 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8449 }
8450
8451 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8452 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8453 tex.inst_mod = texture_component_select;
8454
8455 if (ctx->bc->chip_class == CAYMAN) {
8456 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8457 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8458 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8459 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8460 } else {
8461 /* GATHER4 result order is different from TGSI TG4 */
8462 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8463 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8464 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8465 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8466 }
8467 }
8468 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8469 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8470 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8471 tex.dst_sel_z = 7;
8472 tex.dst_sel_w = 7;
8473 }
8474 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8475 tex.dst_sel_x = 3;
8476 tex.dst_sel_y = 7;
8477 tex.dst_sel_z = 7;
8478 tex.dst_sel_w = 7;
8479 }
8480 else {
8481 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8482 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8483 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8484 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8485 }
8486
8487
8488 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8489 tex.src_sel_x = 4;
8490 tex.src_sel_y = 4;
8491 tex.src_sel_z = 4;
8492 tex.src_sel_w = 4;
8493 } else if (src_loaded) {
8494 tex.src_sel_x = 0;
8495 tex.src_sel_y = 1;
8496 tex.src_sel_z = 2;
8497 tex.src_sel_w = 3;
8498 } else {
8499 tex.src_sel_x = ctx->src[0].swizzle[0];
8500 tex.src_sel_y = ctx->src[0].swizzle[1];
8501 tex.src_sel_z = ctx->src[0].swizzle[2];
8502 tex.src_sel_w = ctx->src[0].swizzle[3];
8503 tex.src_rel = ctx->src[0].rel;
8504 }
8505
8506 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8507 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8508 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8509 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8510 tex.src_sel_x = 1;
8511 tex.src_sel_y = 0;
8512 tex.src_sel_z = 3;
8513 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8514 }
8515
8516 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8517 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8518 tex.coord_type_x = 1;
8519 tex.coord_type_y = 1;
8520 }
8521 tex.coord_type_z = 1;
8522 tex.coord_type_w = 1;
8523
8524 tex.offset_x = offset_x;
8525 tex.offset_y = offset_y;
8526 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8527 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8528 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8529 tex.offset_z = 0;
8530 }
8531 else {
8532 tex.offset_z = offset_z;
8533 }
8534
8535 /* Put the depth for comparison in W.
8536 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8537 * Some instructions expect the depth in Z. */
8538 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8539 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8540 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8541 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8542 opcode != FETCH_OP_SAMPLE_C_L &&
8543 opcode != FETCH_OP_SAMPLE_C_LB) {
8544 tex.src_sel_w = tex.src_sel_z;
8545 }
8546
8547 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8548 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8549 if (opcode == FETCH_OP_SAMPLE_C_L ||
8550 opcode == FETCH_OP_SAMPLE_C_LB) {
8551 /* the array index is read from Y */
8552 tex.coord_type_y = 0;
8553 array_index_offset_channel = tex.src_sel_y;
8554 } else {
8555 /* the array index is read from Z */
8556 tex.coord_type_z = 0;
8557 tex.src_sel_z = tex.src_sel_y;
8558 array_index_offset_channel = tex.src_sel_z;
8559 }
8560 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8561 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
8562 tex.coord_type_z = 0;
8563 array_index_offset_channel = tex.src_sel_z;
8564 } else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8565 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8566 (ctx->bc->chip_class >= EVERGREEN))
8567 /* the array index is read from Z, coordinate will be corrected elsewhere */
8568 tex.coord_type_z = 0;
8569
8570 /* We have array access to 1D or 2D ARRAY, the coordinates are not int ->
8571 * evaluate the array index */
8572 if (array_index_offset_channel >= 0 &&
8573 opcode != FETCH_OP_LD &&
8574 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
8575 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8576 alu.src[0].sel = tex.src_gpr;
8577 alu.src[0].chan = array_index_offset_channel;
8578 alu.src[0].rel = tex.src_rel;
8579 alu.op = ALU_OP1_RNDNE;
8580 alu.dst.sel = tex.src_gpr;
8581 alu.dst.chan = array_index_offset_channel;
8582 alu.dst.rel = tex.src_rel;
8583 alu.dst.write = 1;
8584 alu.last = 1;
8585 r = r600_bytecode_add_alu(ctx->bc, &alu);
8586 if (r)
8587 return r;
8588 }
8589
8590 /* mask unused source components */
8591 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8592 switch (inst->Texture.Texture) {
8593 case TGSI_TEXTURE_2D:
8594 case TGSI_TEXTURE_RECT:
8595 tex.src_sel_z = 7;
8596 tex.src_sel_w = 7;
8597 break;
8598 case TGSI_TEXTURE_1D_ARRAY:
8599 tex.src_sel_y = 7;
8600 tex.src_sel_w = 7;
8601 break;
8602 case TGSI_TEXTURE_1D:
8603 tex.src_sel_y = 7;
8604 tex.src_sel_z = 7;
8605 tex.src_sel_w = 7;
8606 break;
8607 }
8608 }
8609
8610 /* Emit set gradient and offset instructions. */
8611 for (i = 0; i < n_grad_offs; ++i) {
8612 r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
8613 if (r)
8614 return r;
8615 }
8616
8617 r = r600_bytecode_add_tex(ctx->bc, &tex);
8618 if (r)
8619 return r;
8620
8621 /* add shadow ambient support - gallium doesn't do it yet */
8622 return 0;
8623 }
8624
8625 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8626 struct tgsi_full_src_register *src)
8627 {
8628 unsigned i;
8629
8630 if (src->Register.Indirect) {
8631 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8632 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
8633 return ctx->shader->atomics[i].hw_idx;
8634 }
8635 } else {
8636 uint32_t index = src->Register.Index;
8637 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8638 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8639 continue;
8640 if (index > ctx->shader->atomics[i].end)
8641 continue;
8642 if (index < ctx->shader->atomics[i].start)
8643 continue;
8644 uint32_t offset = (index - ctx->shader->atomics[i].start);
8645 return ctx->shader->atomics[i].hw_idx + offset;
8646 }
8647 }
8648 assert(0);
8649 return -1;
8650 }
8651
8652 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
8653 int *uav_id_p, int *uav_index_mode_p)
8654 {
8655 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8656 int uav_id, uav_index_mode = 0;
8657 int r;
8658 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8659
8660 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
8661
8662 if (inst->Src[0].Register.Indirect) {
8663 if (is_cm) {
8664 struct r600_bytecode_alu alu;
8665 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8666 alu.op = ALU_OP2_LSHL_INT;
8667 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
8668 alu.src[0].chan = 0;
8669 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8670 alu.src[1].value = 2;
8671 alu.dst.sel = ctx->temp_reg;
8672 alu.dst.chan = 0;
8673 alu.dst.write = 1;
8674 alu.last = 1;
8675 r = r600_bytecode_add_alu(ctx->bc, &alu);
8676 if (r)
8677 return r;
8678
8679 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8680 ctx->temp_reg, 0,
8681 ctx->temp_reg, 0,
8682 V_SQ_ALU_SRC_LITERAL, uav_id * 4);
8683 if (r)
8684 return r;
8685 } else
8686 uav_index_mode = 2;
8687 } else if (is_cm) {
8688 r = single_alu_op2(ctx, ALU_OP1_MOV,
8689 ctx->temp_reg, 0,
8690 V_SQ_ALU_SRC_LITERAL, uav_id * 4,
8691 0, 0);
8692 if (r)
8693 return r;
8694 }
8695 *uav_id_p = uav_id;
8696 *uav_index_mode_p = uav_index_mode;
8697 return 0;
8698 }
8699
8700 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
8701 {
8702 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8703 int r;
8704 struct r600_bytecode_gds gds;
8705 int uav_id = 0;
8706 int uav_index_mode = 0;
8707 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8708
8709 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8710 if (r)
8711 return r;
8712
8713 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8714 gds.op = FETCH_OP_GDS_READ_RET;
8715 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8716 gds.uav_id = is_cm ? 0 : uav_id;
8717 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8718 gds.src_gpr = ctx->temp_reg;
8719 gds.src_sel_x = (is_cm) ? 0 : 4;
8720 gds.src_sel_y = 4;
8721 gds.src_sel_z = 4;
8722 gds.dst_sel_x = 0;
8723 gds.dst_sel_y = 7;
8724 gds.dst_sel_z = 7;
8725 gds.dst_sel_w = 7;
8726 gds.src_gpr2 = 0;
8727 gds.alloc_consume = !is_cm;
8728 r = r600_bytecode_add_gds(ctx->bc, &gds);
8729 if (r)
8730 return r;
8731
8732 ctx->bc->cf_last->vpm = 1;
8733 return 0;
8734 }
8735
8736 /* this fixes up 1D arrays properly */
8737 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8738 {
8739 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8740 int r, i;
8741 struct r600_bytecode_alu alu;
8742 int temp_reg = r600_get_temp(ctx);
8743
8744 for (i = 0; i < 4; i++) {
8745 bool def_val = true, write_zero = false;
8746 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8747 alu.op = ALU_OP1_MOV;
8748 alu.dst.sel = temp_reg;
8749 alu.dst.chan = i;
8750
8751 switch (inst->Memory.Texture) {
8752 case TGSI_TEXTURE_BUFFER:
8753 case TGSI_TEXTURE_1D:
8754 if (i == 1 || i == 2 || i == 3) {
8755 write_zero = true;
8756 }
8757 break;
8758 case TGSI_TEXTURE_1D_ARRAY:
8759 if (i == 1 || i == 3)
8760 write_zero = true;
8761 else if (i == 2) {
8762 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8763 def_val = false;
8764 }
8765 break;
8766 case TGSI_TEXTURE_2D:
8767 if (i == 2 || i == 3)
8768 write_zero = true;
8769 break;
8770 default:
8771 if (i == 3)
8772 write_zero = true;
8773 break;
8774 }
8775
8776 if (write_zero) {
8777 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8778 alu.src[0].value = 0;
8779 } else if (def_val) {
8780 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8781 }
8782
8783 if (i == 3)
8784 alu.last = 1;
8785 alu.dst.write = 1;
8786 r = r600_bytecode_add_alu(ctx->bc, &alu);
8787 if (r)
8788 return r;
8789 }
8790 *idx_gpr = temp_reg;
8791 return 0;
8792 }
8793
8794 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8795 int temp_reg)
8796 {
8797 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8798 int r;
8799 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8800 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8801 r = single_alu_op2(ctx, ALU_OP1_MOV,
8802 temp_reg, 0,
8803 V_SQ_ALU_SRC_LITERAL, value >> 2,
8804 0, 0);
8805 if (r)
8806 return r;
8807 } else {
8808 struct r600_bytecode_alu alu;
8809 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8810 alu.op = ALU_OP2_LSHR_INT;
8811 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8812 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8813 alu.src[1].value = 2;
8814 alu.dst.sel = temp_reg;
8815 alu.dst.write = 1;
8816 alu.last = 1;
8817 r = r600_bytecode_add_alu(ctx->bc, &alu);
8818 if (r)
8819 return r;
8820 }
8821 return 0;
8822 }
8823
8824 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8825 {
8826 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8827 /* have to work out the offset into the RAT immediate return buffer */
8828 struct r600_bytecode_vtx vtx;
8829 struct r600_bytecode_cf *cf;
8830 int r;
8831 int temp_reg = r600_get_temp(ctx);
8832 unsigned rat_index_mode;
8833 unsigned base;
8834
8835 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8836 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8837
8838 r = load_buffer_coord(ctx, 1, temp_reg);
8839 if (r)
8840 return r;
8841 ctx->bc->cf_last->barrier = 1;
8842 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8843 vtx.op = FETCH_OP_VFETCH;
8844 vtx.buffer_id = inst->Src[0].Register.Index + base;
8845 vtx.buffer_index_mode = rat_index_mode;
8846 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8847 vtx.src_gpr = temp_reg;
8848 vtx.src_sel_x = 0;
8849 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8850 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
8851 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
8852 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
8853 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
8854 vtx.num_format_all = 1;
8855 vtx.format_comp_all = 1;
8856 vtx.srf_mode_all = 0;
8857
8858 if (inst->Dst[0].Register.WriteMask & 8) {
8859 vtx.data_format = FMT_32_32_32_32;
8860 vtx.use_const_fields = 0;
8861 } else if (inst->Dst[0].Register.WriteMask & 4) {
8862 vtx.data_format = FMT_32_32_32;
8863 vtx.use_const_fields = 0;
8864 } else if (inst->Dst[0].Register.WriteMask & 2) {
8865 vtx.data_format = FMT_32_32;
8866 vtx.use_const_fields = 0;
8867 } else {
8868 vtx.data_format = FMT_32;
8869 vtx.use_const_fields = 0;
8870 }
8871
8872 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8873 if (r)
8874 return r;
8875 cf = ctx->bc->cf_last;
8876 cf->barrier = 1;
8877 return 0;
8878 }
8879
8880 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8881 {
8882 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8883 /* have to work out the offset into the RAT immediate return buffer */
8884 struct r600_bytecode_vtx vtx;
8885 struct r600_bytecode_cf *cf;
8886 int r;
8887 int idx_gpr;
8888 unsigned format, num_format, format_comp, endian;
8889 const struct util_format_description *desc;
8890 unsigned rat_index_mode;
8891 unsigned immed_base;
8892
8893 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8894
8895 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8896 r = load_index_src(ctx, 1, &idx_gpr);
8897 if (r)
8898 return r;
8899
8900 if (rat_index_mode)
8901 egcm_load_index_reg(ctx->bc, 1, false);
8902
8903 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8904 cf = ctx->bc->cf_last;
8905
8906 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8907 cf->rat.inst = V_RAT_INST_NOP_RTN;
8908 cf->rat.index_mode = rat_index_mode;
8909 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8910 cf->output.gpr = ctx->thread_id_gpr;
8911 cf->output.index_gpr = idx_gpr;
8912 cf->output.comp_mask = 0xf;
8913 cf->output.burst_count = 1;
8914 cf->vpm = 1;
8915 cf->barrier = 1;
8916 cf->mark = 1;
8917 cf->output.elem_size = 0;
8918
8919 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8920 cf = ctx->bc->cf_last;
8921 cf->barrier = 1;
8922
8923 desc = util_format_description(inst->Memory.Format);
8924 r600_vertex_data_type(inst->Memory.Format,
8925 &format, &num_format, &format_comp, &endian);
8926 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8927 vtx.op = FETCH_OP_VFETCH;
8928 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8929 vtx.buffer_index_mode = rat_index_mode;
8930 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8931 vtx.src_gpr = ctx->thread_id_gpr;
8932 vtx.src_sel_x = 1;
8933 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8934 vtx.dst_sel_x = desc->swizzle[0];
8935 vtx.dst_sel_y = desc->swizzle[1];
8936 vtx.dst_sel_z = desc->swizzle[2];
8937 vtx.dst_sel_w = desc->swizzle[3];
8938 vtx.srf_mode_all = 1;
8939 vtx.data_format = format;
8940 vtx.num_format_all = num_format;
8941 vtx.format_comp_all = format_comp;
8942 vtx.endian = endian;
8943 vtx.offset = 0;
8944 vtx.mega_fetch_count = 3;
8945 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8946 if (r)
8947 return r;
8948 cf = ctx->bc->cf_last;
8949 cf->barrier = 1;
8950 return 0;
8951 }
8952
8953 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8954 {
8955 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8956 struct r600_bytecode_alu alu;
8957 int r;
8958 int temp_reg = r600_get_temp(ctx);
8959
8960 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8961 alu.op = ALU_OP1_MOV;
8962 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8963 alu.dst.sel = temp_reg;
8964 alu.dst.write = 1;
8965 alu.last = 1;
8966 r = r600_bytecode_add_alu(ctx->bc, &alu);
8967 if (r)
8968 return r;
8969
8970 r = do_lds_fetch_values(ctx, temp_reg,
8971 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8972 if (r)
8973 return r;
8974 return 0;
8975 }
8976
8977 static int tgsi_load(struct r600_shader_ctx *ctx)
8978 {
8979 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8980 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8981 return tgsi_load_rat(ctx);
8982 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8983 return tgsi_load_gds(ctx);
8984 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8985 return tgsi_load_buffer(ctx);
8986 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8987 return tgsi_load_lds(ctx);
8988 return 0;
8989 }
8990
8991 static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
8992 {
8993 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8994 struct r600_bytecode_cf *cf;
8995 int r, i;
8996 unsigned rat_index_mode;
8997 int lasti;
8998 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);
8999
9000 r = load_buffer_coord(ctx, 0, treg2);
9001 if (r)
9002 return r;
9003
9004 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9005 if (rat_index_mode)
9006 egcm_load_index_reg(ctx->bc, 1, false);
9007
9008 for (i = 0; i <= 3; i++) {
9009 struct r600_bytecode_alu alu;
9010 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9011 alu.op = ALU_OP1_MOV;
9012 alu.dst.sel = temp_reg;
9013 alu.dst.chan = i;
9014 alu.src[0].sel = V_SQ_ALU_SRC_0;
9015 alu.last = (i == 3);
9016 alu.dst.write = 1;
9017 r = r600_bytecode_add_alu(ctx->bc, &alu);
9018 if (r)
9019 return r;
9020 }
9021
9022 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9023 for (i = 0; i <= lasti; i++) {
9024 struct r600_bytecode_alu alu;
9025 if (!((1 << i) & inst->Dst[0].Register.WriteMask))
9026 continue;
9027
9028 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
9029 temp_reg, 0,
9030 treg2, 0,
9031 V_SQ_ALU_SRC_LITERAL, i);
9032 if (r)
9033 return r;
9034
9035 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9036 alu.op = ALU_OP1_MOV;
9037 alu.dst.sel = ctx->temp_reg;
9038 alu.dst.chan = 0;
9039
9040 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9041 alu.last = 1;
9042 alu.dst.write = 1;
9043 r = r600_bytecode_add_alu(ctx->bc, &alu);
9044 if (r)
9045 return r;
9046
9047 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9048 cf = ctx->bc->cf_last;
9049
9050 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
9051 cf->rat.inst = V_RAT_INST_STORE_TYPED;
9052 cf->rat.index_mode = rat_index_mode;
9053 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
9054 cf->output.gpr = ctx->temp_reg;
9055 cf->output.index_gpr = temp_reg;
9056 cf->output.comp_mask = 1;
9057 cf->output.burst_count = 1;
9058 cf->vpm = 1;
9059 cf->barrier = 1;
9060 cf->output.elem_size = 0;
9061 }
9062 return 0;
9063 }
9064
9065 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
9066 {
9067 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9068 struct r600_bytecode_cf *cf;
9069 bool src_requires_loading = false;
9070 int val_gpr, idx_gpr;
9071 int r, i;
9072 unsigned rat_index_mode;
9073
9074 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9075
9076 r = load_index_src(ctx, 0, &idx_gpr);
9077 if (r)
9078 return r;
9079
9080 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
9081 src_requires_loading = true;
9082
9083 if (src_requires_loading) {
9084 struct r600_bytecode_alu alu;
9085 for (i = 0; i < 4; i++) {
9086 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9087 alu.op = ALU_OP1_MOV;
9088 alu.dst.sel = ctx->temp_reg;
9089 alu.dst.chan = i;
9090
9091 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9092 if (i == 3)
9093 alu.last = 1;
9094 alu.dst.write = 1;
9095 r = r600_bytecode_add_alu(ctx->bc, &alu);
9096 if (r)
9097 return r;
9098 }
9099 val_gpr = ctx->temp_reg;
9100 } else
9101 val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
9102 if (rat_index_mode)
9103 egcm_load_index_reg(ctx->bc, 1, false);
9104
9105 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9106 cf = ctx->bc->cf_last;
9107
9108 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
9109 cf->rat.inst = V_RAT_INST_STORE_TYPED;
9110 cf->rat.index_mode = rat_index_mode;
9111 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
9112 cf->output.gpr = val_gpr;
9113 cf->output.index_gpr = idx_gpr;
9114 cf->output.comp_mask = 0xf;
9115 cf->output.burst_count = 1;
9116 cf->vpm = 1;
9117 cf->barrier = 1;
9118 cf->output.elem_size = 0;
9119 return 0;
9120 }
9121
9122 static int tgsi_store_lds(struct r600_shader_ctx *ctx)
9123 {
9124 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9125 struct r600_bytecode_alu alu;
9126 int r, i, lasti;
9127 int write_mask = inst->Dst[0].Register.WriteMask;
9128 int temp_reg = r600_get_temp(ctx);
9129
9130 /* LDS write */
9131 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9132 alu.op = ALU_OP1_MOV;
9133 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9134 alu.dst.sel = temp_reg;
9135 alu.dst.write = 1;
9136 alu.last = 1;
9137 r = r600_bytecode_add_alu(ctx->bc, &alu);
9138 if (r)
9139 return r;
9140
9141 lasti = tgsi_last_instruction(write_mask);
9142 for (i = 1; i <= lasti; i++) {
9143 if (!(write_mask & (1 << i)))
9144 continue;
9145 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
9146 temp_reg, i,
9147 temp_reg, 0,
9148 V_SQ_ALU_SRC_LITERAL, 4 * i);
9149 if (r)
9150 return r;
9151 }
9152 for (i = 0; i <= lasti; i++) {
9153 if (!(write_mask & (1 << i)))
9154 continue;
9155
9156 if ((i == 0 && ((write_mask & 3) == 3)) ||
9157 (i == 2 && ((write_mask & 0xc) == 0xc))) {
9158 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9159 alu.op = LDS_OP3_LDS_WRITE_REL;
9160
9161 alu.src[0].sel = temp_reg;
9162 alu.src[0].chan = i;
9163 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9164 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
9165 alu.last = 1;
9166 alu.is_lds_idx_op = true;
9167 alu.lds_idx = 1;
9168 r = r600_bytecode_add_alu(ctx->bc, &alu);
9169 if (r)
9170 return r;
9171 i += 1;
9172 continue;
9173 }
9174 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9175 alu.op = LDS_OP2_LDS_WRITE;
9176
9177 alu.src[0].sel = temp_reg;
9178 alu.src[0].chan = i;
9179 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9180
9181 alu.last = 1;
9182 alu.is_lds_idx_op = true;
9183
9184 r = r600_bytecode_add_alu(ctx->bc, &alu);
9185 if (r)
9186 return r;
9187 }
9188 return 0;
9189 }
9190
9191 static int tgsi_store(struct r600_shader_ctx *ctx)
9192 {
9193 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9194 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
9195 return tgsi_store_buffer_rat(ctx);
9196 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
9197 return tgsi_store_lds(ctx);
9198 else
9199 return tgsi_store_rat(ctx);
9200 }
9201
9202 static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
9203 {
9204 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9205 /* have to work out the offset into the RAT immediate return buffer */
9206 struct r600_bytecode_alu alu;
9207 struct r600_bytecode_vtx vtx;
9208 struct r600_bytecode_cf *cf;
9209 int r;
9210 int idx_gpr;
9211 unsigned format, num_format, format_comp, endian;
9212 const struct util_format_description *desc;
9213 unsigned rat_index_mode;
9214 unsigned immed_base;
9215 unsigned rat_base;
9216
9217 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
9218 rat_base = ctx->shader->rat_base;
9219
9220 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
9221 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9222 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9223
9224 r = load_buffer_coord(ctx, 1, ctx->temp_reg);
9225 if (r)
9226 return r;
9227 idx_gpr = ctx->temp_reg;
9228 } else {
9229 r = load_index_src(ctx, 1, &idx_gpr);
9230 if (r)
9231 return r;
9232 }
9233
9234 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9235
9236 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
9237 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9238 alu.op = ALU_OP1_MOV;
9239 alu.dst.sel = ctx->thread_id_gpr;
9240 alu.dst.chan = 0;
9241 alu.dst.write = 1;
9242 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9243 alu.last = 1;
9244 r = r600_bytecode_add_alu(ctx->bc, &alu);
9245 if (r)
9246 return r;
9247
9248 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9249 alu.op = ALU_OP1_MOV;
9250 alu.dst.sel = ctx->thread_id_gpr;
9251 if (ctx->bc->chip_class == CAYMAN)
9252 alu.dst.chan = 2;
9253 else
9254 alu.dst.chan = 3;
9255 alu.dst.write = 1;
9256 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9257 alu.last = 1;
9258 r = r600_bytecode_add_alu(ctx->bc, &alu);
9259 if (r)
9260 return r;
9261 } else {
9262 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9263 alu.op = ALU_OP1_MOV;
9264 alu.dst.sel = ctx->thread_id_gpr;
9265 alu.dst.chan = 0;
9266 alu.dst.write = 1;
9267 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9268 alu.last = 1;
9269 r = r600_bytecode_add_alu(ctx->bc, &alu);
9270 if (r)
9271 return r;
9272 }
9273
9274 if (rat_index_mode)
9275 egcm_load_index_reg(ctx->bc, 1, false);
9276 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9277 cf = ctx->bc->cf_last;
9278
9279 cf->rat.id = rat_base + inst->Src[0].Register.Index;
9280 cf->rat.inst = ctx->inst_info->op;
9281 cf->rat.index_mode = rat_index_mode;
9282 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
9283 cf->output.gpr = ctx->thread_id_gpr;
9284 cf->output.index_gpr = idx_gpr;
9285 cf->output.comp_mask = 0xf;
9286 cf->output.burst_count = 1;
9287 cf->vpm = 1;
9288 cf->barrier = 1;
9289 cf->mark = 1;
9290 cf->output.elem_size = 0;
9291 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
9292 cf = ctx->bc->cf_last;
9293 cf->barrier = 1;
9294 cf->cf_addr = 1;
9295
9296 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
9297 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
9298 desc = util_format_description(inst->Memory.Format);
9299 r600_vertex_data_type(inst->Memory.Format,
9300 &format, &num_format, &format_comp, &endian);
9301 vtx.dst_sel_x = desc->swizzle[0];
9302 } else {
9303 format = FMT_32;
9304 num_format = 1;
9305 format_comp = 0;
9306 endian = 0;
9307 vtx.dst_sel_x = 0;
9308 }
9309 vtx.op = FETCH_OP_VFETCH;
9310 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
9311 vtx.buffer_index_mode = rat_index_mode;
9312 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
9313 vtx.src_gpr = ctx->thread_id_gpr;
9314 vtx.src_sel_x = 1;
9315 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9316 vtx.dst_sel_y = 7;
9317 vtx.dst_sel_z = 7;
9318 vtx.dst_sel_w = 7;
9319 vtx.use_const_fields = 0;
9320 vtx.srf_mode_all = 1;
9321 vtx.data_format = format;
9322 vtx.num_format_all = num_format;
9323 vtx.format_comp_all = format_comp;
9324 vtx.endian = endian;
9325 vtx.offset = 0;
9326 vtx.mega_fetch_count = 0xf;
9327 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
9328 if (r)
9329 return r;
9330 cf = ctx->bc->cf_last;
9331 cf->vpm = 1;
9332 cf->barrier = 1;
9333 return 0;
9334 }
9335
9336 static int get_gds_op(int opcode)
9337 {
9338 switch (opcode) {
9339 case TGSI_OPCODE_ATOMUADD:
9340 return FETCH_OP_GDS_ADD_RET;
9341 case TGSI_OPCODE_ATOMAND:
9342 return FETCH_OP_GDS_AND_RET;
9343 case TGSI_OPCODE_ATOMOR:
9344 return FETCH_OP_GDS_OR_RET;
9345 case TGSI_OPCODE_ATOMXOR:
9346 return FETCH_OP_GDS_XOR_RET;
9347 case TGSI_OPCODE_ATOMUMIN:
9348 return FETCH_OP_GDS_MIN_UINT_RET;
9349 case TGSI_OPCODE_ATOMUMAX:
9350 return FETCH_OP_GDS_MAX_UINT_RET;
9351 case TGSI_OPCODE_ATOMXCHG:
9352 return FETCH_OP_GDS_XCHG_RET;
9353 case TGSI_OPCODE_ATOMCAS:
9354 return FETCH_OP_GDS_CMP_XCHG_RET;
9355 default:
9356 return -1;
9357 }
9358 }
9359
9360 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
9361 {
9362 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9363 struct r600_bytecode_gds gds;
9364 struct r600_bytecode_alu alu;
9365 int gds_op = get_gds_op(inst->Instruction.Opcode);
9366 int r;
9367 int uav_id = 0;
9368 int uav_index_mode = 0;
9369 bool is_cm = (ctx->bc->chip_class == CAYMAN);
9370
9371 if (gds_op == -1) {
9372 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
9373 return -1;
9374 }
9375
9376 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
9377 if (r)
9378 return r;
9379
9380 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
9381 if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
9382 int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
9383 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9384 alu.op = ALU_OP1_MOV;
9385 alu.dst.sel = ctx->temp_reg;
9386 alu.dst.chan = is_cm ? 2 : 1;
9387 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9388 alu.src[0].value = value;
9389 alu.last = 1;
9390 alu.dst.write = 1;
9391 r = r600_bytecode_add_alu(ctx->bc, &alu);
9392 if (r)
9393 return r;
9394 } else {
9395 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9396 alu.op = ALU_OP1_MOV;
9397 alu.dst.sel = ctx->temp_reg;
9398 alu.dst.chan = is_cm ? 2 : 1;
9399 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9400 alu.last = 1;
9401 alu.dst.write = 1;
9402 r = r600_bytecode_add_alu(ctx->bc, &alu);
9403 if (r)
9404 return r;
9405 }
9406 }
9407 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
9408 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
9409 int abs_value = abs(value);
9410 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
9411 gds_op = FETCH_OP_GDS_SUB_RET;
9412 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9413 alu.op = ALU_OP1_MOV;
9414 alu.dst.sel = ctx->temp_reg;
9415 alu.dst.chan = is_cm ? 1 : 0;
9416 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9417 alu.src[0].value = abs_value;
9418 alu.last = 1;
9419 alu.dst.write = 1;
9420 r = r600_bytecode_add_alu(ctx->bc, &alu);
9421 if (r)
9422 return r;
9423 } else {
9424 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9425 alu.op = ALU_OP1_MOV;
9426 alu.dst.sel = ctx->temp_reg;
9427 alu.dst.chan = is_cm ? 1 : 0;
9428 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9429 alu.last = 1;
9430 alu.dst.write = 1;
9431 r = r600_bytecode_add_alu(ctx->bc, &alu);
9432 if (r)
9433 return r;
9434 }
9435
9436
9437 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
9438 gds.op = gds_op;
9439 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9440 gds.uav_id = is_cm ? 0 : uav_id;
9441 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
9442 gds.src_gpr = ctx->temp_reg;
9443 gds.src_gpr2 = 0;
9444 gds.src_sel_x = is_cm ? 0 : 4;
9445 gds.src_sel_y = is_cm ? 1 : 0;
9446 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
9447 gds.src_sel_z = is_cm ? 2 : 1;
9448 else
9449 gds.src_sel_z = 7;
9450 gds.dst_sel_x = 0;
9451 gds.dst_sel_y = 7;
9452 gds.dst_sel_z = 7;
9453 gds.dst_sel_w = 7;
9454 gds.alloc_consume = !is_cm;
9455
9456 r = r600_bytecode_add_gds(ctx->bc, &gds);
9457 if (r)
9458 return r;
9459 ctx->bc->cf_last->vpm = 1;
9460 return 0;
9461 }
9462
9463 static int get_lds_op(int opcode)
9464 {
9465 switch (opcode) {
9466 case TGSI_OPCODE_ATOMUADD:
9467 return LDS_OP2_LDS_ADD_RET;
9468 case TGSI_OPCODE_ATOMAND:
9469 return LDS_OP2_LDS_AND_RET;
9470 case TGSI_OPCODE_ATOMOR:
9471 return LDS_OP2_LDS_OR_RET;
9472 case TGSI_OPCODE_ATOMXOR:
9473 return LDS_OP2_LDS_XOR_RET;
9474 case TGSI_OPCODE_ATOMUMIN:
9475 return LDS_OP2_LDS_MIN_UINT_RET;
9476 case TGSI_OPCODE_ATOMUMAX:
9477 return LDS_OP2_LDS_MAX_UINT_RET;
9478 case TGSI_OPCODE_ATOMIMIN:
9479 return LDS_OP2_LDS_MIN_INT_RET;
9480 case TGSI_OPCODE_ATOMIMAX:
9481 return LDS_OP2_LDS_MAX_INT_RET;
9482 case TGSI_OPCODE_ATOMXCHG:
9483 return LDS_OP2_LDS_XCHG_RET;
9484 case TGSI_OPCODE_ATOMCAS:
9485 return LDS_OP3_LDS_CMP_XCHG_RET;
9486 default:
9487 return -1;
9488 }
9489 }
9490
9491 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
9492 {
9493 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9494 int lds_op = get_lds_op(inst->Instruction.Opcode);
9495 int r;
9496
9497 struct r600_bytecode_alu alu;
9498 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9499 alu.op = lds_op;
9500 alu.is_lds_idx_op = true;
9501 alu.last = 1;
9502 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
9503 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
9504 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
9505 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
9506 else
9507 alu.src[2].sel = V_SQ_ALU_SRC_0;
9508 r = r600_bytecode_add_alu(ctx->bc, &alu);
9509 if (r)
9510 return r;
9511
9512 /* then read from LDS_OQ_A_POP */
9513 memset(&alu, 0, sizeof(alu));
9514
9515 alu.op = ALU_OP1_MOV;
9516 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
9517 alu.src[0].chan = 0;
9518 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
9519 alu.dst.write = 1;
9520 alu.last = 1;
9521 r = r600_bytecode_add_alu(ctx->bc, &alu);
9522 if (r)
9523 return r;
9524
9525 return 0;
9526 }
9527
9528 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9529 {
9530 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9531 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9532 return tgsi_atomic_op_rat(ctx);
9533 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9534 return tgsi_atomic_op_gds(ctx);
9535 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9536 return tgsi_atomic_op_rat(ctx);
9537 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9538 return tgsi_atomic_op_lds(ctx);
9539 return 0;
9540 }
9541
9542 static int tgsi_resq(struct r600_shader_ctx *ctx)
9543 {
9544 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9545 unsigned sampler_index_mode;
9546 struct r600_bytecode_tex tex;
9547 int r;
9548 boolean has_txq_cube_array_z = false;
9549
9550 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
9551 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
9552 if (ctx->bc->chip_class < EVERGREEN)
9553 ctx->shader->uses_tex_buffers = true;
9554 unsigned eg_buffer_base = 0;
9555 eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
9556 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9557 eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9558 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
9559 }
9560
9561 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
9562 inst->Dst[0].Register.WriteMask & 4) {
9563 ctx->shader->has_txq_cube_array_z_comp = true;
9564 has_txq_cube_array_z = true;
9565 }
9566
9567 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9568 if (sampler_index_mode)
9569 egcm_load_index_reg(ctx->bc, 1, false);
9570
9571
9572 /* does this shader want a num layers from TXQ for a cube array? */
9573 if (has_txq_cube_array_z) {
9574 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
9575 struct r600_bytecode_alu alu;
9576
9577 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9578 alu.op = ALU_OP1_MOV;
9579
9580 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
9581 /* with eg each dword is either number of cubes */
9582 alu.src[0].sel += id / 4;
9583 alu.src[0].chan = id % 4;
9584 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
9585 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
9586 alu.last = 1;
9587 r = r600_bytecode_add_alu(ctx->bc, &alu);
9588 if (r)
9589 return r;
9590 /* disable writemask from texture instruction */
9591 inst->Dst[0].Register.WriteMask &= ~4;
9592 }
9593 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
9594 tex.op = ctx->inst_info->op;
9595 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
9596 tex.sampler_index_mode = sampler_index_mode;
9597 tex.resource_id = tex.sampler_id;
9598 tex.resource_index_mode = sampler_index_mode;
9599 tex.src_sel_x = 4;
9600 tex.src_sel_y = 4;
9601 tex.src_sel_z = 4;
9602 tex.src_sel_w = 4;
9603 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
9604 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
9605 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
9606 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
9607 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9608 r = r600_bytecode_add_tex(ctx->bc, &tex);
9609 if (r)
9610 return r;
9611
9612 return 0;
9613 }
9614
9615 static int tgsi_lrp(struct r600_shader_ctx *ctx)
9616 {
9617 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9618 struct r600_bytecode_alu alu;
9619 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9620 struct r600_bytecode_alu_src srcs[2][4];
9621 unsigned i;
9622 int r;
9623
9624 /* optimize if it's just an equal balance */
9625 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
9626 for (i = 0; i < lasti + 1; i++) {
9627 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9628 continue;
9629
9630 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9631 alu.op = ALU_OP2_ADD;
9632 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9633 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9634 alu.omod = 3;
9635 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9636 alu.dst.chan = i;
9637 if (i == lasti) {
9638 alu.last = 1;
9639 }
9640 r = r600_bytecode_add_alu(ctx->bc, &alu);
9641 if (r)
9642 return r;
9643 }
9644 return 0;
9645 }
9646
9647 /* 1 - src0 */
9648 for (i = 0; i < lasti + 1; i++) {
9649 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9650 continue;
9651
9652 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9653 alu.op = ALU_OP2_ADD;
9654 alu.src[0].sel = V_SQ_ALU_SRC_1;
9655 alu.src[0].chan = 0;
9656 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
9657 r600_bytecode_src_toggle_neg(&alu.src[1]);
9658 alu.dst.sel = ctx->temp_reg;
9659 alu.dst.chan = i;
9660 if (i == lasti) {
9661 alu.last = 1;
9662 }
9663 alu.dst.write = 1;
9664 r = r600_bytecode_add_alu(ctx->bc, &alu);
9665 if (r)
9666 return r;
9667 }
9668
9669 /* (1 - src0) * src2 */
9670 for (i = 0; i < lasti + 1; i++) {
9671 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9672 continue;
9673
9674 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9675 alu.op = ALU_OP2_MUL;
9676 alu.src[0].sel = ctx->temp_reg;
9677 alu.src[0].chan = i;
9678 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9679 alu.dst.sel = ctx->temp_reg;
9680 alu.dst.chan = i;
9681 if (i == lasti) {
9682 alu.last = 1;
9683 }
9684 alu.dst.write = 1;
9685 r = r600_bytecode_add_alu(ctx->bc, &alu);
9686 if (r)
9687 return r;
9688 }
9689
9690 /* src0 * src1 + (1 - src0) * src2 */
9691
9692 for (i = 0; i < 2; i++) {
9693 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
9694 srcs[i], &ctx->src[i]);
9695 if (r)
9696 return r;
9697 }
9698
9699 for (i = 0; i < lasti + 1; i++) {
9700 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9701 continue;
9702
9703 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9704 alu.op = ALU_OP3_MULADD;
9705 alu.is_op3 = 1;
9706 alu.src[0] = srcs[0][i];
9707 alu.src[1] = srcs[1][i];
9708 alu.src[2].sel = ctx->temp_reg;
9709 alu.src[2].chan = i;
9710
9711 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9712 alu.dst.chan = i;
9713 if (i == lasti) {
9714 alu.last = 1;
9715 }
9716 r = r600_bytecode_add_alu(ctx->bc, &alu);
9717 if (r)
9718 return r;
9719 }
9720 return 0;
9721 }
9722
9723 static int tgsi_cmp(struct r600_shader_ctx *ctx)
9724 {
9725 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9726 struct r600_bytecode_alu alu;
9727 int i, r, j;
9728 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9729 struct r600_bytecode_alu_src srcs[3][4];
9730
9731 unsigned op;
9732
9733 if (ctx->src[0].abs && ctx->src[0].neg) {
9734 op = ALU_OP3_CNDE;
9735 ctx->src[0].abs = 0;
9736 ctx->src[0].neg = 0;
9737 } else {
9738 op = ALU_OP3_CNDGE;
9739 }
9740
9741 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
9742 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
9743 srcs[j], &ctx->src[j]);
9744 if (r)
9745 return r;
9746 }
9747
9748 for (i = 0; i < lasti + 1; i++) {
9749 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9750 continue;
9751
9752 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9753 alu.op = op;
9754 alu.src[0] = srcs[0][i];
9755 alu.src[1] = srcs[2][i];
9756 alu.src[2] = srcs[1][i];
9757
9758 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9759 alu.dst.chan = i;
9760 alu.dst.write = 1;
9761 alu.is_op3 = 1;
9762 if (i == lasti)
9763 alu.last = 1;
9764 r = r600_bytecode_add_alu(ctx->bc, &alu);
9765 if (r)
9766 return r;
9767 }
9768 return 0;
9769 }
9770
9771 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9772 {
9773 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9774 struct r600_bytecode_alu alu;
9775 int i, r;
9776 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9777
9778 for (i = 0; i < lasti + 1; i++) {
9779 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9780 continue;
9781
9782 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9783 alu.op = ALU_OP3_CNDE_INT;
9784 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9785 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9786 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9787 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9788 alu.dst.chan = i;
9789 alu.dst.write = 1;
9790 alu.is_op3 = 1;
9791 if (i == lasti)
9792 alu.last = 1;
9793 r = r600_bytecode_add_alu(ctx->bc, &alu);
9794 if (r)
9795 return r;
9796 }
9797 return 0;
9798 }
9799
9800 static int tgsi_exp(struct r600_shader_ctx *ctx)
9801 {
9802 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9803 struct r600_bytecode_alu alu;
9804 int r;
9805 unsigned i;
9806
9807 /* result.x = 2^floor(src); */
9808 if (inst->Dst[0].Register.WriteMask & 1) {
9809 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9810
9811 alu.op = ALU_OP1_FLOOR;
9812 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9813
9814 alu.dst.sel = ctx->temp_reg;
9815 alu.dst.chan = 0;
9816 alu.dst.write = 1;
9817 alu.last = 1;
9818 r = r600_bytecode_add_alu(ctx->bc, &alu);
9819 if (r)
9820 return r;
9821
9822 if (ctx->bc->chip_class == CAYMAN) {
9823 for (i = 0; i < 3; i++) {
9824 alu.op = ALU_OP1_EXP_IEEE;
9825 alu.src[0].sel = ctx->temp_reg;
9826 alu.src[0].chan = 0;
9827
9828 alu.dst.sel = ctx->temp_reg;
9829 alu.dst.chan = i;
9830 alu.dst.write = i == 0;
9831 alu.last = i == 2;
9832 r = r600_bytecode_add_alu(ctx->bc, &alu);
9833 if (r)
9834 return r;
9835 }
9836 } else {
9837 alu.op = ALU_OP1_EXP_IEEE;
9838 alu.src[0].sel = ctx->temp_reg;
9839 alu.src[0].chan = 0;
9840
9841 alu.dst.sel = ctx->temp_reg;
9842 alu.dst.chan = 0;
9843 alu.dst.write = 1;
9844 alu.last = 1;
9845 r = r600_bytecode_add_alu(ctx->bc, &alu);
9846 if (r)
9847 return r;
9848 }
9849 }
9850
9851 /* result.y = tmp - floor(tmp); */
9852 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9853 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9854
9855 alu.op = ALU_OP1_FRACT;
9856 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9857
9858 alu.dst.sel = ctx->temp_reg;
9859 #if 0
9860 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9861 if (r)
9862 return r;
9863 #endif
9864 alu.dst.write = 1;
9865 alu.dst.chan = 1;
9866
9867 alu.last = 1;
9868
9869 r = r600_bytecode_add_alu(ctx->bc, &alu);
9870 if (r)
9871 return r;
9872 }
9873
9874 /* result.z = RoughApprox2ToX(tmp);*/
9875 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9876 if (ctx->bc->chip_class == CAYMAN) {
9877 for (i = 0; i < 3; i++) {
9878 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9879 alu.op = ALU_OP1_EXP_IEEE;
9880 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9881
9882 alu.dst.sel = ctx->temp_reg;
9883 alu.dst.chan = i;
9884 if (i == 2) {
9885 alu.dst.write = 1;
9886 alu.last = 1;
9887 }
9888
9889 r = r600_bytecode_add_alu(ctx->bc, &alu);
9890 if (r)
9891 return r;
9892 }
9893 } else {
9894 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9895 alu.op = ALU_OP1_EXP_IEEE;
9896 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9897
9898 alu.dst.sel = ctx->temp_reg;
9899 alu.dst.write = 1;
9900 alu.dst.chan = 2;
9901
9902 alu.last = 1;
9903
9904 r = r600_bytecode_add_alu(ctx->bc, &alu);
9905 if (r)
9906 return r;
9907 }
9908 }
9909
9910 /* result.w = 1.0;*/
9911 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9912 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9913
9914 alu.op = ALU_OP1_MOV;
9915 alu.src[0].sel = V_SQ_ALU_SRC_1;
9916 alu.src[0].chan = 0;
9917
9918 alu.dst.sel = ctx->temp_reg;
9919 alu.dst.chan = 3;
9920 alu.dst.write = 1;
9921 alu.last = 1;
9922 r = r600_bytecode_add_alu(ctx->bc, &alu);
9923 if (r)
9924 return r;
9925 }
9926 return tgsi_helper_copy(ctx, inst);
9927 }
9928
9929 static int tgsi_log(struct r600_shader_ctx *ctx)
9930 {
9931 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9932 struct r600_bytecode_alu alu;
9933 int r;
9934 unsigned i;
9935
9936 /* result.x = floor(log2(|src|)); */
9937 if (inst->Dst[0].Register.WriteMask & 1) {
9938 if (ctx->bc->chip_class == CAYMAN) {
9939 for (i = 0; i < 3; i++) {
9940 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9941
9942 alu.op = ALU_OP1_LOG_IEEE;
9943 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9944 r600_bytecode_src_set_abs(&alu.src[0]);
9945
9946 alu.dst.sel = ctx->temp_reg;
9947 alu.dst.chan = i;
9948 if (i == 0)
9949 alu.dst.write = 1;
9950 if (i == 2)
9951 alu.last = 1;
9952 r = r600_bytecode_add_alu(ctx->bc, &alu);
9953 if (r)
9954 return r;
9955 }
9956
9957 } else {
9958 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9959
9960 alu.op = ALU_OP1_LOG_IEEE;
9961 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9962 r600_bytecode_src_set_abs(&alu.src[0]);
9963
9964 alu.dst.sel = ctx->temp_reg;
9965 alu.dst.chan = 0;
9966 alu.dst.write = 1;
9967 alu.last = 1;
9968 r = r600_bytecode_add_alu(ctx->bc, &alu);
9969 if (r)
9970 return r;
9971 }
9972
9973 alu.op = ALU_OP1_FLOOR;
9974 alu.src[0].sel = ctx->temp_reg;
9975 alu.src[0].chan = 0;
9976
9977 alu.dst.sel = ctx->temp_reg;
9978 alu.dst.chan = 0;
9979 alu.dst.write = 1;
9980 alu.last = 1;
9981
9982 r = r600_bytecode_add_alu(ctx->bc, &alu);
9983 if (r)
9984 return r;
9985 }
9986
9987 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9988 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9989
9990 if (ctx->bc->chip_class == CAYMAN) {
9991 for (i = 0; i < 3; i++) {
9992 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9993
9994 alu.op = ALU_OP1_LOG_IEEE;
9995 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9996 r600_bytecode_src_set_abs(&alu.src[0]);
9997
9998 alu.dst.sel = ctx->temp_reg;
9999 alu.dst.chan = i;
10000 if (i == 1)
10001 alu.dst.write = 1;
10002 if (i == 2)
10003 alu.last = 1;
10004
10005 r = r600_bytecode_add_alu(ctx->bc, &alu);
10006 if (r)
10007 return r;
10008 }
10009 } else {
10010 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10011
10012 alu.op = ALU_OP1_LOG_IEEE;
10013 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10014 r600_bytecode_src_set_abs(&alu.src[0]);
10015
10016 alu.dst.sel = ctx->temp_reg;
10017 alu.dst.chan = 1;
10018 alu.dst.write = 1;
10019 alu.last = 1;
10020
10021 r = r600_bytecode_add_alu(ctx->bc, &alu);
10022 if (r)
10023 return r;
10024 }
10025
10026 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10027
10028 alu.op = ALU_OP1_FLOOR;
10029 alu.src[0].sel = ctx->temp_reg;
10030 alu.src[0].chan = 1;
10031
10032 alu.dst.sel = ctx->temp_reg;
10033 alu.dst.chan = 1;
10034 alu.dst.write = 1;
10035 alu.last = 1;
10036
10037 r = r600_bytecode_add_alu(ctx->bc, &alu);
10038 if (r)
10039 return r;
10040
10041 if (ctx->bc->chip_class == CAYMAN) {
10042 for (i = 0; i < 3; i++) {
10043 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10044 alu.op = ALU_OP1_EXP_IEEE;
10045 alu.src[0].sel = ctx->temp_reg;
10046 alu.src[0].chan = 1;
10047
10048 alu.dst.sel = ctx->temp_reg;
10049 alu.dst.chan = i;
10050 if (i == 1)
10051 alu.dst.write = 1;
10052 if (i == 2)
10053 alu.last = 1;
10054
10055 r = r600_bytecode_add_alu(ctx->bc, &alu);
10056 if (r)
10057 return r;
10058 }
10059 } else {
10060 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10061 alu.op = ALU_OP1_EXP_IEEE;
10062 alu.src[0].sel = ctx->temp_reg;
10063 alu.src[0].chan = 1;
10064
10065 alu.dst.sel = ctx->temp_reg;
10066 alu.dst.chan = 1;
10067 alu.dst.write = 1;
10068 alu.last = 1;
10069
10070 r = r600_bytecode_add_alu(ctx->bc, &alu);
10071 if (r)
10072 return r;
10073 }
10074
10075 if (ctx->bc->chip_class == CAYMAN) {
10076 for (i = 0; i < 3; i++) {
10077 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10078 alu.op = ALU_OP1_RECIP_IEEE;
10079 alu.src[0].sel = ctx->temp_reg;
10080 alu.src[0].chan = 1;
10081
10082 alu.dst.sel = ctx->temp_reg;
10083 alu.dst.chan = i;
10084 if (i == 1)
10085 alu.dst.write = 1;
10086 if (i == 2)
10087 alu.last = 1;
10088
10089 r = r600_bytecode_add_alu(ctx->bc, &alu);
10090 if (r)
10091 return r;
10092 }
10093 } else {
10094 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10095 alu.op = ALU_OP1_RECIP_IEEE;
10096 alu.src[0].sel = ctx->temp_reg;
10097 alu.src[0].chan = 1;
10098
10099 alu.dst.sel = ctx->temp_reg;
10100 alu.dst.chan = 1;
10101 alu.dst.write = 1;
10102 alu.last = 1;
10103
10104 r = r600_bytecode_add_alu(ctx->bc, &alu);
10105 if (r)
10106 return r;
10107 }
10108
10109 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10110
10111 alu.op = ALU_OP2_MUL;
10112
10113 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10114 r600_bytecode_src_set_abs(&alu.src[0]);
10115
10116 alu.src[1].sel = ctx->temp_reg;
10117 alu.src[1].chan = 1;
10118
10119 alu.dst.sel = ctx->temp_reg;
10120 alu.dst.chan = 1;
10121 alu.dst.write = 1;
10122 alu.last = 1;
10123
10124 r = r600_bytecode_add_alu(ctx->bc, &alu);
10125 if (r)
10126 return r;
10127 }
10128
10129 /* result.z = log2(|src|);*/
10130 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
10131 if (ctx->bc->chip_class == CAYMAN) {
10132 for (i = 0; i < 3; i++) {
10133 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10134
10135 alu.op = ALU_OP1_LOG_IEEE;
10136 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10137 r600_bytecode_src_set_abs(&alu.src[0]);
10138
10139 alu.dst.sel = ctx->temp_reg;
10140 if (i == 2)
10141 alu.dst.write = 1;
10142 alu.dst.chan = i;
10143 if (i == 2)
10144 alu.last = 1;
10145
10146 r = r600_bytecode_add_alu(ctx->bc, &alu);
10147 if (r)
10148 return r;
10149 }
10150 } else {
10151 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10152
10153 alu.op = ALU_OP1_LOG_IEEE;
10154 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10155 r600_bytecode_src_set_abs(&alu.src[0]);
10156
10157 alu.dst.sel = ctx->temp_reg;
10158 alu.dst.write = 1;
10159 alu.dst.chan = 2;
10160 alu.last = 1;
10161
10162 r = r600_bytecode_add_alu(ctx->bc, &alu);
10163 if (r)
10164 return r;
10165 }
10166 }
10167
10168 /* result.w = 1.0; */
10169 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
10170 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10171
10172 alu.op = ALU_OP1_MOV;
10173 alu.src[0].sel = V_SQ_ALU_SRC_1;
10174 alu.src[0].chan = 0;
10175
10176 alu.dst.sel = ctx->temp_reg;
10177 alu.dst.chan = 3;
10178 alu.dst.write = 1;
10179 alu.last = 1;
10180
10181 r = r600_bytecode_add_alu(ctx->bc, &alu);
10182 if (r)
10183 return r;
10184 }
10185
10186 return tgsi_helper_copy(ctx, inst);
10187 }
10188
10189 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
10190 {
10191 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10192 struct r600_bytecode_alu alu;
10193 int r;
10194 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10195 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
10196
10197 assert(inst->Dst[0].Register.Index < 3);
10198 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10199
10200 switch (inst->Instruction.Opcode) {
10201 case TGSI_OPCODE_ARL:
10202 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
10203 break;
10204 case TGSI_OPCODE_ARR:
10205 alu.op = ALU_OP1_FLT_TO_INT;
10206 break;
10207 case TGSI_OPCODE_UARL:
10208 alu.op = ALU_OP1_MOV;
10209 break;
10210 default:
10211 assert(0);
10212 return -1;
10213 }
10214
10215 for (i = 0; i <= lasti; ++i) {
10216 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10217 continue;
10218 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10219 alu.last = i == lasti;
10220 alu.dst.sel = reg;
10221 alu.dst.chan = i;
10222 alu.dst.write = 1;
10223 r = r600_bytecode_add_alu(ctx->bc, &alu);
10224 if (r)
10225 return r;
10226 }
10227
10228 if (inst->Dst[0].Register.Index > 0)
10229 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
10230 else
10231 ctx->bc->ar_loaded = 0;
10232
10233 return 0;
10234 }
10235 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
10236 {
10237 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10238 struct r600_bytecode_alu alu;
10239 int r;
10240 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10241
10242 switch (inst->Instruction.Opcode) {
10243 case TGSI_OPCODE_ARL:
10244 memset(&alu, 0, sizeof(alu));
10245 alu.op = ALU_OP1_FLOOR;
10246 alu.dst.sel = ctx->bc->ar_reg;
10247 alu.dst.write = 1;
10248 for (i = 0; i <= lasti; ++i) {
10249 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10250 alu.dst.chan = i;
10251 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10252 alu.last = i == lasti;
10253 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10254 return r;
10255 }
10256 }
10257
10258 memset(&alu, 0, sizeof(alu));
10259 alu.op = ALU_OP1_FLT_TO_INT;
10260 alu.src[0].sel = ctx->bc->ar_reg;
10261 alu.dst.sel = ctx->bc->ar_reg;
10262 alu.dst.write = 1;
10263 /* FLT_TO_INT is trans-only on r600/r700 */
10264 alu.last = TRUE;
10265 for (i = 0; i <= lasti; ++i) {
10266 alu.dst.chan = i;
10267 alu.src[0].chan = i;
10268 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10269 return r;
10270 }
10271 break;
10272 case TGSI_OPCODE_ARR:
10273 memset(&alu, 0, sizeof(alu));
10274 alu.op = ALU_OP1_FLT_TO_INT;
10275 alu.dst.sel = ctx->bc->ar_reg;
10276 alu.dst.write = 1;
10277 /* FLT_TO_INT is trans-only on r600/r700 */
10278 alu.last = TRUE;
10279 for (i = 0; i <= lasti; ++i) {
10280 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10281 alu.dst.chan = i;
10282 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10283 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10284 return r;
10285 }
10286 }
10287 break;
10288 case TGSI_OPCODE_UARL:
10289 memset(&alu, 0, sizeof(alu));
10290 alu.op = ALU_OP1_MOV;
10291 alu.dst.sel = ctx->bc->ar_reg;
10292 alu.dst.write = 1;
10293 for (i = 0; i <= lasti; ++i) {
10294 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10295 alu.dst.chan = i;
10296 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10297 alu.last = i == lasti;
10298 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10299 return r;
10300 }
10301 }
10302 break;
10303 default:
10304 assert(0);
10305 return -1;
10306 }
10307
10308 ctx->bc->ar_loaded = 0;
10309 return 0;
10310 }
10311
10312 static int tgsi_opdst(struct r600_shader_ctx *ctx)
10313 {
10314 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10315 struct r600_bytecode_alu alu;
10316 int i, r = 0;
10317
10318 for (i = 0; i < 4; i++) {
10319 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10320
10321 alu.op = ALU_OP2_MUL;
10322 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10323
10324 if (i == 0 || i == 3) {
10325 alu.src[0].sel = V_SQ_ALU_SRC_1;
10326 } else {
10327 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10328 }
10329
10330 if (i == 0 || i == 2) {
10331 alu.src[1].sel = V_SQ_ALU_SRC_1;
10332 } else {
10333 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10334 }
10335 if (i == 3)
10336 alu.last = 1;
10337 r = r600_bytecode_add_alu(ctx->bc, &alu);
10338 if (r)
10339 return r;
10340 }
10341 return 0;
10342 }
10343
10344 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10345 struct r600_bytecode_alu_src *src)
10346 {
10347 struct r600_bytecode_alu alu;
10348 int r;
10349
10350 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10351 alu.op = opcode;
10352 alu.execute_mask = 1;
10353 alu.update_pred = 1;
10354
10355 alu.dst.sel = ctx->temp_reg;
10356 alu.dst.write = 1;
10357 alu.dst.chan = 0;
10358
10359 alu.src[0] = *src;
10360 alu.src[1].sel = V_SQ_ALU_SRC_0;
10361 alu.src[1].chan = 0;
10362
10363 alu.last = 1;
10364
10365 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10366 if (r)
10367 return r;
10368 return 0;
10369 }
10370
10371 static int pops(struct r600_shader_ctx *ctx, int pops)
10372 {
10373 unsigned force_pop = ctx->bc->force_add_cf;
10374
10375 if (!force_pop) {
10376 int alu_pop = 3;
10377 if (ctx->bc->cf_last) {
10378 if (ctx->bc->cf_last->op == CF_OP_ALU)
10379 alu_pop = 0;
10380 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
10381 alu_pop = 1;
10382 }
10383 alu_pop += pops;
10384 if (alu_pop == 1) {
10385 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
10386 ctx->bc->force_add_cf = 1;
10387 } else if (alu_pop == 2) {
10388 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
10389 ctx->bc->force_add_cf = 1;
10390 } else {
10391 force_pop = 1;
10392 }
10393 }
10394
10395 if (force_pop) {
10396 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
10397 ctx->bc->cf_last->pop_count = pops;
10398 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10399 }
10400
10401 return 0;
10402 }
10403
10404 static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
10405 unsigned reason)
10406 {
10407 struct r600_stack_info *stack = &ctx->bc->stack;
10408 unsigned elements;
10409 int entries;
10410
10411 unsigned entry_size = stack->entry_size;
10412
10413 elements = (stack->loop + stack->push_wqm ) * entry_size;
10414 elements += stack->push;
10415
10416 switch (ctx->bc->chip_class) {
10417 case R600:
10418 case R700:
10419 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
10420 * the stack must be reserved to hold the current active/continue
10421 * masks */
10422 if (reason == FC_PUSH_VPM || stack->push > 0) {
10423 elements += 2;
10424 }
10425 break;
10426
10427 case CAYMAN:
10428 /* r9xx: any stack operation on empty stack consumes 2 additional
10429 * elements */
10430 elements += 2;
10431
10432 /* fallthrough */
10433 /* FIXME: do the two elements added above cover the cases for the
10434 * r8xx+ below? */
10435
10436 case EVERGREEN:
10437 /* r8xx+: 2 extra elements are not always required, but one extra
10438 * element must be added for each of the following cases:
10439 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
10440 * stack usage.
10441 * (Currently we don't use ALU_ELSE_AFTER.)
10442 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
10443 * PUSH instruction executed.
10444 *
10445 * NOTE: it seems we also need to reserve additional element in some
10446 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
10447 * then STACK_SIZE should be 2 instead of 1 */
10448 if (reason == FC_PUSH_VPM || stack->push > 0) {
10449 elements += 1;
10450 }
10451 break;
10452
10453 default:
10454 assert(0);
10455 break;
10456 }
10457
10458 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
10459 * for all chips, so we use 4 in the final formula, not the real entry_size
10460 * for the chip */
10461 entry_size = 4;
10462
10463 entries = (elements + (entry_size - 1)) / entry_size;
10464
10465 if (entries > stack->max_entries)
10466 stack->max_entries = entries;
10467 return elements;
10468 }
10469
10470 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10471 {
10472 switch(reason) {
10473 case FC_PUSH_VPM:
10474 --ctx->bc->stack.push;
10475 assert(ctx->bc->stack.push >= 0);
10476 break;
10477 case FC_PUSH_WQM:
10478 --ctx->bc->stack.push_wqm;
10479 assert(ctx->bc->stack.push_wqm >= 0);
10480 break;
10481 case FC_LOOP:
10482 --ctx->bc->stack.loop;
10483 assert(ctx->bc->stack.loop >= 0);
10484 break;
10485 default:
10486 assert(0);
10487 break;
10488 }
10489 }
10490
10491 static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10492 {
10493 switch (reason) {
10494 case FC_PUSH_VPM:
10495 ++ctx->bc->stack.push;
10496 break;
10497 case FC_PUSH_WQM:
10498 ++ctx->bc->stack.push_wqm;
10499 break;
10500 case FC_LOOP:
10501 ++ctx->bc->stack.loop;
10502 break;
10503 default:
10504 assert(0);
10505 }
10506
10507 return callstack_update_max_depth(ctx, reason);
10508 }
10509
10510 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10511 {
10512 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10513
10514 sp->mid = realloc((void *)sp->mid,
10515 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10516 sp->mid[sp->num_mid] = ctx->bc->cf_last;
10517 sp->num_mid++;
10518 }
10519
10520 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10521 {
10522 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10523 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10524 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10525 ctx->bc->fc_sp++;
10526 }
10527
10528 static void fc_poplevel(struct r600_shader_ctx *ctx)
10529 {
10530 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10531 free(sp->mid);
10532 sp->mid = NULL;
10533 sp->num_mid = 0;
10534 sp->start = NULL;
10535 sp->type = 0;
10536 ctx->bc->fc_sp--;
10537 }
10538
#if 0
/*
 * Disabled control-flow helpers.  None of this is compiled; it is kept only
 * as a sketch.  The stray closing parentheses that would have broken the
 * build when enabling the block have been removed.
 */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
10586
10587 static int emit_if(struct r600_shader_ctx *ctx, int opcode,
10588 struct r600_bytecode_alu_src *src)
10589 {
10590 int alu_type = CF_OP_ALU_PUSH_BEFORE;
10591 bool needs_workaround = false;
10592 int elems = callstack_push(ctx, FC_PUSH_VPM);
10593
10594 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
10595 needs_workaround = true;
10596
10597 if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
10598 unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
10599 unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;
10600
10601 if (elems && (!dmod1 || !dmod2))
10602 needs_workaround = true;
10603 }
10604
10605 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
10606 * LOOP_STARTxxx for nested loops may put the branch stack into a state
10607 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
10608 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
10609 if (needs_workaround) {
10610 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
10611 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10612 alu_type = CF_OP_ALU;
10613 }
10614
10615 emit_logic_pred(ctx, opcode, alu_type, src);
10616
10617 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
10618
10619 fc_pushlevel(ctx, FC_IF);
10620
10621 return 0;
10622 }
10623
10624 static int tgsi_if(struct r600_shader_ctx *ctx)
10625 {
10626 struct r600_bytecode_alu_src alu_src;
10627 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10628
10629 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10630 }
10631
10632 static int tgsi_uif(struct r600_shader_ctx *ctx)
10633 {
10634 struct r600_bytecode_alu_src alu_src;
10635 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10636 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10637 }
10638
10639 static int tgsi_else(struct r600_shader_ctx *ctx)
10640 {
10641 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10642 ctx->bc->cf_last->pop_count = 1;
10643
10644 fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10645 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10646 return 0;
10647 }
10648
10649 static int tgsi_endif(struct r600_shader_ctx *ctx)
10650 {
10651 int offset = 2;
10652 pops(ctx, 1);
10653 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
10654 R600_ERR("if/endif unbalanced in shader\n");
10655 return -1;
10656 }
10657
10658 /* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
10659 if (ctx->bc->cf_last->eg_alu_extended)
10660 offset += 2;
10661
10662 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
10663 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
10664 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
10665 } else {
10666 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
10667 }
10668 fc_poplevel(ctx);
10669
10670 callstack_pop(ctx, FC_PUSH_VPM);
10671 return 0;
10672 }
10673
10674 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10675 {
10676 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10677 * limited to 4096 iterations, like the other LOOP_* instructions. */
10678 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10679
10680 fc_pushlevel(ctx, FC_LOOP);
10681
10682 /* check stack depth */
10683 callstack_push(ctx, FC_LOOP);
10684 return 0;
10685 }
10686
10687 static int tgsi_endloop(struct r600_shader_ctx *ctx)
10688 {
10689 int i;
10690
10691 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10692
10693 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10694 R600_ERR("loop/endloop in shader code are not paired.\n");
10695 return -EINVAL;
10696 }
10697
10698 /* fixup loop pointers - from r600isa
10699 LOOP END points to CF after LOOP START,
10700 LOOP START point to CF after LOOP END
10701 BRK/CONT point to LOOP END CF
10702 */
10703 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10704
10705 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10706
10707 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10708 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10709 }
10710 /* XXX add LOOPRET support */
10711 fc_poplevel(ctx);
10712 callstack_pop(ctx, FC_LOOP);
10713 return 0;
10714 }
10715
10716 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10717 {
10718 unsigned int fscp;
10719
10720 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10721 {
10722 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10723 break;
10724 }
10725
10726 if (fscp == 0) {
10727 R600_ERR("Break not inside loop/endloop pair\n");
10728 return -EINVAL;
10729 }
10730
10731 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10732
10733 fc_set_mid(ctx, fscp - 1);
10734
10735 return 0;
10736 }
10737
10738 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
10739 {
10740 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10741 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
10742 int r;
10743
10744 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10745 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
10746
10747 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10748 if (!r) {
10749 ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
10750 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10751 return emit_inc_ring_offset(ctx, stream, TRUE);
10752 }
10753 return r;
10754 }
10755
10756 static int tgsi_umad(struct r600_shader_ctx *ctx)
10757 {
10758 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10759 struct r600_bytecode_alu alu;
10760 int i, j, r;
10761 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10762
10763 /* src0 * src1 */
10764 for (i = 0; i < lasti + 1; i++) {
10765 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10766 continue;
10767
10768 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10769
10770 alu.dst.chan = i;
10771 alu.dst.sel = ctx->temp_reg;
10772 alu.dst.write = 1;
10773
10774 alu.op = ALU_OP2_MULLO_UINT;
10775 for (j = 0; j < 2; j++) {
10776 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10777 }
10778
10779 alu.last = 1;
10780 r = emit_mul_int_op(ctx->bc, &alu);
10781 if (r)
10782 return r;
10783 }
10784
10785
10786 for (i = 0; i < lasti + 1; i++) {
10787 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10788 continue;
10789
10790 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10791 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10792
10793 alu.op = ALU_OP2_ADD_INT;
10794
10795 alu.src[0].sel = ctx->temp_reg;
10796 alu.src[0].chan = i;
10797
10798 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10799 if (i == lasti) {
10800 alu.last = 1;
10801 }
10802 r = r600_bytecode_add_alu(ctx->bc, &alu);
10803 if (r)
10804 return r;
10805 }
10806 return 0;
10807 }
10808
10809 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
10810 {
10811 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10812 struct r600_bytecode_alu alu;
10813 int r, i;
10814 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10815
10816 /* temp.xy = f32_to_f16(src) */
10817 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10818 alu.op = ALU_OP1_FLT32_TO_FLT16;
10819 alu.dst.chan = 0;
10820 alu.dst.sel = ctx->temp_reg;
10821 alu.dst.write = 1;
10822 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10823 r = r600_bytecode_add_alu(ctx->bc, &alu);
10824 if (r)
10825 return r;
10826 alu.dst.chan = 1;
10827 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10828 alu.last = 1;
10829 r = r600_bytecode_add_alu(ctx->bc, &alu);
10830 if (r)
10831 return r;
10832
10833 /* dst.x = temp.y * 0x10000 + temp.x */
10834 for (i = 0; i < lasti + 1; i++) {
10835 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10836 continue;
10837
10838 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10839 alu.op = ALU_OP3_MULADD_UINT24;
10840 alu.is_op3 = 1;
10841 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10842 alu.last = i == lasti;
10843 alu.src[0].sel = ctx->temp_reg;
10844 alu.src[0].chan = 1;
10845 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10846 alu.src[1].value = 0x10000;
10847 alu.src[2].sel = ctx->temp_reg;
10848 alu.src[2].chan = 0;
10849 r = r600_bytecode_add_alu(ctx->bc, &alu);
10850 if (r)
10851 return r;
10852 }
10853
10854 return 0;
10855 }
10856
10857 static int tgsi_up2h(struct r600_shader_ctx *ctx)
10858 {
10859 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10860 struct r600_bytecode_alu alu;
10861 int r, i;
10862 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10863
10864 /* temp.x = src.x */
10865 /* note: no need to mask out the high bits */
10866 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10867 alu.op = ALU_OP1_MOV;
10868 alu.dst.chan = 0;
10869 alu.dst.sel = ctx->temp_reg;
10870 alu.dst.write = 1;
10871 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10872 r = r600_bytecode_add_alu(ctx->bc, &alu);
10873 if (r)
10874 return r;
10875
10876 /* temp.y = src.x >> 16 */
10877 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10878 alu.op = ALU_OP2_LSHR_INT;
10879 alu.dst.chan = 1;
10880 alu.dst.sel = ctx->temp_reg;
10881 alu.dst.write = 1;
10882 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10883 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10884 alu.src[1].value = 16;
10885 alu.last = 1;
10886 r = r600_bytecode_add_alu(ctx->bc, &alu);
10887 if (r)
10888 return r;
10889
10890 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10891 for (i = 0; i < lasti + 1; i++) {
10892 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10893 continue;
10894 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10895 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10896 alu.op = ALU_OP1_FLT16_TO_FLT32;
10897 alu.src[0].sel = ctx->temp_reg;
10898 alu.src[0].chan = i % 2;
10899 alu.last = i == lasti;
10900 r = r600_bytecode_add_alu(ctx->bc, &alu);
10901 if (r)
10902 return r;
10903 }
10904
10905 return 0;
10906 }
10907
10908 static int tgsi_bfe(struct r600_shader_ctx *ctx)
10909 {
10910 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10911 struct r600_bytecode_alu alu;
10912 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10913 int r, i;
10914 int dst = -1;
10915
10916 if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
10917 inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
10918 (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
10919 inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
10920 dst = r600_get_temp(ctx);
10921
10922 r = tgsi_op3_dst(ctx, dst);
10923 if (r)
10924 return r;
10925
10926 for (i = 0; i < lasti + 1; i++) {
10927 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10928 alu.op = ALU_OP2_SETGE_INT;
10929 r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
10930 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10931 alu.src[1].value = 32;
10932 alu.dst.sel = ctx->temp_reg;
10933 alu.dst.chan = i;
10934 alu.dst.write = 1;
10935 if (i == lasti)
10936 alu.last = 1;
10937 r = r600_bytecode_add_alu(ctx->bc, &alu);
10938 if (r)
10939 return r;
10940 }
10941
10942 for (i = 0; i < lasti + 1; i++) {
10943 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10944 alu.op = ALU_OP3_CNDE_INT;
10945 alu.is_op3 = 1;
10946 alu.src[0].sel = ctx->temp_reg;
10947 alu.src[0].chan = i;
10948
10949 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10950 if (dst != -1)
10951 alu.src[1].sel = dst;
10952 else
10953 alu.src[1].sel = alu.dst.sel;
10954 alu.src[1].chan = i;
10955 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
10956 alu.dst.write = 1;
10957 if (i == lasti)
10958 alu.last = 1;
10959 r = r600_bytecode_add_alu(ctx->bc, &alu);
10960 if (r)
10961 return r;
10962 }
10963
10964 return 0;
10965 }
10966
10967 static int tgsi_clock(struct r600_shader_ctx *ctx)
10968 {
10969 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10970 struct r600_bytecode_alu alu;
10971 int r;
10972
10973 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10974 alu.op = ALU_OP1_MOV;
10975 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10976 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10977 r = r600_bytecode_add_alu(ctx->bc, &alu);
10978 if (r)
10979 return r;
10980 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10981 alu.op = ALU_OP1_MOV;
10982 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10983 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10984 alu.last = 1;
10985 r = r600_bytecode_add_alu(ctx->bc, &alu);
10986 if (r)
10987 return r;
10988 return 0;
10989 }
10990
10991 static int emit_u64add(struct r600_shader_ctx *ctx, int op,
10992 int treg,
10993 int src0_sel, int src0_chan,
10994 int src1_sel, int src1_chan)
10995 {
10996 struct r600_bytecode_alu alu;
10997 int r;
10998 int opc;
10999
11000 if (op == ALU_OP2_ADD_INT)
11001 opc = ALU_OP2_ADDC_UINT;
11002 else
11003 opc = ALU_OP2_SUBB_UINT;
11004
11005 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11006 alu.op = op; ;
11007 alu.dst.sel = treg;
11008 alu.dst.chan = 0;
11009 alu.dst.write = 1;
11010 alu.src[0].sel = src0_sel;
11011 alu.src[0].chan = src0_chan + 0;
11012 alu.src[1].sel = src1_sel;
11013 alu.src[1].chan = src1_chan + 0;
11014 alu.src[1].neg = 0;
11015 r = r600_bytecode_add_alu(ctx->bc, &alu);
11016 if (r)
11017 return r;
11018
11019 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11020 alu.op = op;
11021 alu.dst.sel = treg;
11022 alu.dst.chan = 1;
11023 alu.dst.write = 1;
11024 alu.src[0].sel = src0_sel;
11025 alu.src[0].chan = src0_chan + 1;
11026 alu.src[1].sel = src1_sel;
11027 alu.src[1].chan = src1_chan + 1;
11028 alu.src[1].neg = 0;
11029 r = r600_bytecode_add_alu(ctx->bc, &alu);
11030 if (r)
11031 return r;
11032
11033 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11034 alu.op = opc;
11035 alu.dst.sel = treg;
11036 alu.dst.chan = 2;
11037 alu.dst.write = 1;
11038 alu.last = 1;
11039 alu.src[0].sel = src0_sel;
11040 alu.src[0].chan = src0_chan + 0;
11041 alu.src[1].sel = src1_sel;
11042 alu.src[1].chan = src1_chan + 0;
11043 alu.src[1].neg = 0;
11044 r = r600_bytecode_add_alu(ctx->bc, &alu);
11045 if (r)
11046 return r;
11047
11048 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11049 alu.op = op;
11050 alu.dst.sel = treg;
11051 alu.dst.chan = 1;
11052 alu.dst.write = 1;
11053 alu.src[0].sel = treg;
11054 alu.src[0].chan = 1;
11055 alu.src[1].sel = treg;
11056 alu.src[1].chan = 2;
11057 alu.last = 1;
11058 r = r600_bytecode_add_alu(ctx->bc, &alu);
11059 if (r)
11060 return r;
11061 return 0;
11062 }
11063
11064 static int egcm_u64add(struct r600_shader_ctx *ctx)
11065 {
11066 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11067 struct r600_bytecode_alu alu;
11068 int r;
11069 int treg = ctx->temp_reg;
11070 int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
11071
11072 if (ctx->src[1].neg) {
11073 op = ALU_OP2_SUB_INT;
11074 opc = ALU_OP2_SUBB_UINT;
11075 }
11076 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11077 alu.op = op; ;
11078 alu.dst.sel = treg;
11079 alu.dst.chan = 0;
11080 alu.dst.write = 1;
11081 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11082 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11083 alu.src[1].neg = 0;
11084 r = r600_bytecode_add_alu(ctx->bc, &alu);
11085 if (r)
11086 return r;
11087
11088 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11089 alu.op = op;
11090 alu.dst.sel = treg;
11091 alu.dst.chan = 1;
11092 alu.dst.write = 1;
11093 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11094 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11095 alu.src[1].neg = 0;
11096 r = r600_bytecode_add_alu(ctx->bc, &alu);
11097 if (r)
11098 return r;
11099
11100 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11101 alu.op = opc ;
11102 alu.dst.sel = treg;
11103 alu.dst.chan = 2;
11104 alu.dst.write = 1;
11105 alu.last = 1;
11106 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11107 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11108 alu.src[1].neg = 0;
11109 r = r600_bytecode_add_alu(ctx->bc, &alu);
11110 if (r)
11111 return r;
11112
11113 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11114 alu.op = op;
11115 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11116 alu.src[0].sel = treg;
11117 alu.src[0].chan = 1;
11118 alu.src[1].sel = treg;
11119 alu.src[1].chan = 2;
11120 alu.last = 1;
11121 r = r600_bytecode_add_alu(ctx->bc, &alu);
11122 if (r)
11123 return r;
11124 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11125 alu.op = ALU_OP1_MOV;
11126 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11127 alu.src[0].sel = treg;
11128 alu.src[0].chan = 0;
11129 alu.last = 1;
11130 r = r600_bytecode_add_alu(ctx->bc, &alu);
11131 if (r)
11132 return r;
11133 return 0;
11134 }
11135
11136 /* result.y = mul_high a, b
11137 result.x = mul a,b
11138 result.y += a.x * b.y + a.y * b.x;
11139 */
11140 static int egcm_u64mul(struct r600_shader_ctx *ctx)
11141 {
11142 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11143 struct r600_bytecode_alu alu;
11144 int r;
11145 int treg = ctx->temp_reg;
11146
11147 /* temp.x = mul_lo a.x, b.x */
11148 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11149 alu.op = ALU_OP2_MULLO_UINT;
11150 alu.dst.sel = treg;
11151 alu.dst.chan = 0;
11152 alu.dst.write = 1;
11153 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11154 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11155 r = emit_mul_int_op(ctx->bc, &alu);
11156 if (r)
11157 return r;
11158
11159 /* temp.y = mul_hi a.x, b.x */
11160 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11161 alu.op = ALU_OP2_MULHI_UINT;
11162 alu.dst.sel = treg;
11163 alu.dst.chan = 1;
11164 alu.dst.write = 1;
11165 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11166 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11167 r = emit_mul_int_op(ctx->bc, &alu);
11168 if (r)
11169 return r;
11170
11171 /* temp.z = mul a.x, b.y */
11172 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11173 alu.op = ALU_OP2_MULLO_UINT;
11174 alu.dst.sel = treg;
11175 alu.dst.chan = 2;
11176 alu.dst.write = 1;
11177 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11178 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11179 r = emit_mul_int_op(ctx->bc, &alu);
11180 if (r)
11181 return r;
11182
11183 /* temp.w = mul a.y, b.x */
11184 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11185 alu.op = ALU_OP2_MULLO_UINT;
11186 alu.dst.sel = treg;
11187 alu.dst.chan = 3;
11188 alu.dst.write = 1;
11189 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11190 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11191 r = emit_mul_int_op(ctx->bc, &alu);
11192 if (r)
11193 return r;
11194
11195 /* temp.z = temp.z + temp.w */
11196 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11197 alu.op = ALU_OP2_ADD_INT;
11198 alu.dst.sel = treg;
11199 alu.dst.chan = 2;
11200 alu.dst.write = 1;
11201 alu.src[0].sel = treg;
11202 alu.src[0].chan = 2;
11203 alu.src[1].sel = treg;
11204 alu.src[1].chan = 3;
11205 alu.last = 1;
11206 r = r600_bytecode_add_alu(ctx->bc, &alu);
11207 if (r)
11208 return r;
11209
11210 /* temp.y = temp.y + temp.z */
11211 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11212 alu.op = ALU_OP2_ADD_INT;
11213 alu.dst.sel = treg;
11214 alu.dst.chan = 1;
11215 alu.dst.write = 1;
11216 alu.src[0].sel = treg;
11217 alu.src[0].chan = 1;
11218 alu.src[1].sel = treg;
11219 alu.src[1].chan = 2;
11220 alu.last = 1;
11221 r = r600_bytecode_add_alu(ctx->bc, &alu);
11222 if (r)
11223 return r;
11224
11225 /* dst.x = temp.x */
11226 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11227 alu.op = ALU_OP1_MOV;
11228 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11229 alu.src[0].sel = treg;
11230 alu.src[0].chan = 0;
11231 r = r600_bytecode_add_alu(ctx->bc, &alu);
11232 if (r)
11233 return r;
11234
11235 /* dst.y = temp.y */
11236 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11237 alu.op = ALU_OP1_MOV;
11238 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11239 alu.src[0].sel = treg;
11240 alu.src[0].chan = 1;
11241 alu.last = 1;
11242 r = r600_bytecode_add_alu(ctx->bc, &alu);
11243 if (r)
11244 return r;
11245
11246 return 0;
11247 }
11248
11249 static int emit_u64sge(struct r600_shader_ctx *ctx,
11250 int treg,
11251 int src0_sel, int src0_base_chan,
11252 int src1_sel, int src1_base_chan)
11253 {
11254 int r;
11255 /* for 64-bit sge */
11256 /* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
11257 r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
11258 treg, 1,
11259 src0_sel, src0_base_chan + 1,
11260 src1_sel, src1_base_chan + 1);
11261 if (r)
11262 return r;
11263
11264 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11265 treg, 0,
11266 src0_sel, src0_base_chan,
11267 src1_sel, src1_base_chan);
11268 if (r)
11269 return r;
11270
11271 r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
11272 treg, 2,
11273 src0_sel, src0_base_chan + 1,
11274 src1_sel, src1_base_chan + 1);
11275 if (r)
11276 return r;
11277
11278 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11279 treg, 0,
11280 treg, 0,
11281 treg, 2);
11282 if (r)
11283 return r;
11284
11285 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11286 treg, 0,
11287 treg, 0,
11288 treg, 1);
11289 if (r)
11290 return r;
11291 return 0;
11292 }
11293
/* this isn't a complete div it's just enough for qbo shader to work
 *
 * Emits a fully unrolled shift-and-subtract division of a 64-bit numerator
 * (src0.xy) by a literal denominator whose high dword is zero.
 *
 * Returns -1 when source 1 is not a literal, when its high dword is
 * non-zero, or when the write mask is not exactly .xy; otherwise 0 or the
 * first emission error.
 *
 * Register usage:
 *   tmp_num.xy - running numerator/remainder (low, high)
 *   tmp_num.zw - quotient being assembled (low, high)
 *   treg       - comparison flags and the shifted denominator
 *   sub_tmp    - result of the 64-bit compare / subtract helpers
 *
 * NOTE(review): the calls below pass a value in the "channel" argument of
 * single_alu_op2 whenever the selector is V_SQ_ALU_SRC_LITERAL; presumably
 * the helper treats that argument as the literal value - confirm against
 * its definition.
 */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a const with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg, so allocate dedicated temporaries
	 * instead of using it here */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp quot are tmp_num.zw */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	/* alu_denom_hi is filled in but not read again in this function */
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* MOV tmp_num.z, 0 (quotient low) */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* MOV tmp_num.w, 0 (quotient high) */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg 0 is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check denom hi for 0, but we know it is already */
	/* treg.y = num_hi >= denom_lo */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* outer IF: only when num_hi >= denom_lo can the high quotient dword
	 * be non-zero */
	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb treg.x = msb(src[1].x) first; computed on the CPU since the
	 * denominator is a literal */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here: first loop, quotient bits 31..1 of the high
	 * dword (tmp_num.w) */
	for (i = 0; i < 31; i++) {
		/* treg.z = i >= treg.x */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* treg.y = tmp_num.y >= (denom_lo << (31 - i)) */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		/* treg.y &= treg.z */
		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		/* tmp_num.y -= denom_lo << (31 - i) */
		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		/* set quotient bit (31 - i) of the high dword */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration.
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	/* set quotient bit 0 of the high dword */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	/* closes the IF of the peeled iteration */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* closes the outer "num_hi >= denom_lo" IF */
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll: quotient bits 31..1 of the low
	 * dword (tmp_num.z), using full 64-bit compares and subtracts */
	for (i = 0; i < 31; i++) {
		/* treg.y = (63 - (31 - i)) >= treg.x */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* treg.zw = denom << (31 - i), computed on the CPU */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		/* sub_tmp.x = tmp_num.xy >= treg.zw (unsigned 64-bit) */
		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		/* treg.y &= sub_tmp.x */
		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;


		/* sub_tmp.xy = tmp_num.xy - treg.zw */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		/* copy the new remainder back into tmp_num.xy */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		/* set quotient bit (31 - i) of the low dword */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* the difference lands in sub_tmp and is not copied back to tmp_num
	 * here; tmp_num.xy is not read again in this function */
	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	/* set quotient bit 0 of the low dword */
	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* dst.x = quotient low (tmp_num.z) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = quotient high (tmp_num.w); closes the ALU group */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
11624
11625 static int egcm_u64sne(struct r600_shader_ctx *ctx)
11626 {
11627 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11628 struct r600_bytecode_alu alu;
11629 int r;
11630 int treg = ctx->temp_reg;
11631
11632 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11633 alu.op = ALU_OP2_SETNE_INT;
11634 alu.dst.sel = treg;
11635 alu.dst.chan = 0;
11636 alu.dst.write = 1;
11637 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11638 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11639 r = r600_bytecode_add_alu(ctx->bc, &alu);
11640 if (r)
11641 return r;
11642
11643 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11644 alu.op = ALU_OP2_SETNE_INT;
11645 alu.dst.sel = treg;
11646 alu.dst.chan = 1;
11647 alu.dst.write = 1;
11648 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11649 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11650 alu.last = 1;
11651 r = r600_bytecode_add_alu(ctx->bc, &alu);
11652 if (r)
11653 return r;
11654
11655 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11656 alu.op = ALU_OP2_OR_INT;
11657 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11658 alu.src[0].sel = treg;
11659 alu.src[0].chan = 0;
11660 alu.src[1].sel = treg;
11661 alu.src[1].chan = 1;
11662 alu.last = 1;
11663 r = r600_bytecode_add_alu(ctx->bc, &alu);
11664 if (r)
11665 return r;
11666 return 0;
11667 }
11668
11669 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
11670 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
11671 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
11672 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
11673
11674 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
11675
11676 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
11677 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
11678 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
11679 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
11680 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
11681 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11682 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11683 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
11684 /* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
11685 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
11686 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
11687 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
11688 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
11689 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
11690 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
11691 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
11692 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
11693 [21] = { ALU_OP0_NOP, tgsi_unsupported},
11694 [22] = { ALU_OP0_NOP, tgsi_unsupported},
11695 [23] = { ALU_OP0_NOP, tgsi_unsupported},
11696 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
11697 [25] = { ALU_OP0_NOP, tgsi_unsupported},
11698 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
11699 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
11700 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
11701 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
11702 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
11703 [31] = { ALU_OP0_NOP, tgsi_unsupported},
11704 [32] = { ALU_OP0_NOP, tgsi_unsupported},
11705 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
11706 [34] = { ALU_OP0_NOP, tgsi_unsupported},
11707 [35] = { ALU_OP0_NOP, tgsi_unsupported},
11708 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
11709 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11710 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11711 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
11712 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
11713 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
11714 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
11715 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11716 [44] = { ALU_OP0_NOP, tgsi_unsupported},
11717 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
11718 [46] = { ALU_OP0_NOP, tgsi_unsupported},
11719 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
11720 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
11721 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
11722 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
11723 [51] = { ALU_OP0_NOP, tgsi_unsupported},
11724 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
11725 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
11726 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
11727 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
11728 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
11729 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
11730 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11731 [59] = { ALU_OP0_NOP, tgsi_unsupported},
11732 [60] = { ALU_OP0_NOP, tgsi_unsupported},
11733 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
11734 [62] = { ALU_OP0_NOP, tgsi_unsupported},
11735 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
11736 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
11737 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
11738 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
11739 [67] = { ALU_OP0_NOP, tgsi_unsupported},
11740 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11741 [69] = { ALU_OP0_NOP, tgsi_unsupported},
11742 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
11743 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11744 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11745 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
11746 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
11747 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
11748 [76] = { ALU_OP0_NOP, tgsi_unsupported},
11749 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
11750 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
11751 [TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
11752 [TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
11753 [81] = { ALU_OP0_NOP, tgsi_unsupported},
11754 [82] = { ALU_OP0_NOP, tgsi_unsupported},
11755 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
11756 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
11757 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
11758 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
11759 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
11760 [88] = { ALU_OP0_NOP, tgsi_unsupported},
11761 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
11762 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
11763 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
11764 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
11765 [93] = { ALU_OP0_NOP, tgsi_unsupported},
11766 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
11767 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11768 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
11769 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
11770 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
11771 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
11772 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11773 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
11774 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11775 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11776 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
11777 [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
11778 [106] = { ALU_OP0_NOP, tgsi_unsupported},
11779 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
11780 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
11781 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
11782 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
11783 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
11784 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
11785 [113] = { ALU_OP0_NOP, tgsi_unsupported},
11786 [114] = { ALU_OP0_NOP, tgsi_unsupported},
11787 [115] = { ALU_OP0_NOP, tgsi_unsupported},
11788 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
11789 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
11790 [TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
11791 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
11792 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
11793 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
11794 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
11795 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
11796 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
11797 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
11798 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
11799 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
11800 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
11801 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
11802 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
11803 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
11804 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
11805 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
11806 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
11807 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
11808 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
11809 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
11810 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
11811 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
11812 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
11813 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
11814 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
11815 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
11816 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
11817 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
11818 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
11819 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
11820 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
11821 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
11822 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
11823 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
11824 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
11825 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
11826 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
11827 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
11828 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
11829 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
11830 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
11831 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
11832 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
11833 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
11834 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
11835 [163] = { ALU_OP0_NOP, tgsi_unsupported},
11836 [164] = { ALU_OP0_NOP, tgsi_unsupported},
11837 [165] = { ALU_OP0_NOP, tgsi_unsupported},
11838 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
11839 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
11840 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
11841 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
11842 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
11843 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
11844 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
11845 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
11846 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
11847 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
11848 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
11849 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
11850 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11851 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11852 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
11853 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
11854 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
11855 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
11856 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
11857 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
11858 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
11859 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
11860 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
11861 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
11862 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
11863 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
11864 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
11865 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
11866 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
11867 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
11868 };
11869
/*
 * TGSI opcode dispatch table, "eg" variant (presumably the Evergreen chip
 * family -- TODO confirm against the code that selects between the
 * per-family tables).
 *
 * The array is indexed by TGSI opcode number through designated
 * initializers. Every entry is a pair:
 *   - first member:  an opcode constant (ALU_OP*, FETCH_OP_*, CF_OP_*,
 *                    V_RAT_INST_*) or a plain 0;
 *   - second member: the name of a function (tgsi_*, egcm_*, cayman_*),
 *                    presumably the emit handler invoked for that TGSI
 *                    opcode -- the struct definition and the caller are
 *                    not visible here.
 *
 * Slots designated by a bare number instead of a TGSI_OPCODE_* name all map
 * to tgsi_unsupported, with the single exception of [103], which maps to
 * tgsi_tex with FETCH_OP_GET_TEXTURE_RESINFO.
 *
 * Several double-precision entries (DMUL, DDIV, DRCP, DSQRT, DRSQ) name
 * cayman_* functions even though this is the "eg" table.
 * NOTE(review): presumably those helpers are shared between both chip
 * families -- confirm before changing them.
 *
 * NOTE(review): with designated initializers a later entry overrides an
 * earlier one for the same index without any diagnostic by default, so keep
 * the order and make sure a numeric slot never collides with a named opcode
 * when editing this table.
 */
11870 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
11871 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
11872 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
11873 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
11874 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
11875 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
11876 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
11877 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
11878 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
11879 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
11880 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11881 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11882 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
11883 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
11884 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
11885 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
11886 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
11887 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
11888 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
11889 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
11890 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
11891 [21] = { ALU_OP0_NOP, tgsi_unsupported},
11892 [22] = { ALU_OP0_NOP, tgsi_unsupported},
11893 [23] = { ALU_OP0_NOP, tgsi_unsupported},
11894 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
11895 [25] = { ALU_OP0_NOP, tgsi_unsupported},
11896 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
11897 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
11898 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
11899 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
11900 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
11901 [31] = { ALU_OP0_NOP, tgsi_unsupported},
11902 [32] = { ALU_OP0_NOP, tgsi_unsupported},
11903 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
11904 [34] = { ALU_OP0_NOP, tgsi_unsupported},
11905 [35] = { ALU_OP0_NOP, tgsi_unsupported},
11906 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
11907 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11908 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11909 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
11910 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
11911 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
11912 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
11913 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11914 [44] = { ALU_OP0_NOP, tgsi_unsupported},
11915 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
11916 [46] = { ALU_OP0_NOP, tgsi_unsupported},
11917 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
11918 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
11919 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
11920 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
11921 [51] = { ALU_OP0_NOP, tgsi_unsupported},
11922 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
11923 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
11924 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
11925 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
11926 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
11927 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
11928 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11929 [59] = { ALU_OP0_NOP, tgsi_unsupported},
11930 [60] = { ALU_OP0_NOP, tgsi_unsupported},
11931 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
11932 [62] = { ALU_OP0_NOP, tgsi_unsupported},
11933 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
11934 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
11935 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
11936 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
11937 [67] = { ALU_OP0_NOP, tgsi_unsupported},
11938 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11939 [69] = { ALU_OP0_NOP, tgsi_unsupported},
11940 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
11941 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11942 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11943 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
11944 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
11945 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
11946 [76] = { ALU_OP0_NOP, tgsi_unsupported},
11947 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
11948 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
11949 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11950 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11951 [82] = { ALU_OP0_NOP, tgsi_unsupported},
11952 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
11953 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
11954 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
11955 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
11956 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
11957 [88] = { ALU_OP0_NOP, tgsi_unsupported},
11958 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
11959 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
11960 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
11961 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
11962 [93] = { ALU_OP0_NOP, tgsi_unsupported},
11963 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
11964 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11965 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
11966 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
11967 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
11968 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
11969 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11970 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
11971 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11972 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11973 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
11974 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
11975 [106] = { ALU_OP0_NOP, tgsi_unsupported},
11976 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
11977 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
11978 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
11979 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
11980 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
11981 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
11982 [113] = { ALU_OP0_NOP, tgsi_unsupported},
11983 [114] = { ALU_OP0_NOP, tgsi_unsupported},
11984 [115] = { ALU_OP0_NOP, tgsi_unsupported},
11985 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
11986 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
11987 /* TGSI_OPCODE_DFMA has its entry further down, right after TGSI_OPCODE_DMAD among the double-precision opcodes */
11988 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
11989 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
11990 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
11991 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
11992 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
11993 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
11994 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
11995 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
11996 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
11997 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
11998 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
11999 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
12000 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
12001 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
12002 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
12003 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
12004 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
12005 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
12006 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
12007 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
12008 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12009 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
12010 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12011 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
12012 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
12013 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12014 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
12015 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
12016 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
12017 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
12018 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
12019 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
12020 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
12021 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
12022 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
12023 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
12024 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
12025 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
12026 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12027 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
12028 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
12029 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
12030 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
12031 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
12032 [163] = { ALU_OP0_NOP, tgsi_unsupported},
12033 [164] = { ALU_OP0_NOP, tgsi_unsupported},
12034 [165] = { ALU_OP0_NOP, tgsi_unsupported},
12035 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12036 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12037 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12038 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12039 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12040 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12041 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12042 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12043 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12044 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12045 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12046 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
12047 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
12048 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
12049 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
12050 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
12051 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
12052 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
12053 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
12054 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
12055 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
12056 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
12057 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
12058 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
12059 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
12060 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
12061 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
12062 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
12063 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
12064 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12065 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
12066 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
12067 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
12068 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
12069 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
12070 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
12071 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
12072 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
12073 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12074 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12075 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12076 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12077 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12078 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12079 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
12080 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
12081 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
12082 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
12083 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
12084 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12085 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12086 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12087 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12088 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
12089 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
12090 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
12091 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
12092 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
12093 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
12094 };
12095
12096 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
12097 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
12098 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
12099 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
12100 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
12101 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
12102 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
12103 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
12104 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
12105 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
12106 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12107 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12108 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
12109 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
12110 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
12111 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
12112 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
12113 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
12114 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
12115 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
12116 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
12117 [21] = { ALU_OP0_NOP, tgsi_unsupported},
12118 [22] = { ALU_OP0_NOP, tgsi_unsupported},
12119 [23] = { ALU_OP0_NOP, tgsi_unsupported},
12120 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
12121 [25] = { ALU_OP0_NOP, tgsi_unsupported},
12122 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
12123 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
12124 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
12125 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
12126 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
12127 [31] = { ALU_OP0_NOP, tgsi_unsupported},
12128 [32] = { ALU_OP0_NOP, tgsi_unsupported},
12129 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
12130 [34] = { ALU_OP0_NOP, tgsi_unsupported},
12131 [35] = { ALU_OP0_NOP, tgsi_unsupported},
12132 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
12133 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12134 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12135 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
12136 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
12137 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
12138 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
12139 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
12140 [44] = { ALU_OP0_NOP, tgsi_unsupported},
12141 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
12142 [46] = { ALU_OP0_NOP, tgsi_unsupported},
12143 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
12144 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
12145 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
12146 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
12147 [51] = { ALU_OP0_NOP, tgsi_unsupported},
12148 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
12149 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
12150 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
12151 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
12152 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
12153 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
12154 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
12155 [59] = { ALU_OP0_NOP, tgsi_unsupported},
12156 [60] = { ALU_OP0_NOP, tgsi_unsupported},
12157 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
12158 [62] = { ALU_OP0_NOP, tgsi_unsupported},
12159 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
12160 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
12161 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
12162 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
12163 [67] = { ALU_OP0_NOP, tgsi_unsupported},
12164 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
12165 [69] = { ALU_OP0_NOP, tgsi_unsupported},
12166 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
12167 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12168 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
12169 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12170 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
12171 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
12172 [76] = { ALU_OP0_NOP, tgsi_unsupported},
12173 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
12174 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
12175 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12176 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12177 [82] = { ALU_OP0_NOP, tgsi_unsupported},
12178 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
12179 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
12180 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
12181 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
12182 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
12183 [88] = { ALU_OP0_NOP, tgsi_unsupported},
12184 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
12185 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
12186 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
12187 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
12188 [93] = { ALU_OP0_NOP, tgsi_unsupported},
12189 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
12190 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12191 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12192 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12193 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12194 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
12195 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
12196 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
12197 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
12198 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12199 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12200 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12201 [106] = { ALU_OP0_NOP, tgsi_unsupported},
12202 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
12203 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
12204 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
12205 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12206 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12207 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12208 [113] = { ALU_OP0_NOP, tgsi_unsupported},
12209 [114] = { ALU_OP0_NOP, tgsi_unsupported},
12210 [115] = { ALU_OP0_NOP, tgsi_unsupported},
12211 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
12212 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
12213 /* Refer below for TGSI_OPCODE_DFMA */
12214 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
12215 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
12216 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
12217 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
12218 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
12219 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
12220 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
12221 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12222 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
12223 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
12224 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
12225 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
12226 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
12227 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
12228 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
12229 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
12230 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
12231 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
12232 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
12233 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
12234 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12235 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
12236 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12237 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
12238 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
12239 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12240 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
12241 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
12242 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
12243 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
12244 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
12245 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
12246 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
12247 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
12248 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
12249 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
12250 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
12251 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
12252 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12253 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
12254 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
12255 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
12256 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
12257 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
12258 [163] = { ALU_OP0_NOP, tgsi_unsupported},
12259 [164] = { ALU_OP0_NOP, tgsi_unsupported},
12260 [165] = { ALU_OP0_NOP, tgsi_unsupported},
12261 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12262 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12263 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12264 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12265 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12266 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12267 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12268 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12269 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12270 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12271 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12272 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
12273 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
12274 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
12275 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
12276 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
12277 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
12278 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
12279 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
12280 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
12281 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
12282 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
12283 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
12284 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
12285 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
12286 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
12287 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
12288 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
12289 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
12290 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12291 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
12292 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
12293 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
12294 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
12295 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
12296 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
12297 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
12298 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
12299 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12300 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12301 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12302 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12303 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12304 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12305 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
12306 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
12307 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
12308 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
12309 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
12310 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12311 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12312 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12313 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12314 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
12315 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
12316 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
12317 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
12318 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
12319 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
12320 };