nir: Stop passing an options arg to nir_lower_int64()
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600_dump.h"
28 #include "r600d.h"
29 #include "sfn/sfn_nir.h"
30
31 #include "sb/sb_public.h"
32
33 #include "pipe/p_shader_tokens.h"
34 #include "tgsi/tgsi_info.h"
35 #include "tgsi/tgsi_parse.h"
36 #include "tgsi/tgsi_scan.h"
37 #include "tgsi/tgsi_dump.h"
38 #include "tgsi/tgsi_from_mesa.h"
39 #include "nir/tgsi_to_nir.h"
40 #include "nir/nir_to_tgsi_info.h"
41 #include "compiler/nir/nir.h"
42 #include "util/u_bitcast.h"
43 #include "util/u_memory.h"
44 #include "util/u_math.h"
45 #include <stdio.h>
46 #include <errno.h>
47
48 /* CAYMAN notes
49 Why CAYMAN got loops for lots of instructions is explained here.
50
51 -These 8xx t-slot only ops are implemented in all vector slots.
52 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
53 These 8xx t-slot only opcodes become vector ops, with all four
54 slots expecting the arguments on sources a and b. Result is
55 broadcast to all channels.
56 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
57 These 8xx t-slot only opcodes become vector ops in the z, y, and
58 x slots.
59 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
60 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
61 SQRT_IEEE/_64
62 SIN/COS
63 The w slot may have an independent co-issued operation, or if the
64 result is required to be in the w slot, the opcode above may be
65 issued in the w slot as well.
66 The compiler must issue the source argument to slots z, y, and x
67 */
68
69 /* Contents of r0 on entry to various shaders
70
71 VS - .x = VertexID
72 .y = RelVertexID (??)
73 .w = InstanceID
74
75 GS - r0.xyw, r1.xyz = per-vertex offsets
76 r0.z = PrimitiveID
77
78 TCS - .x = PatchID
79 .y = RelPatchID (??)
80 .z = InvocationID
81 .w = tess factor base.
82
83 TES - .x = TessCoord.x
84 - .y = TessCoord.y
85 - .z = RelPatchID (??)
86 - .w = PrimitiveID
87
88 PS - face_gpr.z = SampleMask
89 face_gpr.w = SampleID
90 */
/* Selector value for the buffer-info constants: 512 plus
 * R600_BUFFER_INFO_OFFSET divided by 16.
 * NOTE(review): presumably the division converts a byte offset into vec4
 * (16-byte) units - confirm against the definition of R600_BUFFER_INFO_OFFSET. */
91 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
/* Forward declaration of the TGSI translator; it is static, so its definition
 * lives elsewhere in this translation unit. r600_pipe_shader_create() calls it. */
92 static int r600_shader_from_tgsi(struct r600_context *rctx,
93 struct r600_pipe_shader *pipeshader,
94 union r600_shader_key key);
95
96 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
97 int size, unsigned comp_mask) {
98
99 if (!size)
100 return;
101
102 if (ps->num_arrays == ps->max_arrays) {
103 ps->max_arrays += 64;
104 ps->arrays = realloc(ps->arrays, ps->max_arrays *
105 sizeof(struct r600_shader_array));
106 }
107
108 int n = ps->num_arrays;
109 ++ps->num_arrays;
110
111 ps->arrays[n].comp_mask = comp_mask;
112 ps->arrays[n].gpr_start = start_gpr;
113 ps->arrays[n].gpr_count = size;
114 }
115
116 static void r600_dump_streamout(struct pipe_stream_output_info *so)
117 {
118 unsigned i;
119
120 fprintf(stderr, "STREAMOUT\n");
121 for (i = 0; i < so->num_outputs; i++) {
122 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
123 so->output[i].start_component;
124 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
125 i,
126 so->output[i].stream,
127 so->output[i].output_buffer,
128 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
129 so->output[i].register_index,
130 mask & 1 ? "x" : "",
131 mask & 2 ? "y" : "",
132 mask & 4 ? "z" : "",
133 mask & 8 ? "w" : "",
134 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
135 }
136 }
137
138 static int store_shader(struct pipe_context *ctx,
139 struct r600_pipe_shader *shader)
140 {
141 struct r600_context *rctx = (struct r600_context *)ctx;
142 uint32_t *ptr, i;
143
144 if (shader->bo == NULL) {
145 shader->bo = (struct r600_resource*)
146 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
147 if (shader->bo == NULL) {
148 return -ENOMEM;
149 }
150 ptr = r600_buffer_map_sync_with_rings(
151 &rctx->b, shader->bo,
152 PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY);
153 if (R600_BIG_ENDIAN) {
154 for (i = 0; i < shader->shader.bc.ndw; ++i) {
155 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
156 }
157 } else {
158 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
159 }
160 rctx->b.ws->buffer_unmap(shader->bo->buf);
161 }
162
163 return 0;
164 }
165
166 extern const struct nir_shader_compiler_options r600_nir_options;
167 static int nshader = 0;
168 int r600_pipe_shader_create(struct pipe_context *ctx,
169 struct r600_pipe_shader *shader,
170 union r600_shader_key key)
171 {
172 struct r600_context *rctx = (struct r600_context *)ctx;
173 struct r600_pipe_shader_selector *sel = shader->selector;
174 int r;
175 struct r600_screen *rscreen = (struct r600_screen *)ctx->screen;
176
177 int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
178 tgsi_get_processor_type(sel->tokens):
179 pipe_shader_type_from_mesa(sel->nir->info.stage);
180
181 bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
182 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB) &&
183 !(rscreen->b.debug_flags & DBG_NIR);
184 unsigned sb_disasm;
185 unsigned export_shader;
186
187 shader->shader.bc.isa = rctx->isa;
188
189 if (!(rscreen->b.debug_flags & DBG_NIR)) {
190 assert(sel->ir_type == PIPE_SHADER_IR_TGSI);
191 r = r600_shader_from_tgsi(rctx, shader, key);
192 if (r) {
193 R600_ERR("translation from TGSI failed !\n");
194 goto error;
195 }
196 } else {
197 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
198 sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
199 /* Lower int64 ops because we have some r600 build-in shaders that use it */
200 if (!ctx->screen->get_param(ctx->screen, PIPE_CAP_DOUBLES)) {
201 NIR_PASS_V(sel->nir, nir_lower_regs_to_ssa);
202 NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, NULL, NULL);
203 NIR_PASS_V(sel->nir, nir_lower_int64);
204 NIR_PASS_V(sel->nir, nir_opt_vectorize);
205 }
206 NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false, false);
207 }
208 nir_tgsi_scan_shader(sel->nir, &sel->info, true);
209
210 r = r600_shader_from_nir(rctx, shader, &key);
211 if (r) {
212 fprintf(stderr, "--Failed shader--------------------------------------------------\n");
213
214 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
215 fprintf(stderr, "--TGSI--------------------------------------------------------\n");
216 tgsi_dump(sel->tokens, 0);
217 }
218
219 if (rscreen->b.debug_flags & DBG_NIR) {
220 fprintf(stderr, "--NIR --------------------------------------------------------\n");
221 nir_print_shader(sel->nir, stderr);
222 }
223
224 R600_ERR("translation from NIR failed !\n");
225 goto error;
226 }
227 }
228
229 if (dump) {
230 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
231 fprintf(stderr, "--TGSI--------------------------------------------------------\n");
232 tgsi_dump(sel->tokens, 0);
233 }
234
235 if (sel->so.num_outputs) {
236 r600_dump_streamout(&sel->so);
237 }
238 }
239
240 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
241 /* only disable for vertex shaders in tess paths */
242 if (key.vs.as_ls)
243 use_sb = 0;
244 }
245 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
246 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
247 use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);
248
249 /* disable SB for shaders using doubles */
250 use_sb &= !shader->shader.uses_doubles;
251
252 use_sb &= !shader->shader.uses_atomics;
253 use_sb &= !shader->shader.uses_images;
254 use_sb &= !shader->shader.uses_helper_invocation;
255
256 /* Check if the bytecode has already been built. */
257 if (!shader->shader.bc.bytecode) {
258 r = r600_bytecode_build(&shader->shader.bc);
259 if (r) {
260 R600_ERR("building bytecode failed !\n");
261 goto error;
262 }
263 }
264
265 sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
266 if (dump && !sb_disasm) {
267 fprintf(stderr, "--------------------------------------------------------------\n");
268 r600_bytecode_disasm(&shader->shader.bc);
269 fprintf(stderr, "______________________________________________________________\n");
270 } else if ((dump && sb_disasm) || use_sb) {
271 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
272 dump, use_sb);
273 if (r) {
274 R600_ERR("r600_sb_bytecode_process failed !\n");
275 goto error;
276 }
277 }
278
279 if (dump) {
280 FILE *f;
281 char fname[1024];
282 snprintf(fname, 1024, "shader_from_%s_%d.cpp",
283 (sel->ir_type == PIPE_SHADER_IR_TGSI ?
284 (rscreen->b.debug_flags & DBG_NIR ? "tgsi-nir" : "tgsi")
285 : "nir"), nshader);
286 f = fopen(fname, "w");
287 print_shader_info(f, nshader++, &shader->shader);
288 print_shader_info(stderr, nshader++, &shader->shader);
289 print_pipe_info(stderr, &sel->info);
290 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
291 fprintf(f, "/****TGSI**********************************\n");
292 tgsi_dump_to_file(sel->tokens, 0, f);
293 }
294
295 if (rscreen->b.debug_flags & DBG_NIR){
296 fprintf(f, "/****NIR **********************************\n");
297 nir_print_shader(sel->nir, f);
298 }
299 fprintf(f, "******************************************/\n");
300 fclose(f);
301 }
302
303 if (shader->gs_copy_shader) {
304 if (dump) {
305 // dump copy shader
306 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
307 &shader->gs_copy_shader->shader, dump, 0);
308 if (r)
309 goto error;
310 }
311
312 if ((r = store_shader(ctx, shader->gs_copy_shader)))
313 goto error;
314 }
315
316 /* Store the shader in a buffer. */
317 if ((r = store_shader(ctx, shader)))
318 goto error;
319
320 /* Build state. */
321 switch (shader->shader.processor_type) {
322 case PIPE_SHADER_TESS_CTRL:
323 evergreen_update_hs_state(ctx, shader);
324 break;
325 case PIPE_SHADER_TESS_EVAL:
326 if (key.tes.as_es)
327 evergreen_update_es_state(ctx, shader);
328 else
329 evergreen_update_vs_state(ctx, shader);
330 break;
331 case PIPE_SHADER_GEOMETRY:
332 if (rctx->b.chip_class >= EVERGREEN) {
333 evergreen_update_gs_state(ctx, shader);
334 evergreen_update_vs_state(ctx, shader->gs_copy_shader);
335 } else {
336 r600_update_gs_state(ctx, shader);
337 r600_update_vs_state(ctx, shader->gs_copy_shader);
338 }
339 break;
340 case PIPE_SHADER_VERTEX:
341 export_shader = key.vs.as_es;
342 if (rctx->b.chip_class >= EVERGREEN) {
343 if (key.vs.as_ls)
344 evergreen_update_ls_state(ctx, shader);
345 else if (key.vs.as_es)
346 evergreen_update_es_state(ctx, shader);
347 else
348 evergreen_update_vs_state(ctx, shader);
349 } else {
350 if (export_shader)
351 r600_update_es_state(ctx, shader);
352 else
353 r600_update_vs_state(ctx, shader);
354 }
355 break;
356 case PIPE_SHADER_FRAGMENT:
357 if (rctx->b.chip_class >= EVERGREEN) {
358 evergreen_update_ps_state(ctx, shader);
359 } else {
360 r600_update_ps_state(ctx, shader);
361 }
362 break;
363 case PIPE_SHADER_COMPUTE:
364 evergreen_update_ls_state(ctx, shader);
365 break;
366 default:
367 r = -EINVAL;
368 goto error;
369 }
370 return 0;
371
372 error:
373 r600_pipe_shader_destroy(ctx, shader);
374 return r;
375 }
376
377 void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
378 {
379 r600_resource_reference(&shader->bo, NULL);
380 if (shader->shader.bc.cf.next)
381 r600_bytecode_clear(&shader->shader.bc);
382 r600_release_command_buffer(&shader->command_buffer);
383 }
384
385 /*
386 * tgsi -> r600 shader
387 */
/* Forward declaration; the full definition follows struct r600_shader_ctx. */
388 struct r600_shader_tgsi_instruction;
389
/* One decoded source operand of the instruction currently being translated
 * (struct r600_shader_ctx keeps an array of four of these in 'src').
 * NOTE(review): the per-field notes below are inferred from the field names;
 * the code filling this struct is outside this excerpt - confirm there. */
390 struct r600_shader_src {
391 unsigned sel; /* presumably the register/constant selector */
392 unsigned swizzle[4]; /* presumably one source channel per destination channel */
393 unsigned neg; /* presumably: negate modifier */
394 unsigned abs; /* presumably: absolute-value modifier */
395 unsigned rel; /* presumably: relative (indirect) addressing */
396 unsigned kc_bank; /* presumably the constant cache bank */
397 boolean kc_rel; /* true if cache bank is indexed */
398 uint32_t value[4]; /* presumably literal values, one per channel */
399 };
400
/* State of one evergreen interpolator slot; struct r600_shader_ctx holds six
 * of them in eg_interpolators[]. */
401 struct eg_interp {
402 boolean enabled; /* NOTE(review): presumably set when some input uses this slot - confirm */
403 unsigned ij_index; /* copied into input[].ij_index by evergreen_interp_assign_ij_index() */
404 };
405
/* Working state of the TGSI -> r600 translator; a pointer to it is passed to
 * every helper in this file. Only fields whose use is visible in this part of
 * the file are annotated. */
406 struct r600_shader_ctx {
407 struct tgsi_shader_info info;
/* indexed by temp array number; the range.First/range.Last members are read
 * by the spill helpers */
408 struct tgsi_array_info *array_infos;
409 /* flag for each tgsi temp array if its been spilled or not */
410 bool *spilled_arrays;
411 struct tgsi_parse_context parse;
412 const struct tgsi_token *tokens;
/* compared against PIPE_SHADER_* values */
413 unsigned type;
/* per TGSI file: offset added to a TGSI register index to obtain the GPR */
414 unsigned file_offset[TGSI_FILE_COUNT];
/* base register for r600_get_temp(); see max_driver_temp_used */
415 unsigned temp_reg;
416 const struct r600_shader_tgsi_instruction *inst_info;
417 struct r600_bytecode *bc;
418 struct r600_shader *shader;
419 struct r600_shader_src src[4];
420 uint32_t *literals;
421 uint32_t nliterals;
/* number of temps handed out by r600_get_temp() so far */
422 uint32_t max_driver_temp_used;
423 /* needed for evergreen interpolation */
424 struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
425 /* evergreen/cayman also store sample mask in face register */
426 int face_gpr;
427 /* sample id is .w component stored in fixed point position register */
428 int fixed_pt_position_gpr;
/* incremented for every TGSI_SEMANTIC_COLOR fragment shader input */
429 int colors_used;
430 boolean clip_vertex_write;
431 unsigned cv_output;
432 unsigned edgeflag_output;
433 int helper_invoc_reg;
434 int cs_block_size_reg;
435 int cs_grid_size_reg;
436 bool cs_block_size_loaded, cs_grid_size_loaded;
/* input index of the TGSI_SEMANTIC_POSITION fragment shader input */
437 int fragcoord_input;
438 int next_ring_offset;
439 int gs_out_ring_offset;
440 int gs_next_vertex;
441 struct r600_shader *gs_for_vs;
442 int gs_export_gpr_tregs[4];
443 int gs_rotated_input[2];
444 const struct pipe_stream_output_info *gs_stream_output_info;
445 unsigned enabled_stream_buffers_mask;
446 unsigned tess_input_info; /* temp with tess input offsets */
447 unsigned tess_output_info; /* temp with tess output offsets - NOTE(review): comment originally said "input", likely a copy/paste slip; confirm */
448 unsigned thread_id_gpr; /* temp with thread id calculated for images */
449 };
450
/* Entry of the instruction tables (r600_/eg_/cm_shader_tgsi_instruction[]);
 * the entry for the current instruction is reachable through ctx->inst_info. */
451 struct r600_shader_tgsi_instruction {
452 unsigned op; /* ALU opcode; tgsi_barrier() copies it into the emitted ALU instruction */
453 int (*process)(struct r600_shader_ctx *ctx); /* handler; NOTE(review): presumably returns 0 or a negative errno like the other helpers - confirm */
454 };
455
/* Forward declarations of static helpers and of the per-chip instruction
 * tables; their definitions are outside this part of the file. */
456 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
/* one instruction table per chip generation (r600, evergreen, cayman by name) */
457 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
458 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
459 static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
460 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
461 static int tgsi_else(struct r600_shader_ctx *ctx);
462 static int tgsi_endif(struct r600_shader_ctx *ctx);
463 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
464 static int tgsi_endloop(struct r600_shader_ctx *ctx);
465 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
466 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
467 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
468 unsigned int dst_reg);
469 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
470 const struct r600_shader_src *shader_src,
471 unsigned chan);
472 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
473 unsigned dst_reg, unsigned mask);
474
475 static bool ctx_needs_stack_workaround_8xx(struct r600_shader_ctx *ctx)
476 {
477 if (ctx->bc->family == CHIP_HEMLOCK ||
478 ctx->bc->family == CHIP_CYPRESS ||
479 ctx->bc->family == CHIP_JUNIPER)
480 return false;
481 return true;
482 }
483
/* Return the index (0..3) of the highest channel enabled in the low four
 * bits of 'writemask'; returns 0 when none of those bits is set. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* scan downwards; falling out of the loop leaves chan == 0 */
	for (chan = 3; chan > 0; chan--) {
		if ((writemask >> chan) & 1u)
			break;
	}
	return chan;
}
495
496 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
497 {
498 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
499 unsigned j;
500
501 if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
502 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
503 return -EINVAL;
504 }
505 #if 0
506 if (i->Instruction.Label) {
507 R600_ERR("label unsupported\n");
508 return -EINVAL;
509 }
510 #endif
511 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
512 if (i->Src[j].Register.Dimension) {
513 switch (i->Src[j].Register.File) {
514 case TGSI_FILE_CONSTANT:
515 case TGSI_FILE_HW_ATOMIC:
516 break;
517 case TGSI_FILE_INPUT:
518 if (ctx->type == PIPE_SHADER_GEOMETRY ||
519 ctx->type == PIPE_SHADER_TESS_CTRL ||
520 ctx->type == PIPE_SHADER_TESS_EVAL)
521 break;
522 /* fallthrough */
523 case TGSI_FILE_OUTPUT:
524 if (ctx->type == PIPE_SHADER_TESS_CTRL)
525 break;
526 /* fallthrough */
527 default:
528 R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
529 i->Src[j].Register.File,
530 i->Src[j].Register.Dimension);
531 return -EINVAL;
532 }
533 }
534 }
535 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
536 if (i->Dst[j].Register.Dimension) {
537 if (ctx->type == PIPE_SHADER_TESS_CTRL)
538 continue;
539 R600_ERR("unsupported dst (dimension)\n");
540 return -EINVAL;
541 }
542 }
543 return 0;
544 }
545
546 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
547 {
548 if (interpolate == TGSI_INTERPOLATE_COLOR ||
549 interpolate == TGSI_INTERPOLATE_LINEAR ||
550 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
551 {
552 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
553 int loc;
554
555 switch(location) {
556 case TGSI_INTERPOLATE_LOC_CENTER:
557 loc = 1;
558 break;
559 case TGSI_INTERPOLATE_LOC_CENTROID:
560 loc = 2;
561 break;
562 case TGSI_INTERPOLATE_LOC_SAMPLE:
563 default:
564 loc = 0; break;
565 }
566
567 return is_linear * 3 + loc;
568 }
569
570 return -1;
571 }
572
573 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
574 int input)
575 {
576 int i = eg_get_interpolator_index(
577 ctx->shader->input[input].interpolate,
578 ctx->shader->input[input].interpolate_location);
579 assert(i >= 0);
580 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
581 }
582
583 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
584 {
585 int i, r;
586 struct r600_bytecode_alu alu;
587 int gpr = 0, base_chan = 0;
588 int ij_index = ctx->shader->input[input].ij_index;
589
590 /* work out gpr and base_chan from index */
591 gpr = ij_index / 2;
592 base_chan = (2 * (ij_index % 2)) + 1;
593
594 for (i = 0; i < 8; i++) {
595 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
596
597 if (i < 4)
598 alu.op = ALU_OP2_INTERP_ZW;
599 else
600 alu.op = ALU_OP2_INTERP_XY;
601
602 if ((i > 1) && (i < 6)) {
603 alu.dst.sel = ctx->shader->input[input].gpr;
604 alu.dst.write = 1;
605 }
606
607 alu.dst.chan = i % 4;
608
609 alu.src[0].sel = gpr;
610 alu.src[0].chan = (base_chan - (i % 2));
611
612 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
613
614 alu.bank_swizzle_force = SQ_ALU_VEC_210;
615 if ((i % 4) == 3)
616 alu.last = 1;
617 r = r600_bytecode_add_alu(ctx->bc, &alu);
618 if (r)
619 return r;
620 }
621 return 0;
622 }
623
624 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
625 {
626 int i, r;
627 struct r600_bytecode_alu alu;
628
629 for (i = 0; i < 4; i++) {
630 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
631
632 alu.op = ALU_OP1_INTERP_LOAD_P0;
633
634 alu.dst.sel = ctx->shader->input[input].gpr;
635 alu.dst.write = 1;
636
637 alu.dst.chan = i;
638
639 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
640 alu.src[0].chan = i;
641
642 if (i == 3)
643 alu.last = 1;
644 r = r600_bytecode_add_alu(ctx->bc, &alu);
645 if (r)
646 return r;
647 }
648 return 0;
649 }
650
651 /*
652 * Special export handling in shaders
653 *
654 * shader export ARRAY_BASE for EXPORT_POS:
655 * 60 is position
656 * 61 is misc vector
657 * 62, 63 are clip distance vectors
658 *
659 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
660 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
661 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
662 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
663 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
664 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
665 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
666 * exclusive from render target index)
667 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
668 *
669 *
670 * shader export ARRAY_BASE for EXPORT_PIXEL:
671 * 0-7 CB targets
672 * 61 computed Z vector
673 *
674 * The use of the values exported in the computed Z vector are controlled
675 * by DB_SHADER_CONTROL:
676 * Z_EXPORT_ENABLE - Z as a float in RED
677 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
678 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
679 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
680 * DB_SOURCE_FORMAT - export control restrictions
681 *
682 */
683
684
685 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
686 static int r600_spi_sid(struct r600_shader_io * io)
687 {
688 int index, name = io->name;
689
690 /* These params are handled differently, they don't need
691 * semantic indices, so we'll use 0 for them.
692 */
693 if (name == TGSI_SEMANTIC_POSITION ||
694 name == TGSI_SEMANTIC_PSIZE ||
695 name == TGSI_SEMANTIC_EDGEFLAG ||
696 name == TGSI_SEMANTIC_FACE ||
697 name == TGSI_SEMANTIC_SAMPLEMASK)
698 index = 0;
699 else {
700 if (name == TGSI_SEMANTIC_GENERIC) {
701 /* For generic params simply use sid from tgsi */
702 index = 9 + io->sid;
703 } else if (name == TGSI_SEMANTIC_TEXCOORD) {
704 index = io->sid;
705 } else {
706 /* For non-generic params - pack name and sid into 8 bits */
707 index = 0x80 | (name<<3) | (io->sid);
708 }
709
710 /* Make sure that all really used indices have nonzero value, so
711 * we can just compare it to 0 later instead of comparing the name
712 * with different values to detect special cases. */
713 index++;
714 }
715
716 return index;
717 };
718
719 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
720 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
721 {
722 switch (semantic_name) {
723 case TGSI_SEMANTIC_POSITION:
724 return 0;
725 case TGSI_SEMANTIC_PSIZE:
726 return 1;
727 case TGSI_SEMANTIC_CLIPDIST:
728 assert(index <= 1);
729 return 2 + index;
730 case TGSI_SEMANTIC_TEXCOORD:
731 return 4 + index;
732 case TGSI_SEMANTIC_GENERIC:
733 if (index <= 63-4)
734 return 4 + index;
735 else
736 /* same explanation as in the default statement,
737 * the only user hitting this is st/nine.
738 */
739 return 0;
740
741 /* patch indices are completely separate and thus start from 0 */
742 case TGSI_SEMANTIC_TESSOUTER:
743 return 0;
744 case TGSI_SEMANTIC_TESSINNER:
745 return 1;
746 case TGSI_SEMANTIC_PATCH:
747 return 2 + index;
748
749 default:
750 /* Don't fail here. The result of this function is only used
751 * for LS, TCS, TES, and GS, where legacy GL semantics can't
752 * occur, but this function is called for all vertex shaders
753 * before it's known whether LS will be compiled or not.
754 */
755 return 0;
756 }
757 }
758
759 /* turn input into interpolate on EG */
760 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
761 {
762 int r = 0;
763
764 if (ctx->shader->input[index].spi_sid) {
765 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
766 if (ctx->shader->input[index].interpolate > 0) {
767 evergreen_interp_assign_ij_index(ctx, index);
768 r = evergreen_interp_alu(ctx, index);
769 } else {
770 r = evergreen_interp_flat(ctx, index);
771 }
772 }
773 return r;
774 }
775
776 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
777 {
778 struct r600_bytecode_alu alu;
779 int i, r;
780 int gpr_front = ctx->shader->input[front].gpr;
781 int gpr_back = ctx->shader->input[back].gpr;
782
783 for (i = 0; i < 4; i++) {
784 memset(&alu, 0, sizeof(alu));
785 alu.op = ALU_OP3_CNDGT;
786 alu.is_op3 = 1;
787 alu.dst.write = 1;
788 alu.dst.sel = gpr_front;
789 alu.src[0].sel = ctx->face_gpr;
790 alu.src[1].sel = gpr_front;
791 alu.src[2].sel = gpr_back;
792
793 alu.dst.chan = i;
794 alu.src[1].chan = i;
795 alu.src[2].chan = i;
796 alu.last = (i==3);
797
798 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
799 return r;
800 }
801
802 return 0;
803 }
804
805 /* execute a single slot ALU calculation */
806 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
807 int dst_sel, int dst_chan,
808 int src0_sel, unsigned src0_chan_val,
809 int src1_sel, unsigned src1_chan_val)
810 {
811 struct r600_bytecode_alu alu;
812 int r, i;
813
814 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
815 for (i = 0; i < 4; i++) {
816 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
817 alu.op = op;
818 alu.src[0].sel = src0_sel;
819 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
820 alu.src[0].value = src0_chan_val;
821 else
822 alu.src[0].chan = src0_chan_val;
823 alu.src[1].sel = src1_sel;
824 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
825 alu.src[1].value = src1_chan_val;
826 else
827 alu.src[1].chan = src1_chan_val;
828 alu.dst.sel = dst_sel;
829 alu.dst.chan = i;
830 alu.dst.write = i == dst_chan;
831 alu.last = (i == 3);
832 r = r600_bytecode_add_alu(ctx->bc, &alu);
833 if (r)
834 return r;
835 }
836 return 0;
837 }
838
839 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
840 alu.op = op;
841 alu.src[0].sel = src0_sel;
842 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
843 alu.src[0].value = src0_chan_val;
844 else
845 alu.src[0].chan = src0_chan_val;
846 alu.src[1].sel = src1_sel;
847 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
848 alu.src[1].value = src1_chan_val;
849 else
850 alu.src[1].chan = src1_chan_val;
851 alu.dst.sel = dst_sel;
852 alu.dst.chan = dst_chan;
853 alu.dst.write = 1;
854 alu.last = 1;
855 r = r600_bytecode_add_alu(ctx->bc, &alu);
856 if (r)
857 return r;
858 return 0;
859 }
860
861 /* execute a single slot ALU calculation */
862 static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
863 int dst_sel, int dst_chan,
864 int src0_sel, unsigned src0_chan_val,
865 int src1_sel, unsigned src1_chan_val,
866 int src2_sel, unsigned src2_chan_val)
867 {
868 struct r600_bytecode_alu alu;
869 int r;
870
871 /* validate this for other ops */
872 assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
873 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
874 alu.op = op;
875 alu.src[0].sel = src0_sel;
876 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
877 alu.src[0].value = src0_chan_val;
878 else
879 alu.src[0].chan = src0_chan_val;
880 alu.src[1].sel = src1_sel;
881 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
882 alu.src[1].value = src1_chan_val;
883 else
884 alu.src[1].chan = src1_chan_val;
885 alu.src[2].sel = src2_sel;
886 if (src2_sel == V_SQ_ALU_SRC_LITERAL)
887 alu.src[2].value = src2_chan_val;
888 else
889 alu.src[2].chan = src2_chan_val;
890 alu.dst.sel = dst_sel;
891 alu.dst.chan = dst_chan;
892 alu.is_op3 = 1;
893 alu.last = 1;
894 r = r600_bytecode_add_alu(ctx->bc, &alu);
895 if (r)
896 return r;
897 return 0;
898 }
899
900 /* put it in temp_reg.x */
901 static int get_lds_offset0(struct r600_shader_ctx *ctx,
902 int rel_patch_chan,
903 int temp_reg, bool is_patch_var)
904 {
905 int r;
906
907 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
908 /* ADD
909 Dimension - patch0_offset (input_vals.z),
910 Non-dim - patch0_data_offset (input_vals.w)
911 */
912 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
913 temp_reg, 0,
914 ctx->tess_output_info, 0,
915 0, rel_patch_chan,
916 ctx->tess_output_info, is_patch_var ? 3 : 2);
917 if (r)
918 return r;
919 return 0;
920 }
921
922 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
923 {
924 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
925 }
926
927 static int r600_get_temp(struct r600_shader_ctx *ctx)
928 {
929 return ctx->temp_reg + ctx->max_driver_temp_used++;
930 }
931
932 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
933 {
934 int i;
935 i = ctx->shader->noutput++;
936 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
937 ctx->shader->output[i].sid = 0;
938 ctx->shader->output[i].gpr = 0;
939 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
940 ctx->shader->output[i].write_mask = 0x4;
941 ctx->shader->output[i].spi_sid = prim_id_sid;
942
943 return 0;
944 }
945
946 static int tgsi_barrier(struct r600_shader_ctx *ctx)
947 {
948 struct r600_bytecode_alu alu;
949 int r;
950
951 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
952 alu.op = ctx->inst_info->op;
953 alu.last = 1;
954
955 r = r600_bytecode_add_alu(ctx->bc, &alu);
956 if (r)
957 return r;
958 return 0;
959 }
960
961 static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
962 {
963 // pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
964 unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
965 unsigned narrays_left = n;
966 bool *spilled = ctx->spilled_arrays; // assumed calloc:ed
967
968 *scratch_space_needed = 0;
969 while (*regno > 124 && narrays_left) {
970 unsigned i;
971 unsigned largest = 0;
972 unsigned largest_index = 0;
973
974 for (i = 0; i < n; i++) {
975 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
976 if (!spilled[i] && size > largest) {
977 largest = size;
978 largest_index = i;
979 }
980 }
981
982 spilled[largest_index] = true;
983 *regno -= largest;
984 *scratch_space_needed += largest;
985
986 narrays_left --;
987 }
988
989 if (narrays_left == 0) {
990 ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
991 }
992 }
993
994 /* Take spilled temp arrays into account when translating tgsi register
995 * indexes into r600 gprs if spilled is false, or scratch array offset if
996 * spilled is true */
997 static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
998 {
999 unsigned i;
1000 unsigned spilled_size = 0;
1001
1002 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
1003 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
1004 if (ctx->spilled_arrays[i]) {
1005 /* vec4 index into spilled scratch memory */
1006 *spilled = true;
1007 return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
1008 }
1009 else {
1010 /* regular GPR array */
1011 *spilled = false;
1012 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
1013 }
1014 }
1015
1016 if (tgsi_reg_index < ctx->array_infos[i].range.First)
1017 break;
1018 if (ctx->spilled_arrays[i]) {
1019 spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
1020 }
1021 }
1022
1023 /* regular GPR index, minus the holes from spilled arrays */
1024 *spilled = false;
1025
1026 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
1027 }
1028
1029 /* look up spill area base offset and array size for a spilled temp array */
1030 static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
1031 unsigned *array_base, unsigned *array_size)
1032 {
1033 unsigned i;
1034 unsigned offset = 0;
1035
1036 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
1037 if (ctx->spilled_arrays[i]) {
1038 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
1039
1040 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
1041 *array_base = offset;
1042 *array_size = size - 1; /* hw counts from 1 */
1043
1044 return;
1045 }
1046
1047 offset += size;
1048 }
1049 }
1050 }
1051
1052 static int tgsi_declaration(struct r600_shader_ctx *ctx)
1053 {
1054 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
1055 int r, i, j, count = d->Range.Last - d->Range.First + 1;
1056
1057 switch (d->Declaration.File) {
1058 case TGSI_FILE_INPUT:
1059 for (j = 0; j < count; j++) {
1060 i = ctx->shader->ninput + j;
1061 assert(i < ARRAY_SIZE(ctx->shader->input));
1062 ctx->shader->input[i].name = d->Semantic.Name;
1063 ctx->shader->input[i].sid = d->Semantic.Index + j;
1064 ctx->shader->input[i].interpolate = d->Interp.Interpolate;
1065 ctx->shader->input[i].interpolate_location = d->Interp.Location;
1066 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
1067 if (ctx->type == PIPE_SHADER_FRAGMENT) {
1068 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
1069 switch (ctx->shader->input[i].name) {
1070 case TGSI_SEMANTIC_FACE:
1071 if (ctx->face_gpr != -1)
1072 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
1073 else
1074 ctx->face_gpr = ctx->shader->input[i].gpr;
1075 break;
1076 case TGSI_SEMANTIC_COLOR:
1077 ctx->colors_used++;
1078 break;
1079 case TGSI_SEMANTIC_POSITION:
1080 ctx->fragcoord_input = i;
1081 break;
1082 case TGSI_SEMANTIC_PRIMID:
1083 /* set this for now */
1084 ctx->shader->gs_prim_id_input = true;
1085 ctx->shader->ps_prim_id_input = i;
1086 break;
1087 }
1088 if (ctx->bc->chip_class >= EVERGREEN) {
1089 if ((r = evergreen_interp_input(ctx, i)))
1090 return r;
1091 }
1092 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
1093 /* FIXME probably skip inputs if they aren't passed in the ring */
1094 ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
1095 ctx->next_ring_offset += 16;
1096 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
1097 ctx->shader->gs_prim_id_input = true;
1098 }
1099 }
1100 ctx->shader->ninput += count;
1101 break;
1102 case TGSI_FILE_OUTPUT:
1103 for (j = 0; j < count; j++) {
1104 i = ctx->shader->noutput + j;
1105 assert(i < ARRAY_SIZE(ctx->shader->output));
1106 ctx->shader->output[i].name = d->Semantic.Name;
1107 ctx->shader->output[i].sid = d->Semantic.Index + j;
1108 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
1109 ctx->shader->output[i].interpolate = d->Interp.Interpolate;
1110 ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
1111 if (ctx->type == PIPE_SHADER_VERTEX ||
1112 ctx->type == PIPE_SHADER_GEOMETRY ||
1113 ctx->type == PIPE_SHADER_TESS_EVAL) {
1114 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
1115 switch (d->Semantic.Name) {
1116 case TGSI_SEMANTIC_CLIPDIST:
1117 break;
1118 case TGSI_SEMANTIC_PSIZE:
1119 ctx->shader->vs_out_misc_write = 1;
1120 ctx->shader->vs_out_point_size = 1;
1121 break;
1122 case TGSI_SEMANTIC_EDGEFLAG:
1123 ctx->shader->vs_out_misc_write = 1;
1124 ctx->shader->vs_out_edgeflag = 1;
1125 ctx->edgeflag_output = i;
1126 break;
1127 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1128 ctx->shader->vs_out_misc_write = 1;
1129 ctx->shader->vs_out_viewport = 1;
1130 break;
1131 case TGSI_SEMANTIC_LAYER:
1132 ctx->shader->vs_out_misc_write = 1;
1133 ctx->shader->vs_out_layer = 1;
1134 break;
1135 case TGSI_SEMANTIC_CLIPVERTEX:
1136 ctx->clip_vertex_write = TRUE;
1137 ctx->cv_output = i;
1138 break;
1139 }
1140 if (ctx->type == PIPE_SHADER_GEOMETRY) {
1141 ctx->gs_out_ring_offset += 16;
1142 }
1143 } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
1144 switch (d->Semantic.Name) {
1145 case TGSI_SEMANTIC_COLOR:
1146 ctx->shader->nr_ps_max_color_exports++;
1147 break;
1148 }
1149 }
1150 }
1151 ctx->shader->noutput += count;
1152 break;
1153 case TGSI_FILE_TEMPORARY:
1154 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
1155 if (d->Array.ArrayID) {
1156 bool spilled;
1157 unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
1158 d->Range.First,
1159 &spilled);
1160
1161 if (!spilled) {
1162 r600_add_gpr_array(ctx->shader, idx,
1163 d->Range.Last - d->Range.First + 1, 0x0F);
1164 }
1165 }
1166 }
1167 break;
1168
1169 case TGSI_FILE_CONSTANT:
1170 case TGSI_FILE_SAMPLER:
1171 case TGSI_FILE_SAMPLER_VIEW:
1172 case TGSI_FILE_ADDRESS:
1173 case TGSI_FILE_BUFFER:
1174 case TGSI_FILE_IMAGE:
1175 case TGSI_FILE_MEMORY:
1176 break;
1177
1178 case TGSI_FILE_HW_ATOMIC:
1179 i = ctx->shader->nhwatomic_ranges;
1180 ctx->shader->atomics[i].start = d->Range.First;
1181 ctx->shader->atomics[i].end = d->Range.Last;
1182 ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
1183 ctx->shader->atomics[i].array_id = d->Array.ArrayID;
1184 ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
1185 ctx->shader->nhwatomic_ranges++;
1186 ctx->shader->nhwatomic += count;
1187 break;
1188
1189 case TGSI_FILE_SYSTEM_VALUE:
1190 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
1191 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
1192 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
1193 break; /* Already handled from allocate_system_value_inputs */
1194 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
1195 break;
1196 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
1197 break;
1198 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
1199 break;
1200 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
1201 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
1202 int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
1203 int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
1204 unsigned temp_reg = r600_get_temp(ctx);
1205
1206 r = get_lds_offset0(ctx, 2, temp_reg, true);
1207 if (r)
1208 return r;
1209
1210 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1211 temp_reg, 0,
1212 temp_reg, 0,
1213 V_SQ_ALU_SRC_LITERAL, param * 16);
1214 if (r)
1215 return r;
1216
1217 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
1218 }
1219 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
1220 /* MOV r1.x, r0.x;
1221 MOV r1.y, r0.y;
1222 */
1223 for (i = 0; i < 2; i++) {
1224 struct r600_bytecode_alu alu;
1225 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1226 alu.op = ALU_OP1_MOV;
1227 alu.src[0].sel = 0;
1228 alu.src[0].chan = 0 + i;
1229 alu.dst.sel = 1;
1230 alu.dst.chan = 0 + i;
1231 alu.dst.write = 1;
1232 alu.last = (i == 1) ? 1 : 0;
1233 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1234 return r;
1235 }
1236 /* ADD r1.z, 1.0f, -r0.x */
1237 struct r600_bytecode_alu alu;
1238 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1239 alu.op = ALU_OP2_ADD;
1240 alu.src[0].sel = V_SQ_ALU_SRC_1;
1241 alu.src[1].sel = 1;
1242 alu.src[1].chan = 0;
1243 alu.src[1].neg = 1;
1244 alu.dst.sel = 1;
1245 alu.dst.chan = 2;
1246 alu.dst.write = 1;
1247 alu.last = 1;
1248 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1249 return r;
1250
1251 /* ADD r1.z, r1.z, -r1.y */
1252 alu.op = ALU_OP2_ADD;
1253 alu.src[0].sel = 1;
1254 alu.src[0].chan = 2;
1255 alu.src[1].sel = 1;
1256 alu.src[1].chan = 1;
1257 alu.src[1].neg = 1;
1258 alu.dst.sel = 1;
1259 alu.dst.chan = 2;
1260 alu.dst.write = 1;
1261 alu.last = 1;
1262 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1263 return r;
1264 break;
1265 }
1266 break;
1267 default:
1268 R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
1269 return -EINVAL;
1270 }
1271 return 0;
1272 }
1273
1274 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
1275 {
1276 struct tgsi_parse_context parse;
1277 struct {
1278 boolean enabled;
1279 int *reg;
1280 unsigned name, alternate_name;
1281 } inputs[2] = {
1282 { false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
1283
1284 { false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
1285 };
1286 int num_regs = 0;
1287 unsigned k, i;
1288
1289 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1290 return 0;
1291 }
1292
1293 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1294 while (!tgsi_parse_end_of_tokens(&parse)) {
1295 tgsi_parse_token(&parse);
1296
1297 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1298 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1299 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1300 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1301 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1302 {
1303 int interpolate, location, k;
1304
1305 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1306 location = TGSI_INTERPOLATE_LOC_CENTER;
1307 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1308 location = TGSI_INTERPOLATE_LOC_CENTER;
1309 /* Needs sample positions, currently those are always available */
1310 } else {
1311 location = TGSI_INTERPOLATE_LOC_CENTROID;
1312 }
1313
1314 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1315 k = eg_get_interpolator_index(interpolate, location);
1316 if (k >= 0)
1317 ctx->eg_interpolators[k].enabled = true;
1318 }
1319 } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
1320 struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
1321 if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1322 for (k = 0; k < ARRAY_SIZE(inputs); k++) {
1323 if (d->Semantic.Name == inputs[k].name ||
1324 d->Semantic.Name == inputs[k].alternate_name) {
1325 inputs[k].enabled = true;
1326 }
1327 }
1328 }
1329 }
1330 }
1331
1332 tgsi_parse_free(&parse);
1333
1334 if (ctx->info.reads_samplemask &&
1335 (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
1336 inputs[1].enabled = true;
1337 }
1338
1339 if (ctx->bc->chip_class >= EVERGREEN) {
1340 int num_baryc = 0;
1341 /* assign gpr to each interpolator according to priority */
1342 for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
1343 if (ctx->eg_interpolators[i].enabled) {
1344 ctx->eg_interpolators[i].ij_index = num_baryc;
1345 num_baryc++;
1346 }
1347 }
1348 num_baryc = (num_baryc + 1) >> 1;
1349 gpr_offset += num_baryc;
1350 }
1351
1352 for (i = 0; i < ARRAY_SIZE(inputs); i++) {
1353 boolean enabled = inputs[i].enabled;
1354 int *reg = inputs[i].reg;
1355 unsigned name = inputs[i].name;
1356
1357 if (enabled) {
1358 int gpr = gpr_offset + num_regs++;
1359 ctx->shader->nsys_inputs++;
1360
1361 // add to inputs, allocate a gpr
1362 k = ctx->shader->ninput++;
1363 ctx->shader->input[k].name = name;
1364 ctx->shader->input[k].sid = 0;
1365 ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
1366 ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
1367 *reg = ctx->shader->input[k].gpr = gpr;
1368 }
1369 }
1370
1371 return gpr_offset + num_regs;
1372 }
1373
1374 /*
1375 * for evergreen we need to scan the shader to find the number of GPRs we need to
1376 * reserve for interpolation and system values
1377 *
1378 * we need to know if we are going to emit any sample or centroid inputs
1379 * if perspective and linear are required
1380 */
1381 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1382 {
1383 unsigned i;
1384
1385 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1386
1387 /*
1388 * Could get this information from the shader info. But right now
1389 * we interpolate all declared inputs, whereas the shader info will
1390 * only contain the bits if the inputs are actually used, so it might
1391 * not be safe...
1392 */
1393 for (i = 0; i < ctx->info.num_inputs; i++) {
1394 int k;
1395 /* skip position/face/mask/sampleid */
1396 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1397 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1398 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1399 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1400 continue;
1401
1402 k = eg_get_interpolator_index(
1403 ctx->info.input_interpolate[i],
1404 ctx->info.input_interpolate_loc[i]);
1405 if (k >= 0)
1406 ctx->eg_interpolators[k].enabled = TRUE;
1407 }
1408
1409 /* XXX PULL MODEL and LINE STIPPLE */
1410
1411 return allocate_system_value_inputs(ctx, 0);
1412 }
1413
1414 /* sample_id_sel == NULL means fetch for current sample */
1415 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1416 {
1417 struct r600_bytecode_vtx vtx;
1418 int r, t1;
1419
1420 t1 = r600_get_temp(ctx);
1421
1422 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1423 vtx.op = FETCH_OP_VFETCH;
1424 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1425 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1426 if (sample_id == NULL) {
1427 assert(ctx->fixed_pt_position_gpr != -1);
1428
1429 vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1430 vtx.src_sel_x = 3;
1431 }
1432 else {
1433 struct r600_bytecode_alu alu;
1434
1435 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1436 alu.op = ALU_OP1_MOV;
1437 r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1438 alu.dst.sel = t1;
1439 alu.dst.write = 1;
1440 alu.last = 1;
1441 r = r600_bytecode_add_alu(ctx->bc, &alu);
1442 if (r)
1443 return r;
1444
1445 vtx.src_gpr = t1;
1446 vtx.src_sel_x = 0;
1447 }
1448 vtx.mega_fetch_count = 16;
1449 vtx.dst_gpr = t1;
1450 vtx.dst_sel_x = 0;
1451 vtx.dst_sel_y = 1;
1452 vtx.dst_sel_z = 2;
1453 vtx.dst_sel_w = 3;
1454 vtx.data_format = FMT_32_32_32_32_FLOAT;
1455 vtx.num_format_all = 2;
1456 vtx.format_comp_all = 1;
1457 vtx.use_const_fields = 0;
1458 vtx.offset = 0;
1459 vtx.endian = r600_endian_swap(32);
1460 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1461
1462 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1463 if (r)
1464 return r;
1465
1466 return t1;
1467 }
1468
1469 static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
1470 {
1471 int r;
1472 struct r600_bytecode_alu alu;
1473
1474 /* do a vtx fetch with wqm set on the vtx fetch */
1475 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1476 alu.op = ALU_OP1_MOV;
1477 alu.dst.sel = ctx->helper_invoc_reg;
1478 alu.dst.chan = 0;
1479 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1480 alu.src[0].value = 0xffffffff;
1481 alu.dst.write = 1;
1482 alu.last = 1;
1483 r = r600_bytecode_add_alu(ctx->bc, &alu);
1484 if (r)
1485 return r;
1486
1487 /* do a vtx fetch in VPM mode */
1488 struct r600_bytecode_vtx vtx;
1489 memset(&vtx, 0, sizeof(vtx));
1490 vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
1491 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1492 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1493 vtx.src_gpr = 0;
1494 vtx.mega_fetch_count = 16; /* no idea here really... */
1495 vtx.dst_gpr = ctx->helper_invoc_reg;
1496 vtx.dst_sel_x = 4;
1497 vtx.dst_sel_y = 7; /* SEL_Y */
1498 vtx.dst_sel_z = 7; /* SEL_Z */
1499 vtx.dst_sel_w = 7; /* SEL_W */
1500 vtx.data_format = FMT_32;
1501 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
1502 return r;
1503 ctx->bc->cf_last->vpm = 1;
1504 return 0;
1505 }
1506
1507 static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
1508 {
1509 int r;
1510 struct r600_bytecode_alu alu;
1511
1512 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1513 alu.op = ALU_OP1_MOV;
1514 alu.dst.sel = ctx->helper_invoc_reg;
1515 alu.dst.chan = 0;
1516 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1517 alu.src[0].value = 0xffffffff;
1518 alu.dst.write = 1;
1519 alu.last = 1;
1520 r = r600_bytecode_add_alu(ctx->bc, &alu);
1521 if (r)
1522 return r;
1523
1524 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1525 alu.op = ALU_OP1_MOV;
1526 alu.dst.sel = ctx->helper_invoc_reg;
1527 alu.dst.chan = 0;
1528 alu.src[0].sel = V_SQ_ALU_SRC_0;
1529 alu.dst.write = 1;
1530 alu.last = 1;
1531 r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
1532 if (r)
1533 return r;
1534
1535 return ctx->helper_invoc_reg;
1536 }
1537
1538 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1539 {
1540 struct r600_bytecode_vtx vtx;
1541 int r, t1;
1542
1543 if (ctx->cs_block_size_loaded)
1544 return ctx->cs_block_size_reg;
1545 if (ctx->cs_grid_size_loaded)
1546 return ctx->cs_grid_size_reg;
1547
1548 t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1549 struct r600_bytecode_alu alu;
1550 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1551 alu.op = ALU_OP1_MOV;
1552 alu.src[0].sel = V_SQ_ALU_SRC_0;
1553 alu.dst.sel = t1;
1554 alu.dst.write = 1;
1555 alu.last = 1;
1556 r = r600_bytecode_add_alu(ctx->bc, &alu);
1557 if (r)
1558 return r;
1559
1560 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1561 vtx.op = FETCH_OP_VFETCH;
1562 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1563 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1564 vtx.src_gpr = t1;
1565 vtx.src_sel_x = 0;
1566
1567 vtx.mega_fetch_count = 16;
1568 vtx.dst_gpr = t1;
1569 vtx.dst_sel_x = 0;
1570 vtx.dst_sel_y = 1;
1571 vtx.dst_sel_z = 2;
1572 vtx.dst_sel_w = 7;
1573 vtx.data_format = FMT_32_32_32_32;
1574 vtx.num_format_all = 1;
1575 vtx.format_comp_all = 0;
1576 vtx.use_const_fields = 0;
1577 vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1578 vtx.endian = r600_endian_swap(32);
1579 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1580
1581 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1582 if (r)
1583 return r;
1584
1585 if (load_block)
1586 ctx->cs_block_size_loaded = true;
1587 else
1588 ctx->cs_grid_size_loaded = true;
1589 return t1;
1590 }
1591
1592 static void tgsi_src(struct r600_shader_ctx *ctx,
1593 const struct tgsi_full_src_register *tgsi_src,
1594 struct r600_shader_src *r600_src)
1595 {
1596 memset(r600_src, 0, sizeof(*r600_src));
1597 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1598 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1599 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1600 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1601 r600_src->neg = tgsi_src->Register.Negate;
1602 r600_src->abs = tgsi_src->Register.Absolute;
1603
1604 if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
1605 bool spilled;
1606 unsigned idx;
1607
1608 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);
1609
1610 if (spilled) {
1611 int reg = r600_get_temp(ctx);
1612 int r;
1613
1614 r600_src->sel = reg;
1615
1616 if (ctx->bc->chip_class < R700) {
1617 struct r600_bytecode_output cf;
1618
1619 memset(&cf, 0, sizeof(struct r600_bytecode_output));
1620 cf.op = CF_OP_MEM_SCRATCH;
1621 cf.elem_size = 3;
1622 cf.gpr = reg;
1623 cf.comp_mask = 0xF;
1624 cf.swizzle_x = 0;
1625 cf.swizzle_y = 1;
1626 cf.swizzle_z = 2;
1627 cf.swizzle_w = 3;
1628 cf.burst_count = 1;
1629
1630 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
1631 &cf.array_base, &cf.array_size);
1632
1633 if (tgsi_src->Register.Indirect) {
1634 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
1635 cf.index_gpr = ctx->bc->ar_reg;
1636 }
1637 else {
1638 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
1639 cf.array_base += idx;
1640 cf.array_size = 0;
1641 }
1642
1643 r = r600_bytecode_add_output(ctx->bc, &cf);
1644 }
1645 else {
1646 struct r600_bytecode_vtx vtx;
1647
1648 if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
1649 r600_bytecode_need_wait_ack(ctx->bc, false);
1650 r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
1651 }
1652
1653 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1654 vtx.op = FETCH_OP_READ_SCRATCH;
1655 vtx.dst_gpr = reg;
1656 vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
1657 vtx.elem_size = 3;
1658 vtx.data_format = FMT_32_32_32_32;
1659 vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
1660 vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
1661 vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
1662 vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
1663 vtx.dst_sel_w = tgsi_src->Register.SwizzleW;
1664
1665 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
1666 &vtx.array_base, &vtx.array_size);
1667
1668 if (tgsi_src->Register.Indirect) {
1669 vtx.indexed = 1;
1670 vtx.src_gpr = ctx->bc->ar_reg;
1671 }
1672 else {
1673 vtx.array_base += idx;
1674 vtx.array_size = 0;
1675 }
1676
1677 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1678 }
1679
1680 if (r)
1681 return;
1682 }
1683 else {
1684 if (tgsi_src->Register.Indirect)
1685 r600_src->rel = V_SQ_REL_RELATIVE;
1686
1687 r600_src->sel = idx;
1688 }
1689
1690 return;
1691 }
1692
1693 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1694 int index;
1695 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1696 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1697 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1698
1699 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1700 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1701 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1702 return;
1703 }
1704 index = tgsi_src->Register.Index;
1705 r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1706 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1707 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1708 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1709 r600_src->swizzle[0] = 2; // Z value
1710 r600_src->swizzle[1] = 2;
1711 r600_src->swizzle[2] = 2;
1712 r600_src->swizzle[3] = 2;
1713 r600_src->sel = ctx->face_gpr;
1714 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1715 r600_src->swizzle[0] = 3; // W value
1716 r600_src->swizzle[1] = 3;
1717 r600_src->swizzle[2] = 3;
1718 r600_src->swizzle[3] = 3;
1719 r600_src->sel = ctx->fixed_pt_position_gpr;
1720 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1721 r600_src->swizzle[0] = 0;
1722 r600_src->swizzle[1] = 1;
1723 r600_src->swizzle[2] = 4;
1724 r600_src->swizzle[3] = 4;
1725 r600_src->sel = load_sample_position(ctx, NULL, -1);
1726 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1727 r600_src->swizzle[0] = 3;
1728 r600_src->swizzle[1] = 3;
1729 r600_src->swizzle[2] = 3;
1730 r600_src->swizzle[3] = 3;
1731 r600_src->sel = 0;
1732 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1733 r600_src->swizzle[0] = 0;
1734 r600_src->swizzle[1] = 0;
1735 r600_src->swizzle[2] = 0;
1736 r600_src->swizzle[3] = 0;
1737 r600_src->sel = 0;
1738 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
1739 r600_src->sel = 0;
1740 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
1741 r600_src->sel = 1;
1742 } else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1743 r600_src->swizzle[0] = 3;
1744 r600_src->swizzle[1] = 3;
1745 r600_src->swizzle[2] = 3;
1746 r600_src->swizzle[3] = 3;
1747 r600_src->sel = 1;
1748 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1749 r600_src->swizzle[0] = 2;
1750 r600_src->swizzle[1] = 2;
1751 r600_src->swizzle[2] = 2;
1752 r600_src->swizzle[3] = 2;
1753 r600_src->sel = 0;
1754 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
1755 r600_src->sel = 1;
1756 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
1757 r600_src->sel = 3;
1758 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
1759 r600_src->sel = 2;
1760 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
1761 r600_src->sel = ctx->tess_input_info;
1762 r600_src->swizzle[0] = 2;
1763 r600_src->swizzle[1] = 2;
1764 r600_src->swizzle[2] = 2;
1765 r600_src->swizzle[3] = 2;
1766 } else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1767 r600_src->sel = 0;
1768 r600_src->swizzle[0] = 0;
1769 r600_src->swizzle[1] = 0;
1770 r600_src->swizzle[2] = 0;
1771 r600_src->swizzle[3] = 0;
1772 } else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1773 r600_src->sel = 0;
1774 r600_src->swizzle[0] = 3;
1775 r600_src->swizzle[1] = 3;
1776 r600_src->swizzle[2] = 3;
1777 r600_src->swizzle[3] = 3;
1778 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
1779 r600_src->sel = load_block_grid_size(ctx, false);
1780 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
1781 r600_src->sel = load_block_grid_size(ctx, true);
1782 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
1783 r600_src->sel = ctx->helper_invoc_reg;
1784 r600_src->swizzle[0] = 0;
1785 r600_src->swizzle[1] = 0;
1786 r600_src->swizzle[2] = 0;
1787 r600_src->swizzle[3] = 0;
1788 }
1789 } else {
1790 if (tgsi_src->Register.Indirect)
1791 r600_src->rel = V_SQ_REL_RELATIVE;
1792 r600_src->sel = tgsi_src->Register.Index;
1793 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1794 }
1795 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1796 if (tgsi_src->Register.Dimension) {
1797 r600_src->kc_bank = tgsi_src->Dimension.Index;
1798 if (tgsi_src->Dimension.Indirect) {
1799 r600_src->kc_rel = 1;
1800 }
1801 }
1802 }
1803 }
1804
1805 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1806 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1807 unsigned int dst_reg)
1808 {
1809 struct r600_bytecode_vtx vtx;
1810 unsigned int ar_reg;
1811 int r;
1812
1813 if (offset) {
1814 struct r600_bytecode_alu alu;
1815
1816 memset(&alu, 0, sizeof(alu));
1817
1818 alu.op = ALU_OP2_ADD_INT;
1819 alu.src[0].sel = ctx->bc->ar_reg;
1820 alu.src[0].chan = ar_chan;
1821
1822 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1823 alu.src[1].value = offset;
1824
1825 alu.dst.sel = dst_reg;
1826 alu.dst.chan = ar_chan;
1827 alu.dst.write = 1;
1828 alu.last = 1;
1829
1830 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1831 return r;
1832
1833 ar_reg = dst_reg;
1834 } else {
1835 ar_reg = ctx->bc->ar_reg;
1836 }
1837
1838 memset(&vtx, 0, sizeof(vtx));
1839 vtx.buffer_id = cb_idx;
1840 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1841 vtx.src_gpr = ar_reg;
1842 vtx.src_sel_x = ar_chan;
1843 vtx.mega_fetch_count = 16;
1844 vtx.dst_gpr = dst_reg;
1845 vtx.dst_sel_x = 0; /* SEL_X */
1846 vtx.dst_sel_y = 1; /* SEL_Y */
1847 vtx.dst_sel_z = 2; /* SEL_Z */
1848 vtx.dst_sel_w = 3; /* SEL_W */
1849 vtx.data_format = FMT_32_32_32_32_FLOAT;
1850 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1851 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1852 vtx.endian = r600_endian_swap(32);
1853 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1854
1855 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1856 return r;
1857
1858 return 0;
1859 }
1860
1861 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1862 {
1863 struct r600_bytecode_vtx vtx;
1864 int r;
1865 unsigned index = src->Register.Index;
1866 unsigned vtx_id = src->Dimension.Index;
1867 int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
1868 int offset_chan = vtx_id % 3;
1869 int t2 = 0;
1870
1871 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1872 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1873
1874 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
1875 offset_chan = 3;
1876
1877 if (src->Dimension.Indirect || src->Register.Indirect)
1878 t2 = r600_get_temp(ctx);
1879
1880 if (src->Dimension.Indirect) {
1881 int treg[3];
1882 struct r600_bytecode_alu alu;
1883 int r, i;
1884 unsigned addr_reg;
1885 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1886 if (src->DimIndirect.Index > 0) {
1887 r = single_alu_op2(ctx, ALU_OP1_MOV,
1888 ctx->bc->ar_reg, 0,
1889 addr_reg, 0,
1890 0, 0);
1891 if (r)
1892 return r;
1893 }
1894 /*
1895 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1896 at least this is what fglrx seems to do. */
1897 for (i = 0; i < 3; i++) {
1898 treg[i] = r600_get_temp(ctx);
1899 }
1900 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1901
1902 for (i = 0; i < 3; i++) {
1903 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1904 alu.op = ALU_OP1_MOV;
1905 alu.src[0].sel = ctx->gs_rotated_input[0];
1906 alu.src[0].chan = i == 2 ? 3 : i;
1907 alu.dst.sel = treg[i];
1908 alu.dst.chan = 0;
1909 alu.dst.write = 1;
1910 alu.last = 1;
1911 r = r600_bytecode_add_alu(ctx->bc, &alu);
1912 if (r)
1913 return r;
1914 }
1915 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1916 alu.op = ALU_OP1_MOV;
1917 alu.src[0].sel = treg[0];
1918 alu.src[0].rel = 1;
1919 alu.dst.sel = t2;
1920 alu.dst.write = 1;
1921 alu.last = 1;
1922 r = r600_bytecode_add_alu(ctx->bc, &alu);
1923 if (r)
1924 return r;
1925 offset_reg = t2;
1926 offset_chan = 0;
1927 }
1928
1929 if (src->Register.Indirect) {
1930 int addr_reg;
1931 unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1932
1933 addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1934
1935 /* pull the value from index_reg */
1936 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1937 t2, 1,
1938 addr_reg, 0,
1939 V_SQ_ALU_SRC_LITERAL, first);
1940 if (r)
1941 return r;
1942 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1943 t2, 0,
1944 t2, 1,
1945 V_SQ_ALU_SRC_LITERAL, 4,
1946 offset_reg, offset_chan);
1947 if (r)
1948 return r;
1949 offset_reg = t2;
1950 offset_chan = 0;
1951 index = src->Register.Index - first;
1952 }
1953
1954 memset(&vtx, 0, sizeof(vtx));
1955 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1956 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1957 vtx.src_gpr = offset_reg;
1958 vtx.src_sel_x = offset_chan;
1959 vtx.offset = index * 16; /*bytes*/
1960 vtx.mega_fetch_count = 16;
1961 vtx.dst_gpr = dst_reg;
1962 vtx.dst_sel_x = 0; /* SEL_X */
1963 vtx.dst_sel_y = 1; /* SEL_Y */
1964 vtx.dst_sel_z = 2; /* SEL_Z */
1965 vtx.dst_sel_w = 3; /* SEL_W */
1966 if (ctx->bc->chip_class >= EVERGREEN) {
1967 vtx.use_const_fields = 1;
1968 } else {
1969 vtx.data_format = FMT_32_32_32_32_FLOAT;
1970 }
1971
1972 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1973 return r;
1974
1975 return 0;
1976 }
1977
1978 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1979 {
1980 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1981 unsigned i;
1982
1983 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1984 struct tgsi_full_src_register *src = &inst->Src[i];
1985
1986 if (src->Register.File == TGSI_FILE_INPUT) {
1987 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1988 /* primitive id is in R0.z */
1989 ctx->src[i].sel = 0;
1990 ctx->src[i].swizzle[0] = 2;
1991 }
1992 }
1993 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1994 int treg = r600_get_temp(ctx);
1995
1996 fetch_gs_input(ctx, src, treg);
1997 ctx->src[i].sel = treg;
1998 ctx->src[i].rel = 0;
1999 }
2000 }
2001 return 0;
2002 }
2003
2004
2005 /* Tessellation shaders pass outputs to the next shader using LDS.
2006 *
2007 * LS outputs = TCS(HS) inputs
2008 * TCS(HS) outputs = TES(DS) inputs
2009 *
2010 * The LDS layout is:
2011 * - TCS inputs for patch 0
2012 * - TCS inputs for patch 1
2013 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
2014 * - ...
2015 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
2016 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
2017 * - TCS outputs for patch 1
2018 * - Per-patch TCS outputs for patch 1
2019 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
2020 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
2021 * - ...
2022 *
2023 * All three shaders VS(LS), TCS, TES share the same LDS space.
2024 */
/* this will return with the byte address in temp_reg.x */
2026 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
2027 const struct tgsi_full_dst_register *dst,
2028 const struct tgsi_full_src_register *src,
2029 int stride_bytes_reg, int stride_bytes_chan)
2030 {
2031 struct tgsi_full_dst_register reg;
2032 ubyte *name, *index, *array_first;
2033 int r;
2034 int param;
2035 struct tgsi_shader_info *info = &ctx->info;
2036 /* Set the register description. The address computation is the same
2037 * for sources and destinations. */
2038 if (src) {
2039 reg.Register.File = src->Register.File;
2040 reg.Register.Index = src->Register.Index;
2041 reg.Register.Indirect = src->Register.Indirect;
2042 reg.Register.Dimension = src->Register.Dimension;
2043 reg.Indirect = src->Indirect;
2044 reg.Dimension = src->Dimension;
2045 reg.DimIndirect = src->DimIndirect;
2046 } else
2047 reg = *dst;
2048
2049 /* If the register is 2-dimensional (e.g. an array of vertices
2050 * in a primitive), calculate the base address of the vertex. */
2051 if (reg.Register.Dimension) {
2052 int sel, chan;
2053 if (reg.Dimension.Indirect) {
2054 unsigned addr_reg;
2055 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
2056
2057 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
2058 /* pull the value from index_reg */
2059 sel = addr_reg;
2060 chan = 0;
2061 } else {
2062 sel = V_SQ_ALU_SRC_LITERAL;
2063 chan = reg.Dimension.Index;
2064 }
2065
2066 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2067 temp_reg, 0,
2068 stride_bytes_reg, stride_bytes_chan,
2069 sel, chan,
2070 temp_reg, 0);
2071 if (r)
2072 return r;
2073 }
2074
2075 if (reg.Register.File == TGSI_FILE_INPUT) {
2076 name = info->input_semantic_name;
2077 index = info->input_semantic_index;
2078 array_first = info->input_array_first;
2079 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
2080 name = info->output_semantic_name;
2081 index = info->output_semantic_index;
2082 array_first = info->output_array_first;
2083 } else {
2084 assert(0);
2085 return -1;
2086 }
2087 if (reg.Register.Indirect) {
2088 int addr_reg;
2089 int first;
2090 /* Add the relative address of the element. */
2091 if (reg.Indirect.ArrayID)
2092 first = array_first[reg.Indirect.ArrayID];
2093 else
2094 first = reg.Register.Index;
2095
2096 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
2097
2098 /* pull the value from index_reg */
2099 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2100 temp_reg, 0,
2101 V_SQ_ALU_SRC_LITERAL, 16,
2102 addr_reg, 0,
2103 temp_reg, 0);
2104 if (r)
2105 return r;
2106
2107 param = r600_get_lds_unique_index(name[first],
2108 index[first]);
2109
2110 } else {
2111 param = r600_get_lds_unique_index(name[reg.Register.Index],
2112 index[reg.Register.Index]);
2113 }
2114
2115 /* add to base_addr - passed in temp_reg.x */
2116 if (param) {
2117 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2118 temp_reg, 0,
2119 temp_reg, 0,
2120 V_SQ_ALU_SRC_LITERAL, param * 16);
2121 if (r)
2122 return r;
2123
2124 }
2125 return 0;
2126 }
2127
2128 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
2129 unsigned dst_reg, unsigned mask)
2130 {
2131 struct r600_bytecode_alu alu;
2132 int r, i, lasti;
2133
2134 if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
2135 ctx->bc->force_add_cf = 1;
2136
2137 lasti = tgsi_last_instruction(mask);
2138 for (i = 1; i <= lasti; i++) {
2139 if (!(mask & (1 << i)))
2140 continue;
2141
2142 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2143 temp_reg, i,
2144 temp_reg, 0,
2145 V_SQ_ALU_SRC_LITERAL, 4 * i);
2146 if (r)
2147 return r;
2148 }
2149 for (i = 0; i <= lasti; i++) {
2150 if (!(mask & (1 << i)))
2151 continue;
2152
2153 /* emit an LDS_READ_RET */
2154 memset(&alu, 0, sizeof(alu));
2155 alu.op = LDS_OP1_LDS_READ_RET;
2156 alu.src[0].sel = temp_reg;
2157 alu.src[0].chan = i;
2158 alu.src[1].sel = V_SQ_ALU_SRC_0;
2159 alu.src[2].sel = V_SQ_ALU_SRC_0;
2160 alu.dst.chan = 0;
2161 alu.is_lds_idx_op = true;
2162 alu.last = 1;
2163 r = r600_bytecode_add_alu(ctx->bc, &alu);
2164 if (r)
2165 return r;
2166 }
2167 for (i = 0; i <= lasti; i++) {
2168 if (!(mask & (1 << i)))
2169 continue;
2170
2171 /* then read from LDS_OQ_A_POP */
2172 memset(&alu, 0, sizeof(alu));
2173
2174 alu.op = ALU_OP1_MOV;
2175 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
2176 alu.src[0].chan = 0;
2177 alu.dst.sel = dst_reg;
2178 alu.dst.chan = i;
2179 alu.dst.write = 1;
2180 alu.last = 1;
2181 r = r600_bytecode_add_alu(ctx->bc, &alu);
2182 if (r)
2183 return r;
2184 }
2185 return 0;
2186 }
2187
2188 static int fetch_mask(struct tgsi_src_register *reg)
2189 {
2190 int mask = 0;
2191 mask |= 1 << reg->SwizzleX;
2192 mask |= 1 << reg->SwizzleY;
2193 mask |= 1 << reg->SwizzleZ;
2194 mask |= 1 << reg->SwizzleW;
2195 return mask;
2196 }
2197
2198 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2199 {
2200 int r;
2201 unsigned temp_reg = r600_get_temp(ctx);
2202
2203 r = get_lds_offset0(ctx, 2, temp_reg,
2204 src->Register.Dimension ? false : true);
2205 if (r)
2206 return r;
2207
2208 /* the base address is now in temp.x */
2209 r = r600_get_byte_address(ctx, temp_reg,
2210 NULL, src, ctx->tess_output_info, 1);
2211 if (r)
2212 return r;
2213
2214 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2215 if (r)
2216 return r;
2217 return 0;
2218 }
2219
2220 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2221 {
2222 int r;
2223 unsigned temp_reg = r600_get_temp(ctx);
2224
2225 /* t.x = ips * r0.y */
2226 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2227 temp_reg, 0,
2228 ctx->tess_input_info, 0,
2229 0, 1);
2230
2231 if (r)
2232 return r;
2233
2234 /* the base address is now in temp.x */
2235 r = r600_get_byte_address(ctx, temp_reg,
2236 NULL, src, ctx->tess_input_info, 1);
2237 if (r)
2238 return r;
2239
2240 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2241 if (r)
2242 return r;
2243 return 0;
2244 }
2245
2246 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2247 {
2248 int r;
2249 unsigned temp_reg = r600_get_temp(ctx);
2250
2251 r = get_lds_offset0(ctx, 1, temp_reg,
2252 src->Register.Dimension ? false : true);
2253 if (r)
2254 return r;
2255 /* the base address is now in temp.x */
2256 r = r600_get_byte_address(ctx, temp_reg,
2257 NULL, src,
2258 ctx->tess_output_info, 1);
2259 if (r)
2260 return r;
2261
2262 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2263 if (r)
2264 return r;
2265 return 0;
2266 }
2267
2268 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2269 {
2270 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2271 unsigned i;
2272
2273 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2274 struct tgsi_full_src_register *src = &inst->Src[i];
2275
2276 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2277 int treg = r600_get_temp(ctx);
2278 fetch_tes_input(ctx, src, treg);
2279 ctx->src[i].sel = treg;
2280 ctx->src[i].rel = 0;
2281 }
2282 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2283 int treg = r600_get_temp(ctx);
2284 fetch_tcs_input(ctx, src, treg);
2285 ctx->src[i].sel = treg;
2286 ctx->src[i].rel = 0;
2287 }
2288 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2289 int treg = r600_get_temp(ctx);
2290 fetch_tcs_output(ctx, src, treg);
2291 ctx->src[i].sel = treg;
2292 ctx->src[i].rel = 0;
2293 }
2294 }
2295 return 0;
2296 }
2297
2298 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
2299 {
2300 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2301 struct r600_bytecode_alu alu;
2302 int i, j, k, nconst, r;
2303
2304 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
2305 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
2306 nconst++;
2307 }
2308 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
2309 }
2310 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
2311 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
2312 continue;
2313 }
2314
2315 if (ctx->src[i].rel) {
2316 int chan = inst->Src[i].Indirect.Swizzle;
2317 int treg = r600_get_temp(ctx);
2318 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
2319 return r;
2320
2321 ctx->src[i].kc_bank = 0;
2322 ctx->src[i].kc_rel = 0;
2323 ctx->src[i].sel = treg;
2324 ctx->src[i].rel = 0;
2325 j--;
2326 } else if (j > 0) {
2327 int treg = r600_get_temp(ctx);
2328 for (k = 0; k < 4; k++) {
2329 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2330 alu.op = ALU_OP1_MOV;
2331 alu.src[0].sel = ctx->src[i].sel;
2332 alu.src[0].chan = k;
2333 alu.src[0].rel = ctx->src[i].rel;
2334 alu.src[0].kc_bank = ctx->src[i].kc_bank;
2335 alu.src[0].kc_rel = ctx->src[i].kc_rel;
2336 alu.dst.sel = treg;
2337 alu.dst.chan = k;
2338 alu.dst.write = 1;
2339 if (k == 3)
2340 alu.last = 1;
2341 r = r600_bytecode_add_alu(ctx->bc, &alu);
2342 if (r)
2343 return r;
2344 }
2345 ctx->src[i].sel = treg;
2346 ctx->src[i].rel =0;
2347 j--;
2348 }
2349 }
2350 return 0;
2351 }
2352
2353 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
2354 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2355 {
2356 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2357 struct r600_bytecode_alu alu;
2358 int i, j, k, nliteral, r;
2359
2360 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2361 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2362 nliteral++;
2363 }
2364 }
2365 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2366 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2367 int treg = r600_get_temp(ctx);
2368 for (k = 0; k < 4; k++) {
2369 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2370 alu.op = ALU_OP1_MOV;
2371 alu.src[0].sel = ctx->src[i].sel;
2372 alu.src[0].chan = k;
2373 alu.src[0].value = ctx->src[i].value[k];
2374 alu.dst.sel = treg;
2375 alu.dst.chan = k;
2376 alu.dst.write = 1;
2377 if (k == 3)
2378 alu.last = 1;
2379 r = r600_bytecode_add_alu(ctx->bc, &alu);
2380 if (r)
2381 return r;
2382 }
2383 ctx->src[i].sel = treg;
2384 j--;
2385 }
2386 }
2387 return 0;
2388 }
2389
2390 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2391 {
2392 int i, r, count = ctx->shader->ninput;
2393
2394 for (i = 0; i < count; i++) {
2395 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2396 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2397 if (r)
2398 return r;
2399 }
2400 }
2401 return 0;
2402 }
2403
2404 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2405 int stream, unsigned *stream_item_size UNUSED)
2406 {
2407 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2408 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2409 int j, r;
2410 unsigned i;
2411
2412 /* Sanity checking. */
2413 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2414 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2415 r = -EINVAL;
2416 goto out_err;
2417 }
2418 for (i = 0; i < so->num_outputs; i++) {
2419 if (so->output[i].output_buffer >= 4) {
2420 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2421 so->output[i].output_buffer);
2422 r = -EINVAL;
2423 goto out_err;
2424 }
2425 }
2426
2427 /* Initialize locations where the outputs are stored. */
2428 for (i = 0; i < so->num_outputs; i++) {
2429
2430 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2431 start_comp[i] = so->output[i].start_component;
2432 /* Lower outputs with dst_offset < start_component.
2433 *
2434 * We can only output 4D vectors with a write mask, e.g. we can
2435 * only output the W component at offset 3, etc. If we want
2436 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2437 * to move it to X and output X. */
2438 if (so->output[i].dst_offset < so->output[i].start_component) {
2439 unsigned tmp = r600_get_temp(ctx);
2440
2441 for (j = 0; j < so->output[i].num_components; j++) {
2442 struct r600_bytecode_alu alu;
2443 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2444 alu.op = ALU_OP1_MOV;
2445 alu.src[0].sel = so_gpr[i];
2446 alu.src[0].chan = so->output[i].start_component + j;
2447
2448 alu.dst.sel = tmp;
2449 alu.dst.chan = j;
2450 alu.dst.write = 1;
2451 if (j == so->output[i].num_components - 1)
2452 alu.last = 1;
2453 r = r600_bytecode_add_alu(ctx->bc, &alu);
2454 if (r)
2455 return r;
2456 }
2457 start_comp[i] = 0;
2458 so_gpr[i] = tmp;
2459 }
2460 }
2461
2462 /* Write outputs to buffers. */
2463 for (i = 0; i < so->num_outputs; i++) {
2464 struct r600_bytecode_output output;
2465
2466 if (stream != -1 && stream != so->output[i].stream)
2467 continue;
2468
2469 memset(&output, 0, sizeof(struct r600_bytecode_output));
2470 output.gpr = so_gpr[i];
2471 output.elem_size = so->output[i].num_components - 1;
2472 if (output.elem_size == 2)
2473 output.elem_size = 3; // 3 not supported, write 4 with junk at end
2474 output.array_base = so->output[i].dst_offset - start_comp[i];
2475 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2476 output.burst_count = 1;
2477 /* array_size is an upper limit for the burst_count
2478 * with MEM_STREAM instructions */
2479 output.array_size = 0xFFF;
2480 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2481
2482 if (ctx->bc->chip_class >= EVERGREEN) {
2483 switch (so->output[i].output_buffer) {
2484 case 0:
2485 output.op = CF_OP_MEM_STREAM0_BUF0;
2486 break;
2487 case 1:
2488 output.op = CF_OP_MEM_STREAM0_BUF1;
2489 break;
2490 case 2:
2491 output.op = CF_OP_MEM_STREAM0_BUF2;
2492 break;
2493 case 3:
2494 output.op = CF_OP_MEM_STREAM0_BUF3;
2495 break;
2496 }
2497 output.op += so->output[i].stream * 4;
2498 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2499 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2500 } else {
2501 switch (so->output[i].output_buffer) {
2502 case 0:
2503 output.op = CF_OP_MEM_STREAM0;
2504 break;
2505 case 1:
2506 output.op = CF_OP_MEM_STREAM1;
2507 break;
2508 case 2:
2509 output.op = CF_OP_MEM_STREAM2;
2510 break;
2511 case 3:
2512 output.op = CF_OP_MEM_STREAM3;
2513 break;
2514 }
2515 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2516 }
2517 r = r600_bytecode_add_output(ctx->bc, &output);
2518 if (r)
2519 goto out_err;
2520 }
2521 return 0;
2522 out_err:
2523 return r;
2524 }
2525
2526 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2527 {
2528 struct r600_bytecode_alu alu;
2529 unsigned reg;
2530
2531 if (!ctx->shader->vs_out_edgeflag)
2532 return;
2533
2534 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2535
2536 /* clamp(x, 0, 1) */
2537 memset(&alu, 0, sizeof(alu));
2538 alu.op = ALU_OP1_MOV;
2539 alu.src[0].sel = reg;
2540 alu.dst.sel = reg;
2541 alu.dst.write = 1;
2542 alu.dst.clamp = 1;
2543 alu.last = 1;
2544 r600_bytecode_add_alu(ctx->bc, &alu);
2545
2546 memset(&alu, 0, sizeof(alu));
2547 alu.op = ALU_OP1_FLT_TO_INT;
2548 alu.src[0].sel = reg;
2549 alu.dst.sel = reg;
2550 alu.dst.write = 1;
2551 alu.last = 1;
2552 r600_bytecode_add_alu(ctx->bc, &alu);
2553 }
2554
2555 int generate_gs_copy_shader(struct r600_context *rctx,
2556 struct r600_pipe_shader *gs,
2557 struct pipe_stream_output_info *so)
2558 {
2559 struct r600_shader_ctx ctx = {};
2560 struct r600_shader *gs_shader = &gs->shader;
2561 struct r600_pipe_shader *cshader;
2562 unsigned ocnt = gs_shader->noutput;
2563 struct r600_bytecode_alu alu;
2564 struct r600_bytecode_vtx vtx;
2565 struct r600_bytecode_output output;
2566 struct r600_bytecode_cf *cf_jump, *cf_pop,
2567 *last_exp_pos = NULL, *last_exp_param = NULL;
2568 int next_clip_pos = 61, next_param = 0;
2569 unsigned i, j;
2570 int ring;
2571 bool only_ring_0 = true;
2572 cshader = calloc(1, sizeof(struct r600_pipe_shader));
2573 if (!cshader)
2574 return 0;
2575
2576 memcpy(cshader->shader.output, gs_shader->output, ocnt *
2577 sizeof(struct r600_shader_io));
2578
2579 cshader->shader.noutput = ocnt;
2580
2581 ctx.shader = &cshader->shader;
2582 ctx.bc = &ctx.shader->bc;
2583 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2584
2585 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2586 rctx->screen->has_compressed_msaa_texturing);
2587
2588 ctx.bc->isa = rctx->isa;
2589
2590 cf_jump = NULL;
2591 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2592
2593 /* R0.x = R0.x & 0x3fffffff */
2594 memset(&alu, 0, sizeof(alu));
2595 alu.op = ALU_OP2_AND_INT;
2596 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2597 alu.src[1].value = 0x3fffffff;
2598 alu.dst.write = 1;
2599 r600_bytecode_add_alu(ctx.bc, &alu);
2600
2601 /* R0.y = R0.x >> 30 */
2602 memset(&alu, 0, sizeof(alu));
2603 alu.op = ALU_OP2_LSHR_INT;
2604 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2605 alu.src[1].value = 0x1e;
2606 alu.dst.chan = 1;
2607 alu.dst.write = 1;
2608 alu.last = 1;
2609 r600_bytecode_add_alu(ctx.bc, &alu);
2610
2611 /* fetch vertex data from GSVS ring */
2612 for (i = 0; i < ocnt; ++i) {
2613 struct r600_shader_io *out = &ctx.shader->output[i];
2614
2615 out->gpr = i + 1;
2616 out->ring_offset = i * 16;
2617
2618 memset(&vtx, 0, sizeof(vtx));
2619 vtx.op = FETCH_OP_VFETCH;
2620 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2621 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2622 vtx.mega_fetch_count = 16;
2623 vtx.offset = out->ring_offset;
2624 vtx.dst_gpr = out->gpr;
2625 vtx.src_gpr = 0;
2626 vtx.dst_sel_x = 0;
2627 vtx.dst_sel_y = 1;
2628 vtx.dst_sel_z = 2;
2629 vtx.dst_sel_w = 3;
2630 if (rctx->b.chip_class >= EVERGREEN) {
2631 vtx.use_const_fields = 1;
2632 } else {
2633 vtx.data_format = FMT_32_32_32_32_FLOAT;
2634 }
2635
2636 r600_bytecode_add_vtx(ctx.bc, &vtx);
2637 }
2638 ctx.temp_reg = i + 1;
2639 for (ring = 3; ring >= 0; --ring) {
2640 bool enabled = false;
2641 for (i = 0; i < so->num_outputs; i++) {
2642 if (so->output[i].stream == ring) {
2643 enabled = true;
2644 if (ring > 0)
2645 only_ring_0 = false;
2646 break;
2647 }
2648 }
2649 if (ring != 0 && !enabled) {
2650 cshader->shader.ring_item_sizes[ring] = 0;
2651 continue;
2652 }
2653
2654 if (cf_jump) {
2655 // Patch up jump label
2656 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2657 cf_pop = ctx.bc->cf_last;
2658
2659 cf_jump->cf_addr = cf_pop->id + 2;
2660 cf_jump->pop_count = 1;
2661 cf_pop->cf_addr = cf_pop->id + 2;
2662 cf_pop->pop_count = 1;
2663 }
2664
2665 /* PRED_SETE_INT __, R0.y, ring */
2666 memset(&alu, 0, sizeof(alu));
2667 alu.op = ALU_OP2_PRED_SETE_INT;
2668 alu.src[0].chan = 1;
2669 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2670 alu.src[1].value = ring;
2671 alu.execute_mask = 1;
2672 alu.update_pred = 1;
2673 alu.last = 1;
2674 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2675
2676 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2677 cf_jump = ctx.bc->cf_last;
2678
2679 if (enabled)
2680 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2681 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2682 }
2683
2684 /* bc adds nops - copy it */
2685 if (ctx.bc->chip_class == R600) {
2686 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2687 alu.op = ALU_OP0_NOP;
2688 alu.last = 1;
2689 r600_bytecode_add_alu(ctx.bc, &alu);
2690
2691 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2692 }
2693
2694 /* export vertex data */
2695 /* XXX factor out common code with r600_shader_from_tgsi ? */
2696 for (i = 0; i < ocnt; ++i) {
2697 struct r600_shader_io *out = &ctx.shader->output[i];
2698 bool instream0 = true;
2699 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2700 continue;
2701
2702 for (j = 0; j < so->num_outputs; j++) {
2703 if (so->output[j].register_index == i) {
2704 if (so->output[j].stream == 0)
2705 break;
2706 if (so->output[j].stream > 0)
2707 instream0 = false;
2708 }
2709 }
2710 if (!instream0)
2711 continue;
2712 memset(&output, 0, sizeof(output));
2713 output.gpr = out->gpr;
2714 output.elem_size = 3;
2715 output.swizzle_x = 0;
2716 output.swizzle_y = 1;
2717 output.swizzle_z = 2;
2718 output.swizzle_w = 3;
2719 output.burst_count = 1;
2720 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2721 output.op = CF_OP_EXPORT;
2722 switch (out->name) {
2723 case TGSI_SEMANTIC_POSITION:
2724 output.array_base = 60;
2725 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2726 break;
2727
2728 case TGSI_SEMANTIC_PSIZE:
2729 output.array_base = 61;
2730 if (next_clip_pos == 61)
2731 next_clip_pos = 62;
2732 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2733 output.swizzle_y = 7;
2734 output.swizzle_z = 7;
2735 output.swizzle_w = 7;
2736 ctx.shader->vs_out_misc_write = 1;
2737 ctx.shader->vs_out_point_size = 1;
2738 break;
2739 case TGSI_SEMANTIC_LAYER:
2740 if (out->spi_sid) {
2741 /* duplicate it as PARAM to pass to the pixel shader */
2742 output.array_base = next_param++;
2743 r600_bytecode_add_output(ctx.bc, &output);
2744 last_exp_param = ctx.bc->cf_last;
2745 }
2746 output.array_base = 61;
2747 if (next_clip_pos == 61)
2748 next_clip_pos = 62;
2749 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2750 output.swizzle_x = 7;
2751 output.swizzle_y = 7;
2752 output.swizzle_z = 0;
2753 output.swizzle_w = 7;
2754 ctx.shader->vs_out_misc_write = 1;
2755 ctx.shader->vs_out_layer = 1;
2756 break;
2757 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2758 if (out->spi_sid) {
2759 /* duplicate it as PARAM to pass to the pixel shader */
2760 output.array_base = next_param++;
2761 r600_bytecode_add_output(ctx.bc, &output);
2762 last_exp_param = ctx.bc->cf_last;
2763 }
2764 output.array_base = 61;
2765 if (next_clip_pos == 61)
2766 next_clip_pos = 62;
2767 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2768 ctx.shader->vs_out_misc_write = 1;
2769 ctx.shader->vs_out_viewport = 1;
2770 output.swizzle_x = 7;
2771 output.swizzle_y = 7;
2772 output.swizzle_z = 7;
2773 output.swizzle_w = 0;
2774 break;
2775 case TGSI_SEMANTIC_CLIPDIST:
2776 /* spi_sid is 0 for clipdistance outputs that were generated
2777 * for clipvertex - we don't need to pass them to PS */
2778 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2779 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2780 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2781 if (out->spi_sid) {
2782 /* duplicate it as PARAM to pass to the pixel shader */
2783 output.array_base = next_param++;
2784 r600_bytecode_add_output(ctx.bc, &output);
2785 last_exp_param = ctx.bc->cf_last;
2786 }
2787 output.array_base = next_clip_pos++;
2788 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2789 break;
2790 case TGSI_SEMANTIC_FOG:
2791 output.swizzle_y = 4; /* 0 */
2792 output.swizzle_z = 4; /* 0 */
2793 output.swizzle_w = 5; /* 1 */
2794 break;
2795 default:
2796 output.array_base = next_param++;
2797 break;
2798 }
2799 r600_bytecode_add_output(ctx.bc, &output);
2800 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2801 last_exp_param = ctx.bc->cf_last;
2802 else
2803 last_exp_pos = ctx.bc->cf_last;
2804 }
2805
2806 if (!last_exp_pos) {
2807 memset(&output, 0, sizeof(output));
2808 output.gpr = 0;
2809 output.elem_size = 3;
2810 output.swizzle_x = 7;
2811 output.swizzle_y = 7;
2812 output.swizzle_z = 7;
2813 output.swizzle_w = 7;
2814 output.burst_count = 1;
2815 output.type = 2;
2816 output.op = CF_OP_EXPORT;
2817 output.array_base = 60;
2818 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2819 r600_bytecode_add_output(ctx.bc, &output);
2820 last_exp_pos = ctx.bc->cf_last;
2821 }
2822
2823 if (!last_exp_param) {
2824 memset(&output, 0, sizeof(output));
2825 output.gpr = 0;
2826 output.elem_size = 3;
2827 output.swizzle_x = 7;
2828 output.swizzle_y = 7;
2829 output.swizzle_z = 7;
2830 output.swizzle_w = 7;
2831 output.burst_count = 1;
2832 output.type = 2;
2833 output.op = CF_OP_EXPORT;
2834 output.array_base = next_param++;
2835 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2836 r600_bytecode_add_output(ctx.bc, &output);
2837 last_exp_param = ctx.bc->cf_last;
2838 }
2839
2840 last_exp_pos->op = CF_OP_EXPORT_DONE;
2841 last_exp_param->op = CF_OP_EXPORT_DONE;
2842
2843 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2844 cf_pop = ctx.bc->cf_last;
2845
2846 cf_jump->cf_addr = cf_pop->id + 2;
2847 cf_jump->pop_count = 1;
2848 cf_pop->cf_addr = cf_pop->id + 2;
2849 cf_pop->pop_count = 1;
2850
2851 if (ctx.bc->chip_class == CAYMAN)
2852 cm_bytecode_add_cf_end(ctx.bc);
2853 else {
2854 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2855 ctx.bc->cf_last->end_of_program = 1;
2856 }
2857
2858 gs->gs_copy_shader = cshader;
2859 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2860
2861 ctx.bc->nstack = 1;
2862
2863 return r600_bytecode_build(ctx.bc);
2864 }
2865
2866 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2867 {
2868 if (ind) {
2869 struct r600_bytecode_alu alu;
2870 int r;
2871
2872 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2873 alu.op = ALU_OP2_ADD_INT;
2874 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2875 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2876 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2877 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2878 alu.dst.write = 1;
2879 alu.last = 1;
2880 r = r600_bytecode_add_alu(ctx->bc, &alu);
2881 if (r)
2882 return r;
2883 }
2884 return 0;
2885 }
2886
2887 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
2888 {
2889 struct r600_bytecode_output output;
2890 int ring_offset;
2891 unsigned i, k;
2892 int effective_stream = stream == -1 ? 0 : stream;
2893 int idx = 0;
2894
2895 for (i = 0; i < ctx->shader->noutput; i++) {
2896 if (ctx->gs_for_vs) {
2897 /* for ES we need to lookup corresponding ring offset expected by GS
2898 * (map this output to GS input by name and sid) */
2899 /* FIXME precompute offsets */
2900 ring_offset = -1;
2901 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2902 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2903 struct r600_shader_io *out = &ctx->shader->output[i];
2904 if (in->name == out->name && in->sid == out->sid)
2905 ring_offset = in->ring_offset;
2906 }
2907
2908 if (ring_offset == -1)
2909 continue;
2910 } else {
2911 ring_offset = idx * 16;
2912 idx++;
2913 }
2914
2915 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2916 continue;
2917 /* next_ring_offset after parsing input decls contains total size of
2918 * single vertex data, gs_next_vertex - current vertex index */
2919 if (!ind)
2920 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2921
2922 memset(&output, 0, sizeof(struct r600_bytecode_output));
2923 output.gpr = ctx->shader->output[i].gpr;
2924 output.elem_size = 3;
2925 output.comp_mask = 0xF;
2926 output.burst_count = 1;
2927
2928 if (ind)
2929 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2930 else
2931 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2932
2933 switch (stream) {
2934 default:
2935 case 0:
2936 output.op = CF_OP_MEM_RING; break;
2937 case 1:
2938 output.op = CF_OP_MEM_RING1; break;
2939 case 2:
2940 output.op = CF_OP_MEM_RING2; break;
2941 case 3:
2942 output.op = CF_OP_MEM_RING3; break;
2943 }
2944
2945 if (ind) {
2946 output.array_base = ring_offset >> 2; /* in dwords */
2947 output.array_size = 0xfff;
2948 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2949 } else
2950 output.array_base = ring_offset >> 2; /* in dwords */
2951 r600_bytecode_add_output(ctx->bc, &output);
2952 }
2953
2954 ++ctx->gs_next_vertex;
2955 return 0;
2956 }
2957
2958
2959 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2960 {
2961 int r;
2962 struct r600_bytecode_vtx vtx;
2963 int temp_val = ctx->temp_reg;
2964 /* need to store the TCS output somewhere */
2965 r = single_alu_op2(ctx, ALU_OP1_MOV,
2966 temp_val, 0,
2967 V_SQ_ALU_SRC_LITERAL, 0,
2968 0, 0);
2969 if (r)
2970 return r;
2971
2972 /* used by VS/TCS */
2973 if (ctx->tess_input_info) {
2974 /* fetch tcs input values into resv space */
2975 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2976 vtx.op = FETCH_OP_VFETCH;
2977 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2978 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2979 vtx.mega_fetch_count = 16;
2980 vtx.data_format = FMT_32_32_32_32;
2981 vtx.num_format_all = 2;
2982 vtx.format_comp_all = 1;
2983 vtx.use_const_fields = 0;
2984 vtx.endian = r600_endian_swap(32);
2985 vtx.srf_mode_all = 1;
2986 vtx.offset = 0;
2987 vtx.dst_gpr = ctx->tess_input_info;
2988 vtx.dst_sel_x = 0;
2989 vtx.dst_sel_y = 1;
2990 vtx.dst_sel_z = 2;
2991 vtx.dst_sel_w = 3;
2992 vtx.src_gpr = temp_val;
2993 vtx.src_sel_x = 0;
2994
2995 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2996 if (r)
2997 return r;
2998 }
2999
3000 /* used by TCS/TES */
3001 if (ctx->tess_output_info) {
3002 /* fetch tcs output values into resv space */
3003 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
3004 vtx.op = FETCH_OP_VFETCH;
3005 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
3006 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
3007 vtx.mega_fetch_count = 16;
3008 vtx.data_format = FMT_32_32_32_32;
3009 vtx.num_format_all = 2;
3010 vtx.format_comp_all = 1;
3011 vtx.use_const_fields = 0;
3012 vtx.endian = r600_endian_swap(32);
3013 vtx.srf_mode_all = 1;
3014 vtx.offset = 16;
3015 vtx.dst_gpr = ctx->tess_output_info;
3016 vtx.dst_sel_x = 0;
3017 vtx.dst_sel_y = 1;
3018 vtx.dst_sel_z = 2;
3019 vtx.dst_sel_w = 3;
3020 vtx.src_gpr = temp_val;
3021 vtx.src_sel_x = 0;
3022
3023 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
3024 if (r)
3025 return r;
3026 }
3027 return 0;
3028 }
3029
3030 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
3031 {
3032 int j, r;
3033 int temp_reg;
3034 unsigned i;
3035
3036 /* fetch tcs input values into input_vals */
3037 ctx->tess_input_info = r600_get_temp(ctx);
3038 ctx->tess_output_info = 0;
3039 r = r600_fetch_tess_io_info(ctx);
3040 if (r)
3041 return r;
3042
3043 temp_reg = r600_get_temp(ctx);
3044 /* dst reg contains LDS address stride * idx */
3045 /* MUL vertexID, vertex_dw_stride */
3046 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
3047 temp_reg, 0,
3048 ctx->tess_input_info, 1,
3049 0, 1); /* rel id in r0.y? */
3050 if (r)
3051 return r;
3052
3053 for (i = 0; i < ctx->shader->noutput; i++) {
3054 struct r600_bytecode_alu alu;
3055 int param = r600_get_lds_unique_index(ctx->shader->output[i].name,
3056 ctx->shader->output[i].sid);
3057
3058 if (param) {
3059 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3060 temp_reg, 1,
3061 temp_reg, 0,
3062 V_SQ_ALU_SRC_LITERAL, param * 16);
3063 if (r)
3064 return r;
3065 }
3066
3067 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3068 temp_reg, 2,
3069 temp_reg, param ? 1 : 0,
3070 V_SQ_ALU_SRC_LITERAL, 8);
3071 if (r)
3072 return r;
3073
3074
3075 for (j = 0; j < 2; j++) {
3076 int chan = (j == 1) ? 2 : (param ? 1 : 0);
3077 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3078 alu.op = LDS_OP3_LDS_WRITE_REL;
3079 alu.src[0].sel = temp_reg;
3080 alu.src[0].chan = chan;
3081 alu.src[1].sel = ctx->shader->output[i].gpr;
3082 alu.src[1].chan = j * 2;
3083 alu.src[2].sel = ctx->shader->output[i].gpr;
3084 alu.src[2].chan = (j * 2) + 1;
3085 alu.last = 1;
3086 alu.dst.chan = 0;
3087 alu.lds_idx = 1;
3088 alu.is_lds_idx_op = true;
3089 r = r600_bytecode_add_alu(ctx->bc, &alu);
3090 if (r)
3091 return r;
3092 }
3093 }
3094 return 0;
3095 }
3096
3097 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
3098 {
3099 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3100 const struct tgsi_full_dst_register *dst = &inst->Dst[0];
3101 int i, r, lasti;
3102 int temp_reg = r600_get_temp(ctx);
3103 struct r600_bytecode_alu alu;
3104 unsigned write_mask = dst->Register.WriteMask;
3105
3106 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
3107 return 0;
3108
3109 r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
3110 if (r)
3111 return r;
3112
3113 /* the base address is now in temp.x */
3114 r = r600_get_byte_address(ctx, temp_reg,
3115 &inst->Dst[0], NULL, ctx->tess_output_info, 1);
3116 if (r)
3117 return r;
3118
3119 /* LDS write */
3120 lasti = tgsi_last_instruction(write_mask);
3121 for (i = 1; i <= lasti; i++) {
3122
3123 if (!(write_mask & (1 << i)))
3124 continue;
3125 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3126 temp_reg, i,
3127 temp_reg, 0,
3128 V_SQ_ALU_SRC_LITERAL, 4 * i);
3129 if (r)
3130 return r;
3131 }
3132
3133 for (i = 0; i <= lasti; i++) {
3134 if (!(write_mask & (1 << i)))
3135 continue;
3136
3137 if ((i == 0 && ((write_mask & 3) == 3)) ||
3138 (i == 2 && ((write_mask & 0xc) == 0xc))) {
3139 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3140 alu.op = LDS_OP3_LDS_WRITE_REL;
3141 alu.src[0].sel = temp_reg;
3142 alu.src[0].chan = i;
3143
3144 alu.src[1].sel = dst->Register.Index;
3145 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3146 alu.src[1].chan = i;
3147
3148 alu.src[2].sel = dst->Register.Index;
3149 alu.src[2].sel += ctx->file_offset[dst->Register.File];
3150 alu.src[2].chan = i + 1;
3151 alu.lds_idx = 1;
3152 alu.dst.chan = 0;
3153 alu.last = 1;
3154 alu.is_lds_idx_op = true;
3155 r = r600_bytecode_add_alu(ctx->bc, &alu);
3156 if (r)
3157 return r;
3158 i += 1;
3159 continue;
3160 }
3161 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3162 alu.op = LDS_OP2_LDS_WRITE;
3163 alu.src[0].sel = temp_reg;
3164 alu.src[0].chan = i;
3165
3166 alu.src[1].sel = dst->Register.Index;
3167 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3168 alu.src[1].chan = i;
3169
3170 alu.src[2].sel = V_SQ_ALU_SRC_0;
3171 alu.dst.chan = 0;
3172 alu.last = 1;
3173 alu.is_lds_idx_op = true;
3174 r = r600_bytecode_add_alu(ctx->bc, &alu);
3175 if (r)
3176 return r;
3177 }
3178 return 0;
3179 }
3180
3181 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3182 int output_idx, int nc)
3183 {
3184 int param;
3185 unsigned temp_reg = r600_get_temp(ctx);
3186 unsigned name = ctx->shader->output[output_idx].name;
3187 int dreg = ctx->shader->output[output_idx].gpr;
3188 int r;
3189
3190 param = r600_get_lds_unique_index(name, 0);
3191 r = get_lds_offset0(ctx, 1, temp_reg, true);
3192 if (r)
3193 return r;
3194
3195 if (param) {
3196 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3197 temp_reg, 0,
3198 temp_reg, 0,
3199 V_SQ_ALU_SRC_LITERAL, param * 16);
3200 if (r)
3201 return r;
3202 }
3203
3204 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3205 return 0;
3206 }
3207
3208 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
3209 {
3210 int stride, outer_comps, inner_comps;
3211 int tessinner_idx = -1, tessouter_idx = -1;
3212 int i, r;
3213 unsigned j;
3214 int temp_reg = r600_get_temp(ctx);
3215 int treg[3] = {-1, -1, -1};
3216 struct r600_bytecode_alu alu;
3217 struct r600_bytecode_cf *cf_jump, *cf_pop;
3218
3219 /* only execute factor emission for invocation 0 */
3220 /* PRED_SETE_INT __, R0.x, 0 */
3221 memset(&alu, 0, sizeof(alu));
3222 alu.op = ALU_OP2_PRED_SETE_INT;
3223 alu.src[0].chan = 2;
3224 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3225 alu.execute_mask = 1;
3226 alu.update_pred = 1;
3227 alu.last = 1;
3228 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
3229
3230 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
3231 cf_jump = ctx->bc->cf_last;
3232
3233 treg[0] = r600_get_temp(ctx);
3234 switch (ctx->shader->tcs_prim_mode) {
3235 case PIPE_PRIM_LINES:
3236 stride = 8; /* 2 dwords, 1 vec2 store */
3237 outer_comps = 2;
3238 inner_comps = 0;
3239 break;
3240 case PIPE_PRIM_TRIANGLES:
3241 stride = 16; /* 4 dwords, 1 vec4 store */
3242 outer_comps = 3;
3243 inner_comps = 1;
3244 treg[1] = r600_get_temp(ctx);
3245 break;
3246 case PIPE_PRIM_QUADS:
3247 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
3248 outer_comps = 4;
3249 inner_comps = 2;
3250 treg[1] = r600_get_temp(ctx);
3251 treg[2] = r600_get_temp(ctx);
3252 break;
3253 default:
3254 assert(0);
3255 return -1;
3256 }
3257
3258 /* R0 is InvocationID, RelPatchID, PatchID, tf_base */
3259 /* TF_WRITE takes index in R.x, value in R.y */
3260 for (j = 0; j < ctx->shader->noutput; j++) {
3261 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
3262 tessinner_idx = j;
3263 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
3264 tessouter_idx = j;
3265 }
3266
3267 if (tessouter_idx == -1)
3268 return -1;
3269
3270 if (tessinner_idx == -1 && inner_comps)
3271 return -1;
3272
3273 if (tessouter_idx != -1) {
3274 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
3275 if (r)
3276 return r;
3277 }
3278
3279 if (tessinner_idx != -1) {
3280 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
3281 if (r)
3282 return r;
3283 }
3284
3285 /* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
3286 /* r.x = relpatchid(r0.y) * tf_stride */
3287
3288 /* multiply incoming r0.y * stride - t.x = r0.y * stride */
3289 /* add incoming r0.w to it: t.x = t.x + r0.w */
3290 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3291 temp_reg, 0,
3292 0, 1,
3293 V_SQ_ALU_SRC_LITERAL, stride,
3294 0, 3);
3295 if (r)
3296 return r;
3297
3298 for (i = 0; i < outer_comps + inner_comps; i++) {
3299 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
3300 int out_comp = i >= outer_comps ? i - outer_comps : i;
3301
3302 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
3303 if (out_comp == 1)
3304 out_comp = 0;
3305 else if (out_comp == 0)
3306 out_comp = 1;
3307 }
3308
3309 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3310 treg[i / 2], (2 * (i % 2)),
3311 temp_reg, 0,
3312 V_SQ_ALU_SRC_LITERAL, 4 * i);
3313 if (r)
3314 return r;
3315 r = single_alu_op2(ctx, ALU_OP1_MOV,
3316 treg[i / 2], 1 + (2 * (i%2)),
3317 ctx->shader->output[out_idx].gpr, out_comp,
3318 0, 0);
3319 if (r)
3320 return r;
3321 }
3322 for (i = 0; i < outer_comps + inner_comps; i++) {
3323 struct r600_bytecode_gds gds;
3324
3325 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
3326 gds.src_gpr = treg[i / 2];
3327 gds.src_sel_x = 2 * (i % 2);
3328 gds.src_sel_y = 1 + (2 * (i % 2));
3329 gds.src_sel_z = 4;
3330 gds.dst_sel_x = 7;
3331 gds.dst_sel_y = 7;
3332 gds.dst_sel_z = 7;
3333 gds.dst_sel_w = 7;
3334 gds.op = FETCH_OP_TF_WRITE;
3335 r = r600_bytecode_add_gds(ctx->bc, &gds);
3336 if (r)
3337 return r;
3338 }
3339
3340 // Patch up jump label
3341 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3342 cf_pop = ctx->bc->cf_last;
3343
3344 cf_jump->cf_addr = cf_pop->id + 2;
3345 cf_jump->pop_count = 1;
3346 cf_pop->cf_addr = cf_pop->id + 2;
3347 cf_pop->pop_count = 1;
3348
3349 return 0;
3350 }
3351
3352 /*
3353 * We have to work out the thread ID for load and atomic
3354 * operations, which store the returned value to an index
3355 * in an intermediate buffer.
3356 * The index is calculated by taking the thread id,
3357 * calculated from the MBCNT instructions.
3358 * Then the shader engine ID is multiplied by 256,
3359 * and the wave id is added.
3360 * Then the result is multipled by 64 and thread id is
3361 * added.
3362 */
3363 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3364 {
3365 struct r600_bytecode_alu alu;
3366 int r;
3367
3368 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3369 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3370 alu.dst.sel = ctx->temp_reg;
3371 alu.dst.chan = 0;
3372 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3373 alu.src[0].value = 0xffffffff;
3374 alu.dst.write = 1;
3375 r = r600_bytecode_add_alu(ctx->bc, &alu);
3376 if (r)
3377 return r;
3378
3379 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3380 alu.op = ALU_OP1_MBCNT_32HI_INT;
3381 alu.dst.sel = ctx->temp_reg;
3382 alu.dst.chan = 1;
3383 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3384 alu.src[0].value = 0xffffffff;
3385 alu.dst.write = 1;
3386 r = r600_bytecode_add_alu(ctx->bc, &alu);
3387 if (r)
3388 return r;
3389
3390 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3391 alu.op = ALU_OP3_MULADD_UINT24;
3392 alu.dst.sel = ctx->temp_reg;
3393 alu.dst.chan = 2;
3394 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3395 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3396 alu.src[1].value = 256;
3397 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3398 alu.dst.write = 1;
3399 alu.is_op3 = 1;
3400 alu.last = 1;
3401 r = r600_bytecode_add_alu(ctx->bc, &alu);
3402 if (r)
3403 return r;
3404
3405 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3406 ctx->thread_id_gpr, 1,
3407 ctx->temp_reg, 2,
3408 V_SQ_ALU_SRC_LITERAL, 0x40,
3409 ctx->temp_reg, 0);
3410 if (r)
3411 return r;
3412 return 0;
3413 }
3414
3415 static int r600_shader_from_tgsi(struct r600_context *rctx,
3416 struct r600_pipe_shader *pipeshader,
3417 union r600_shader_key key)
3418 {
3419 struct r600_screen *rscreen = rctx->screen;
3420 struct r600_shader *shader = &pipeshader->shader;
3421 struct tgsi_token *tokens = pipeshader->selector->tokens;
3422 struct pipe_stream_output_info so = pipeshader->selector->so;
3423 struct tgsi_full_immediate *immediate;
3424 struct r600_shader_ctx ctx;
3425 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3426 unsigned output_done, noutput;
3427 unsigned opcode;
3428 int j, k, r = 0;
3429 unsigned i;
3430 int next_param_base = 0, next_clip_base;
3431 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3432 bool indirect_gprs;
3433 bool ring_outputs = false;
3434 bool lds_outputs = false;
3435 bool lds_inputs = false;
3436 bool pos_emitted = false;
3437
3438 ctx.bc = &shader->bc;
3439 ctx.shader = shader;
3440
3441 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3442 rscreen->has_compressed_msaa_texturing);
3443 ctx.tokens = tokens;
3444 tgsi_scan_shader(tokens, &ctx.info);
3445 shader->indirect_files = ctx.info.indirect_files;
3446
3447 int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3448 ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3449 ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3450 tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3451
3452 shader->uses_helper_invocation = false;
3453 shader->uses_doubles = ctx.info.uses_doubles;
3454 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3455 shader->nsys_inputs = 0;
3456
3457 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3458 ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3459 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3460 tgsi_parse_init(&ctx.parse, tokens);
3461 ctx.type = ctx.info.processor;
3462 shader->processor_type = ctx.type;
3463 ctx.bc->type = shader->processor_type;
3464
3465 switch (ctx.type) {
3466 case PIPE_SHADER_VERTEX:
3467 shader->vs_as_gs_a = key.vs.as_gs_a;
3468 shader->vs_as_es = key.vs.as_es;
3469 shader->vs_as_ls = key.vs.as_ls;
3470 shader->atomic_base = key.vs.first_atomic_counter;
3471 if (shader->vs_as_es)
3472 ring_outputs = true;
3473 if (shader->vs_as_ls)
3474 lds_outputs = true;
3475 break;
3476 case PIPE_SHADER_GEOMETRY:
3477 ring_outputs = true;
3478 shader->atomic_base = key.gs.first_atomic_counter;
3479 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3480 break;
3481 case PIPE_SHADER_TESS_CTRL:
3482 shader->tcs_prim_mode = key.tcs.prim_mode;
3483 shader->atomic_base = key.tcs.first_atomic_counter;
3484 lds_outputs = true;
3485 lds_inputs = true;
3486 break;
3487 case PIPE_SHADER_TESS_EVAL:
3488 shader->tes_as_es = key.tes.as_es;
3489 shader->atomic_base = key.tes.first_atomic_counter;
3490 lds_inputs = true;
3491 if (shader->tes_as_es)
3492 ring_outputs = true;
3493 break;
3494 case PIPE_SHADER_FRAGMENT:
3495 shader->two_side = key.ps.color_two_side;
3496 shader->atomic_base = key.ps.first_atomic_counter;
3497 shader->rat_base = key.ps.nr_cbufs;
3498 shader->image_size_const_offset = key.ps.image_size_const_offset;
3499 break;
3500 case PIPE_SHADER_COMPUTE:
3501 shader->rat_base = 0;
3502 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3503 break;
3504 default:
3505 break;
3506 }
3507
3508 if (shader->vs_as_es || shader->tes_as_es) {
3509 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3510 } else {
3511 ctx.gs_for_vs = NULL;
3512 }
3513
3514 ctx.next_ring_offset = 0;
3515 ctx.gs_out_ring_offset = 0;
3516 ctx.gs_next_vertex = 0;
3517 ctx.gs_stream_output_info = &so;
3518
3519 ctx.thread_id_gpr = -1;
3520 ctx.face_gpr = -1;
3521 ctx.fixed_pt_position_gpr = -1;
3522 ctx.fragcoord_input = -1;
3523 ctx.colors_used = 0;
3524 ctx.clip_vertex_write = 0;
3525
3526 ctx.helper_invoc_reg = -1;
3527 ctx.cs_block_size_reg = -1;
3528 ctx.cs_grid_size_reg = -1;
3529 ctx.cs_block_size_loaded = false;
3530 ctx.cs_grid_size_loaded = false;
3531
3532 shader->nr_ps_color_exports = 0;
3533 shader->nr_ps_max_color_exports = 0;
3534
3535
3536 /* register allocations */
3537 /* Values [0,127] correspond to GPR[0..127].
3538 * Values [128,159] correspond to constant buffer bank 0
3539 * Values [160,191] correspond to constant buffer bank 1
3540 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3541 * Values [256,287] correspond to constant buffer bank 2 (EG)
3542 * Values [288,319] correspond to constant buffer bank 3 (EG)
3543 * Other special values are shown in the list below.
3544 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3545 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3546 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3547 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3548 * 248 SQ_ALU_SRC_0: special constant 0.0.
3549 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3550 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3551 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3552 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3553 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3554 * 254 SQ_ALU_SRC_PV: previous vector result.
3555 * 255 SQ_ALU_SRC_PS: previous scalar result.
3556 */
3557 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3558 ctx.file_offset[i] = 0;
3559 }
3560
3561 if (ctx.type == PIPE_SHADER_VERTEX) {
3562
3563 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3564 if (ctx.info.num_inputs)
3565 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3566 }
3567 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3568 if (ctx.bc->chip_class >= EVERGREEN)
3569 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3570 else
3571 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3572
3573 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3574 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3575 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3576 shader->uses_helper_invocation = true;
3577 }
3578 }
3579 }
3580 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3581 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
3582 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3583 }
3584 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3585 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3586 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3587 bool add_tesscoord = false, add_tess_inout = false;
3588 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3589 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3590 /* if we have tesscoord save one reg */
3591 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3592 add_tesscoord = true;
3593 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3594 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3595 add_tess_inout = true;
3596 }
3597 if (add_tesscoord || add_tess_inout)
3598 ctx.file_offset[TGSI_FILE_INPUT]++;
3599 if (add_tess_inout)
3600 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3601 }
3602 if (ctx.type == PIPE_SHADER_COMPUTE) {
3603 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3604 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3605 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3606 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3607 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3608 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3609 }
3610 }
3611
3612 ctx.file_offset[TGSI_FILE_OUTPUT] =
3613 ctx.file_offset[TGSI_FILE_INPUT] +
3614 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3615 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3616 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3617
3618 /* Outside the GPR range. This will be translated to one of the
3619 * kcache banks later. */
3620 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3621 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3622
3623 pipeshader->scratch_space_needed = 0;
3624 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3625 ctx.info.file_max[TGSI_FILE_TEMPORARY];
3626 if (regno > 124) {
3627 choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3628 shader->indirect_files = ctx.info.indirect_files;
3629 }
3630 shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3631
3632 ctx.bc->ar_reg = ++regno;
3633 ctx.bc->index_reg[0] = ++regno;
3634 ctx.bc->index_reg[1] = ++regno;
3635
3636 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3637 ctx.tess_input_info = ++regno;
3638 ctx.tess_output_info = ++regno;
3639 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3640 ctx.tess_input_info = ++regno;
3641 ctx.tess_output_info = ++regno;
3642 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3643 ctx.gs_export_gpr_tregs[0] = ++regno;
3644 ctx.gs_export_gpr_tregs[1] = ++regno;
3645 ctx.gs_export_gpr_tregs[2] = ++regno;
3646 ctx.gs_export_gpr_tregs[3] = ++regno;
3647 if (ctx.shader->gs_tri_strip_adj_fix) {
3648 ctx.gs_rotated_input[0] = ++regno;
3649 ctx.gs_rotated_input[1] = ++regno;
3650 } else {
3651 ctx.gs_rotated_input[0] = 0;
3652 ctx.gs_rotated_input[1] = 1;
3653 }
3654 }
3655
3656 if (shader->uses_images) {
3657 ctx.thread_id_gpr = ++regno;
3658 }
3659 ctx.temp_reg = ++regno;
3660
3661 shader->max_arrays = 0;
3662 shader->num_arrays = 0;
3663 if (indirect_gprs) {
3664
3665 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3666 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3667 ctx.file_offset[TGSI_FILE_OUTPUT] -
3668 ctx.file_offset[TGSI_FILE_INPUT],
3669 0x0F);
3670 }
3671 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3672 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3673 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3674 ctx.file_offset[TGSI_FILE_OUTPUT],
3675 0x0F);
3676 }
3677 }
3678
3679 ctx.nliterals = 0;
3680 ctx.literals = NULL;
3681 ctx.max_driver_temp_used = 0;
3682
3683 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3684 ctx.info.colors_written == 1;
3685 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3686 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3687
3688 if (ctx.type == PIPE_SHADER_VERTEX ||
3689 ctx.type == PIPE_SHADER_GEOMETRY ||
3690 ctx.type == PIPE_SHADER_TESS_EVAL) {
3691 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3692 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3693 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3694 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3695 }
3696
3697 if (shader->vs_as_gs_a)
3698 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3699
3700 if (ctx.thread_id_gpr != -1) {
3701 r = load_thread_id_gpr(&ctx);
3702 if (r)
3703 return r;
3704 }
3705
3706 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3707 r600_fetch_tess_io_info(&ctx);
3708
3709 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3710 tgsi_parse_token(&ctx.parse);
3711 switch (ctx.parse.FullToken.Token.Type) {
3712 case TGSI_TOKEN_TYPE_IMMEDIATE:
3713 immediate = &ctx.parse.FullToken.FullImmediate;
3714 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3715 if(ctx.literals == NULL) {
3716 r = -ENOMEM;
3717 goto out_err;
3718 }
3719 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3720 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3721 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3722 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3723 ctx.nliterals++;
3724 break;
3725 case TGSI_TOKEN_TYPE_DECLARATION:
3726 r = tgsi_declaration(&ctx);
3727 if (r)
3728 goto out_err;
3729 break;
3730 case TGSI_TOKEN_TYPE_INSTRUCTION:
3731 case TGSI_TOKEN_TYPE_PROPERTY:
3732 break;
3733 default:
3734 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3735 r = -EINVAL;
3736 goto out_err;
3737 }
3738 }
3739
3740 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3741 shader->ring_item_sizes[1] = 0;
3742 shader->ring_item_sizes[2] = 0;
3743 shader->ring_item_sizes[3] = 0;
3744
3745 /* Process two side if needed */
3746 if (shader->two_side && ctx.colors_used) {
3747 int i, count = ctx.shader->ninput;
3748 unsigned next_lds_loc = ctx.shader->nlds;
3749
3750 /* additional inputs will be allocated right after the existing inputs,
3751 * we won't need them after the color selection, so we don't need to
3752 * reserve these gprs for the rest of the shader code and to adjust
3753 * output offsets etc. */
3754 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3755 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3756
3757 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3758 if (ctx.face_gpr == -1) {
3759 i = ctx.shader->ninput++;
3760 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3761 ctx.shader->input[i].spi_sid = 0;
3762 ctx.shader->input[i].gpr = gpr++;
3763 ctx.face_gpr = ctx.shader->input[i].gpr;
3764 }
3765
3766 for (i = 0; i < count; i++) {
3767 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3768 int ni = ctx.shader->ninput++;
3769 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3770 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3771 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3772 ctx.shader->input[ni].gpr = gpr++;
3773 // TGSI to LLVM needs to know the lds position of inputs.
3774 // Non LLVM path computes it later (in process_twoside_color)
3775 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3776 ctx.shader->input[i].back_color_input = ni;
3777 if (ctx.bc->chip_class >= EVERGREEN) {
3778 if ((r = evergreen_interp_input(&ctx, ni)))
3779 return r;
3780 }
3781 }
3782 }
3783 }
3784
3785 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3786 shader->nr_ps_max_color_exports = 8;
3787
3788 if (ctx.shader->uses_helper_invocation) {
3789 if (ctx.bc->chip_class == CAYMAN)
3790 r = cm_load_helper_invocation(&ctx);
3791 else
3792 r = eg_load_helper_invocation(&ctx);
3793 if (r)
3794 return r;
3795 }
3796
3797 /*
3798 * XXX this relies on fixed_pt_position_gpr only being present when
3799 * this shader should be executed per sample. Should be the case for now...
3800 */
3801 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3802 /*
3803 * Fix up sample mask. The hw always gives us coverage mask for
3804 * the pixel. However, for per-sample shading, we need the
3805 * coverage for the shader invocation only.
3806 * Also, with disabled msaa, only the first bit should be set
3807 * (luckily the same fixup works for both problems).
3808 * For now, we can only do it if we know this shader is always
3809 * executed per sample (due to usage of bits in the shader
3810 * forcing per-sample execution).
3811 * If the fb is not multisampled, we'd do unnecessary work but
3812 * it should still be correct.
3813 * It will however do nothing for sample shading according
3814 * to MinSampleShading.
3815 */
3816 struct r600_bytecode_alu alu;
3817 int tmp = r600_get_temp(&ctx);
3818 assert(ctx.face_gpr != -1);
3819 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3820
3821 alu.op = ALU_OP2_LSHL_INT;
3822 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3823 alu.src[0].value = 0x1;
3824 alu.src[1].sel = ctx.fixed_pt_position_gpr;
3825 alu.src[1].chan = 3;
3826 alu.dst.sel = tmp;
3827 alu.dst.chan = 0;
3828 alu.dst.write = 1;
3829 alu.last = 1;
3830 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3831 return r;
3832
3833 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3834 alu.op = ALU_OP2_AND_INT;
3835 alu.src[0].sel = tmp;
3836 alu.src[1].sel = ctx.face_gpr;
3837 alu.src[1].chan = 2;
3838 alu.dst.sel = ctx.face_gpr;
3839 alu.dst.chan = 2;
3840 alu.dst.write = 1;
3841 alu.last = 1;
3842 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3843 return r;
3844 }
3845
3846 if (ctx.fragcoord_input >= 0) {
3847 if (ctx.bc->chip_class == CAYMAN) {
3848 for (j = 0 ; j < 4; j++) {
3849 struct r600_bytecode_alu alu;
3850 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3851 alu.op = ALU_OP1_RECIP_IEEE;
3852 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3853 alu.src[0].chan = 3;
3854
3855 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3856 alu.dst.chan = j;
3857 alu.dst.write = (j == 3);
3858 alu.last = (j == 3);
3859 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3860 return r;
3861 }
3862 } else {
3863 struct r600_bytecode_alu alu;
3864 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3865 alu.op = ALU_OP1_RECIP_IEEE;
3866 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3867 alu.src[0].chan = 3;
3868
3869 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3870 alu.dst.chan = 3;
3871 alu.dst.write = 1;
3872 alu.last = 1;
3873 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3874 return r;
3875 }
3876 }
3877
3878 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3879 struct r600_bytecode_alu alu;
3880 int r;
3881
3882 /* GS thread with no output workaround - emit a cut at start of GS */
3883 if (ctx.bc->chip_class == R600)
3884 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3885
3886 for (j = 0; j < 4; j++) {
3887 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3888 alu.op = ALU_OP1_MOV;
3889 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3890 alu.src[0].value = 0;
3891 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3892 alu.dst.write = 1;
3893 alu.last = 1;
3894 r = r600_bytecode_add_alu(ctx.bc, &alu);
3895 if (r)
3896 return r;
3897 }
3898
3899 if (ctx.shader->gs_tri_strip_adj_fix) {
3900 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3901 ctx.gs_rotated_input[0], 2,
3902 0, 2,
3903 V_SQ_ALU_SRC_LITERAL, 1);
3904 if (r)
3905 return r;
3906
3907 for (i = 0; i < 6; i++) {
3908 int rotated = (i + 4) % 6;
3909 int offset_reg = i / 3;
3910 int offset_chan = i % 3;
3911 int rotated_offset_reg = rotated / 3;
3912 int rotated_offset_chan = rotated % 3;
3913
3914 if (offset_reg == 0 && offset_chan == 2)
3915 offset_chan = 3;
3916 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3917 rotated_offset_chan = 3;
3918
3919 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3920 ctx.gs_rotated_input[offset_reg], offset_chan,
3921 ctx.gs_rotated_input[0], 2,
3922 offset_reg, offset_chan,
3923 rotated_offset_reg, rotated_offset_chan);
3924 if (r)
3925 return r;
3926 }
3927 }
3928 }
3929
3930 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3931 r600_fetch_tess_io_info(&ctx);
3932
3933 if (shader->two_side && ctx.colors_used) {
3934 if ((r = process_twoside_color_inputs(&ctx)))
3935 return r;
3936 }
3937
3938 tgsi_parse_init(&ctx.parse, tokens);
3939 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3940 tgsi_parse_token(&ctx.parse);
3941 switch (ctx.parse.FullToken.Token.Type) {
3942 case TGSI_TOKEN_TYPE_INSTRUCTION:
3943 r = tgsi_is_supported(&ctx);
3944 if (r)
3945 goto out_err;
3946 ctx.max_driver_temp_used = 0;
3947 /* reserve first tmp for everyone */
3948 r600_get_temp(&ctx);
3949
3950 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3951 if ((r = tgsi_split_constant(&ctx)))
3952 goto out_err;
3953 if ((r = tgsi_split_literal_constant(&ctx)))
3954 goto out_err;
3955 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3956 if ((r = tgsi_split_gs_inputs(&ctx)))
3957 goto out_err;
3958 } else if (lds_inputs) {
3959 if ((r = tgsi_split_lds_inputs(&ctx)))
3960 goto out_err;
3961 }
3962 if (ctx.bc->chip_class == CAYMAN)
3963 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3964 else if (ctx.bc->chip_class >= EVERGREEN)
3965 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3966 else
3967 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3968
3969 ctx.bc->precise |= ctx.parse.FullToken.FullInstruction.Instruction.Precise;
3970
3971 r = ctx.inst_info->process(&ctx);
3972 if (r)
3973 goto out_err;
3974
3975 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3976 r = r600_store_tcs_output(&ctx);
3977 if (r)
3978 goto out_err;
3979 }
3980 break;
3981 default:
3982 break;
3983 }
3984 }
3985
3986 /* Reset the temporary register counter. */
3987 ctx.max_driver_temp_used = 0;
3988
3989 noutput = shader->noutput;
3990
3991 if (!ring_outputs && ctx.clip_vertex_write) {
3992 unsigned clipdist_temp[2];
3993
3994 clipdist_temp[0] = r600_get_temp(&ctx);
3995 clipdist_temp[1] = r600_get_temp(&ctx);
3996
3997 /* need to convert a clipvertex write into clipdistance writes and not export
3998 the clip vertex anymore */
3999
4000 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
4001 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4002 shader->output[noutput].gpr = clipdist_temp[0];
4003 noutput++;
4004 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
4005 shader->output[noutput].gpr = clipdist_temp[1];
4006 noutput++;
4007
4008 /* reset spi_sid for clipvertex output to avoid confusing spi */
4009 shader->output[ctx.cv_output].spi_sid = 0;
4010
4011 shader->clip_dist_write = 0xFF;
4012 shader->cc_dist_mask = 0xFF;
4013
4014 for (i = 0; i < 8; i++) {
4015 int oreg = i >> 2;
4016 int ochan = i & 3;
4017
4018 for (j = 0; j < 4; j++) {
4019 struct r600_bytecode_alu alu;
4020 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4021 alu.op = ALU_OP2_DOT4;
4022 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
4023 alu.src[0].chan = j;
4024
4025 alu.src[1].sel = 512 + i;
4026 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4027 alu.src[1].chan = j;
4028
4029 alu.dst.sel = clipdist_temp[oreg];
4030 alu.dst.chan = j;
4031 alu.dst.write = (j == ochan);
4032 if (j == 3)
4033 alu.last = 1;
4034 r = r600_bytecode_add_alu(ctx.bc, &alu);
4035 if (r)
4036 return r;
4037 }
4038 }
4039 }
4040
4041 /* Add stream outputs. */
4042 if (so.num_outputs) {
4043 bool emit = false;
4044 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
4045 emit = true;
4046 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
4047 emit = true;
4048 if (emit)
4049 emit_streamout(&ctx, &so, -1, NULL);
4050 }
4051 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
4052 convert_edgeflag_to_int(&ctx);
4053
4054 if (ctx.type == PIPE_SHADER_TESS_CTRL)
4055 r600_emit_tess_factor(&ctx);
4056
4057 if (lds_outputs) {
4058 if (ctx.type == PIPE_SHADER_VERTEX) {
4059 if (ctx.shader->noutput)
4060 emit_lds_vs_writes(&ctx);
4061 }
4062 } else if (ring_outputs) {
4063 if (shader->vs_as_es || shader->tes_as_es) {
4064 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
4065 ctx.gs_export_gpr_tregs[1] = -1;
4066 ctx.gs_export_gpr_tregs[2] = -1;
4067 ctx.gs_export_gpr_tregs[3] = -1;
4068
4069 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
4070 }
4071 } else {
4072 /* Export output */
4073 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
4074
4075 for (i = 0, j = 0; i < noutput; i++, j++) {
4076 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4077 output[j].gpr = shader->output[i].gpr;
4078 output[j].elem_size = 3;
4079 output[j].swizzle_x = 0;
4080 output[j].swizzle_y = 1;
4081 output[j].swizzle_z = 2;
4082 output[j].swizzle_w = 3;
4083 output[j].burst_count = 1;
4084 output[j].type = 0xffffffff;
4085 output[j].op = CF_OP_EXPORT;
4086 switch (ctx.type) {
4087 case PIPE_SHADER_VERTEX:
4088 case PIPE_SHADER_TESS_EVAL:
4089 switch (shader->output[i].name) {
4090 case TGSI_SEMANTIC_POSITION:
4091 output[j].array_base = 60;
4092 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4093 pos_emitted = true;
4094 break;
4095
4096 case TGSI_SEMANTIC_PSIZE:
4097 output[j].array_base = 61;
4098 output[j].swizzle_y = 7;
4099 output[j].swizzle_z = 7;
4100 output[j].swizzle_w = 7;
4101 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4102 pos_emitted = true;
4103 break;
4104 case TGSI_SEMANTIC_EDGEFLAG:
4105 output[j].array_base = 61;
4106 output[j].swizzle_x = 7;
4107 output[j].swizzle_y = 0;
4108 output[j].swizzle_z = 7;
4109 output[j].swizzle_w = 7;
4110 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4111 pos_emitted = true;
4112 break;
4113 case TGSI_SEMANTIC_LAYER:
4114 /* spi_sid is 0 for outputs that are
4115 * not consumed by PS */
4116 if (shader->output[i].spi_sid) {
4117 output[j].array_base = next_param_base++;
4118 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4119 j++;
4120 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4121 }
4122 output[j].array_base = 61;
4123 output[j].swizzle_x = 7;
4124 output[j].swizzle_y = 7;
4125 output[j].swizzle_z = 0;
4126 output[j].swizzle_w = 7;
4127 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4128 pos_emitted = true;
4129 break;
4130 case TGSI_SEMANTIC_VIEWPORT_INDEX:
4131 /* spi_sid is 0 for outputs that are
4132 * not consumed by PS */
4133 if (shader->output[i].spi_sid) {
4134 output[j].array_base = next_param_base++;
4135 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4136 j++;
4137 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4138 }
4139 output[j].array_base = 61;
4140 output[j].swizzle_x = 7;
4141 output[j].swizzle_y = 7;
4142 output[j].swizzle_z = 7;
4143 output[j].swizzle_w = 0;
4144 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4145 pos_emitted = true;
4146 break;
4147 case TGSI_SEMANTIC_CLIPVERTEX:
4148 j--;
4149 break;
4150 case TGSI_SEMANTIC_CLIPDIST:
4151 output[j].array_base = next_clip_base++;
4152 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4153 pos_emitted = true;
4154 /* spi_sid is 0 for clipdistance outputs that were generated
4155 * for clipvertex - we don't need to pass them to PS */
4156 if (shader->output[i].spi_sid) {
4157 j++;
4158 /* duplicate it as PARAM to pass to the pixel shader */
4159 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4160 output[j].array_base = next_param_base++;
4161 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4162 }
4163 break;
4164 case TGSI_SEMANTIC_FOG:
4165 output[j].swizzle_y = 4; /* 0 */
4166 output[j].swizzle_z = 4; /* 0 */
4167 output[j].swizzle_w = 5; /* 1 */
4168 break;
4169 case TGSI_SEMANTIC_PRIMID:
4170 output[j].swizzle_x = 2;
4171 output[j].swizzle_y = 4; /* 0 */
4172 output[j].swizzle_z = 4; /* 0 */
4173 output[j].swizzle_w = 4; /* 0 */
4174 break;
4175 }
4176
4177 break;
4178 case PIPE_SHADER_FRAGMENT:
4179 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4180 /* never export more colors than the number of CBs */
4181 if (shader->output[i].sid >= max_color_exports) {
4182 /* skip export */
4183 j--;
4184 continue;
4185 }
4186 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4187 output[j].array_base = shader->output[i].sid;
4188 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4189 shader->nr_ps_color_exports++;
4190 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4191
4192 /* If the i-th target format is set, all previous target formats must
4193 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
4194 */
4195 if (shader->output[i].sid > 0)
4196 for (unsigned x = 0; x < shader->output[i].sid; x++)
4197 shader->ps_color_export_mask |= (1 << (x*4));
4198
4199 if (shader->output[i].sid > shader->ps_export_highest)
4200 shader->ps_export_highest = shader->output[i].sid;
4201 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
4202 for (k = 1; k < max_color_exports; k++) {
4203 j++;
4204 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4205 output[j].gpr = shader->output[i].gpr;
4206 output[j].elem_size = 3;
4207 output[j].swizzle_x = 0;
4208 output[j].swizzle_y = 1;
4209 output[j].swizzle_z = 2;
4210 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4211 output[j].burst_count = 1;
4212 output[j].array_base = k;
4213 output[j].op = CF_OP_EXPORT;
4214 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4215 shader->nr_ps_color_exports++;
4216 if (k > shader->ps_export_highest)
4217 shader->ps_export_highest = k;
4218 shader->ps_color_export_mask |= (0xf << (j * 4));
4219 }
4220 }
4221 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4222 output[j].array_base = 61;
4223 output[j].swizzle_x = 2;
4224 output[j].swizzle_y = 7;
4225 output[j].swizzle_z = output[j].swizzle_w = 7;
4226 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4227 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4228 output[j].array_base = 61;
4229 output[j].swizzle_x = 7;
4230 output[j].swizzle_y = 1;
4231 output[j].swizzle_z = output[j].swizzle_w = 7;
4232 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4233 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4234 output[j].array_base = 61;
4235 output[j].swizzle_x = 7;
4236 output[j].swizzle_y = 7;
4237 output[j].swizzle_z = 0;
4238 output[j].swizzle_w = 7;
4239 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4240 } else {
4241 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4242 r = -EINVAL;
4243 goto out_err;
4244 }
4245 break;
4246 case PIPE_SHADER_TESS_CTRL:
4247 break;
4248 default:
4249 R600_ERR("unsupported processor type %d\n", ctx.type);
4250 r = -EINVAL;
4251 goto out_err;
4252 }
4253
4254 if (output[j].type == 0xffffffff) {
4255 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4256 output[j].array_base = next_param_base++;
4257 }
4258 }
4259
4260 /* add fake position export */
4261 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4262 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4263 output[j].gpr = 0;
4264 output[j].elem_size = 3;
4265 output[j].swizzle_x = 7;
4266 output[j].swizzle_y = 7;
4267 output[j].swizzle_z = 7;
4268 output[j].swizzle_w = 7;
4269 output[j].burst_count = 1;
4270 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4271 output[j].array_base = 60;
4272 output[j].op = CF_OP_EXPORT;
4273 j++;
4274 }
4275
4276 /* add fake param output for vertex shader if no param is exported */
4277 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4278 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4279 output[j].gpr = 0;
4280 output[j].elem_size = 3;
4281 output[j].swizzle_x = 7;
4282 output[j].swizzle_y = 7;
4283 output[j].swizzle_z = 7;
4284 output[j].swizzle_w = 7;
4285 output[j].burst_count = 1;
4286 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4287 output[j].array_base = 0;
4288 output[j].op = CF_OP_EXPORT;
4289 j++;
4290 }
4291
4292 /* add fake pixel export */
4293 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4294 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4295 output[j].gpr = 0;
4296 output[j].elem_size = 3;
4297 output[j].swizzle_x = 7;
4298 output[j].swizzle_y = 7;
4299 output[j].swizzle_z = 7;
4300 output[j].swizzle_w = 7;
4301 output[j].burst_count = 1;
4302 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4303 output[j].array_base = 0;
4304 output[j].op = CF_OP_EXPORT;
4305 j++;
4306 shader->nr_ps_color_exports++;
4307 shader->ps_color_export_mask = 0xf;
4308 }
4309
4310 noutput = j;
4311
4312 /* set export done on last export of each type */
4313 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4314 if (!(output_done & (1 << output[k].type))) {
4315 output_done |= (1 << output[k].type);
4316 output[k].op = CF_OP_EXPORT_DONE;
4317 }
4318 }
4319 /* add output to bytecode */
4320 for (i = 0; i < noutput; i++) {
4321 r = r600_bytecode_add_output(ctx.bc, &output[i]);
4322 if (r)
4323 goto out_err;
4324 }
4325 }
4326
4327 /* add program end */
4328 if (ctx.bc->chip_class == CAYMAN)
4329 cm_bytecode_add_cf_end(ctx.bc);
4330 else {
4331 const struct cf_op_info *last = NULL;
4332
4333 if (ctx.bc->cf_last)
4334 last = r600_isa_cf(ctx.bc->cf_last->op);
4335
4336 /* alu clause instructions don't have EOP bit, so add NOP */
4337 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4338 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4339
4340 ctx.bc->cf_last->end_of_program = 1;
4341 }
4342
4343 /* check GPR limit - we have 124 = 128 - 4
4344 * (4 are reserved as alu clause temporary registers) */
4345 if (ctx.bc->ngpr > 124) {
4346 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4347 r = -ENOMEM;
4348 goto out_err;
4349 }
4350
4351 if (ctx.type == PIPE_SHADER_GEOMETRY) {
4352 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4353 return r;
4354 }
4355
4356 free(ctx.spilled_arrays);
4357 free(ctx.array_infos);
4358 free(ctx.literals);
4359 tgsi_parse_free(&ctx.parse);
4360 return 0;
4361 out_err:
4362 free(ctx.spilled_arrays);
4363 free(ctx.array_infos);
4364 free(ctx.literals);
4365 tgsi_parse_free(&ctx.parse);
4366 return r;
4367 }
4368
4369 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4370 {
4371 const unsigned tgsi_opcode =
4372 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4373 R600_ERR("%s tgsi opcode unsupported\n",
4374 tgsi_get_opcode_name(tgsi_opcode));
4375 return -EINVAL;
4376 }
4377
4378 static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
4379 {
4380 return 0;
4381 }
4382
4383 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4384 const struct r600_shader_src *shader_src,
4385 unsigned chan)
4386 {
4387 bc_src->sel = shader_src->sel;
4388 bc_src->chan = shader_src->swizzle[chan];
4389 bc_src->neg = shader_src->neg;
4390 bc_src->abs = shader_src->abs;
4391 bc_src->rel = shader_src->rel;
4392 bc_src->value = shader_src->value[bc_src->chan];
4393 bc_src->kc_bank = shader_src->kc_bank;
4394 bc_src->kc_rel = shader_src->kc_rel;
4395 }
4396
4397 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
4398 {
4399 bc_src->abs = 1;
4400 bc_src->neg = 0;
4401 }
4402
4403 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
4404 {
4405 bc_src->neg = !bc_src->neg;
4406 }
4407
4408 static void tgsi_dst(struct r600_shader_ctx *ctx,
4409 const struct tgsi_full_dst_register *tgsi_dst,
4410 unsigned swizzle,
4411 struct r600_bytecode_alu_dst *r600_dst)
4412 {
4413 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4414
4415 if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
4416 bool spilled;
4417 unsigned idx;
4418
4419 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);
4420
4421 if (spilled) {
4422 struct r600_bytecode_output cf;
4423 int reg = 0;
4424 int r;
4425 bool add_pending_output = true;
4426
4427 memset(&cf, 0, sizeof(struct r600_bytecode_output));
4428 get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
4429 &cf.array_base, &cf.array_size);
4430
4431 /* If no component has spilled, reserve a register and add the spill code
4432 * ctx->bc->n_pending_outputs is cleared after each instruction group */
4433 if (ctx->bc->n_pending_outputs == 0) {
4434 reg = r600_get_temp(ctx);
4435 } else {
4436 /* If we are already spilling and the output address is the same like
4437 * before then just reuse the same slot */
4438 struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
4439 if ((cf.array_base + idx == tmpl->array_base) ||
4440 (cf.array_base == tmpl->array_base &&
4441 tmpl->index_gpr == ctx->bc->ar_reg &&
4442 tgsi_dst->Register.Indirect)) {
4443 reg = ctx->bc->pending_outputs[0].gpr;
4444 add_pending_output = false;
4445 } else {
4446 reg = r600_get_temp(ctx);
4447 }
4448 }
4449
4450 r600_dst->sel = reg;
4451 r600_dst->chan = swizzle;
4452 r600_dst->write = 1;
4453 if (inst->Instruction.Saturate) {
4454 r600_dst->clamp = 1;
4455 }
4456
4457 /* Add new outputs as pending */
4458 if (add_pending_output) {
4459 cf.op = CF_OP_MEM_SCRATCH;
4460 cf.elem_size = 3;
4461 cf.gpr = reg;
4462 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
4463 cf.mark = 1;
4464 cf.comp_mask = inst->Dst[0].Register.WriteMask;
4465 cf.swizzle_x = 0;
4466 cf.swizzle_y = 1;
4467 cf.swizzle_z = 2;
4468 cf.swizzle_w = 3;
4469 cf.burst_count = 1;
4470
4471 if (tgsi_dst->Register.Indirect) {
4472 if (ctx->bc->chip_class < R700)
4473 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
4474 else
4475 cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
4476 cf.index_gpr = ctx->bc->ar_reg;
4477 }
4478 else {
4479 cf.array_base += idx;
4480 cf.array_size = 0;
4481 }
4482
4483 r = r600_bytecode_add_pending_output(ctx->bc, &cf);
4484 if (r)
4485 return;
4486
4487 if (ctx->bc->chip_class >= R700)
4488 r600_bytecode_need_wait_ack(ctx->bc, true);
4489 }
4490 return;
4491 }
4492 else {
4493 r600_dst->sel = idx;
4494 }
4495 }
4496 else {
4497 r600_dst->sel = tgsi_dst->Register.Index;
4498 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
4499 }
4500 r600_dst->chan = swizzle;
4501 r600_dst->write = 1;
4502 if (inst->Instruction.Saturate) {
4503 r600_dst->clamp = 1;
4504 }
4505 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
4506 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
4507 return;
4508 }
4509 }
4510 if (tgsi_dst->Register.Indirect)
4511 r600_dst->rel = V_SQ_REL_RELATIVE;
4512
4513 }
4514
4515 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
4516 {
4517 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4518 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4519 struct r600_bytecode_alu alu;
4520 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4521 int use_tmp = 0;
4522 int swizzle_x = inst->Src[0].Register.SwizzleX;
4523
4524 if (singledest) {
4525 switch (write_mask) {
4526 case 0x1:
4527 if (swizzle_x == 2) {
4528 write_mask = 0xc;
4529 use_tmp = 3;
4530 } else
4531 write_mask = 0x3;
4532 break;
4533 case 0x2:
4534 if (swizzle_x == 2) {
4535 write_mask = 0xc;
4536 use_tmp = 3;
4537 } else {
4538 write_mask = 0x3;
4539 use_tmp = 1;
4540 }
4541 break;
4542 case 0x4:
4543 if (swizzle_x == 0) {
4544 write_mask = 0x3;
4545 use_tmp = 1;
4546 } else
4547 write_mask = 0xc;
4548 break;
4549 case 0x8:
4550 if (swizzle_x == 0) {
4551 write_mask = 0x3;
4552 use_tmp = 1;
4553 } else {
4554 write_mask = 0xc;
4555 use_tmp = 3;
4556 }
4557 break;
4558 }
4559 }
4560
4561 lasti = tgsi_last_instruction(write_mask);
4562 for (i = 0; i <= lasti; i++) {
4563
4564 if (!(write_mask & (1 << i)))
4565 continue;
4566
4567 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4568
4569 if (singledest) {
4570 if (use_tmp || dest_temp) {
4571 alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
4572 alu.dst.chan = i;
4573 alu.dst.write = 1;
4574 } else {
4575 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4576 }
4577 if (i == 1 || i == 3)
4578 alu.dst.write = 0;
4579 } else
4580 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4581
4582 alu.op = op_override ? op_override : ctx->inst_info->op;
4583 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
4584 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4585 } else if (!swap) {
4586 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4587 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4588 }
4589 } else {
4590 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
4591 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
4592 }
4593
4594 /* handle some special cases */
4595 if (i == 1 || i == 3) {
4596 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
4597 case TGSI_OPCODE_DABS:
4598 r600_bytecode_src_set_abs(&alu.src[0]);
4599 break;
4600 default:
4601 break;
4602 }
4603 }
4604 if (i == lasti) {
4605 alu.last = 1;
4606 }
4607 r = r600_bytecode_add_alu(ctx->bc, &alu);
4608 if (r)
4609 return r;
4610 }
4611
4612 if (use_tmp) {
4613 write_mask = inst->Dst[0].Register.WriteMask;
4614
4615 lasti = tgsi_last_instruction(write_mask);
4616 /* move result from temp to dst */
4617 for (i = 0; i <= lasti; i++) {
4618 if (!(write_mask & (1 << i)))
4619 continue;
4620
4621 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4622 alu.op = ALU_OP1_MOV;
4623
4624 if (dest_temp) {
4625 alu.dst.sel = dest_temp;
4626 alu.dst.chan = i;
4627 alu.dst.write = 1;
4628 } else
4629 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4630 alu.src[0].sel = ctx->temp_reg;
4631 alu.src[0].chan = use_tmp - 1;
4632 alu.last = (i == lasti);
4633
4634 r = r600_bytecode_add_alu(ctx->bc, &alu);
4635 if (r)
4636 return r;
4637 }
4638 }
4639 return 0;
4640 }
4641
4642 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4643 {
4644 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4645 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4646 /* confirm writemasking */
4647 if ((write_mask & 0x3) != 0x3 &&
4648 (write_mask & 0xc) != 0xc) {
4649 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4650 return -1;
4651 }
4652 return tgsi_op2_64_params(ctx, false, false, 0, 0);
4653 }
4654
/* 64-bit operation with a single 32-bit result: tgsi_op2_64_params() with
 * singledest set, sources in their natural order, no destination GPR
 * override and no opcode override. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}
4659
/* Same as the single-destination 64-bit operation, but with the two source
 * operands swapped (tgsi_op2_64_params() with singledest and swap set). */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}
4664
4665 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4666 {
4667 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4668 struct r600_bytecode_alu alu;
4669 int i, j, r;
4670 int lasti = 3;
4671 int tmp = r600_get_temp(ctx);
4672
4673 for (i = 0; i < lasti + 1; i++) {
4674
4675 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4676 alu.op = ctx->inst_info->op;
4677 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4678 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4679 }
4680
4681 if (inst->Dst[0].Register.WriteMask & (1 << i))
4682 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4683 else
4684 alu.dst.sel = tmp;
4685
4686 alu.dst.chan = i;
4687 alu.is_op3 = 1;
4688 if (i == lasti) {
4689 alu.last = 1;
4690 }
4691 r = r600_bytecode_add_alu(ctx->bc, &alu);
4692 if (r)
4693 return r;
4694 }
4695 return 0;
4696 }
4697
4698 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
4699 {
4700 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4701 struct r600_bytecode_alu alu;
4702 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4703 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4704 /* use temp register if trans_only and more than one dst component */
4705 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
4706 unsigned op = ctx->inst_info->op;
4707
4708 if (op == ALU_OP2_MUL_IEEE &&
4709 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
4710 op = ALU_OP2_MUL;
4711
4712 for (i = 0; i <= lasti; i++) {
4713 if (!(write_mask & (1 << i)))
4714 continue;
4715
4716 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4717 if (use_tmp) {
4718 alu.dst.sel = ctx->temp_reg;
4719 alu.dst.chan = i;
4720 alu.dst.write = 1;
4721 } else
4722 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4723
4724 alu.op = op;
4725 if (!swap) {
4726 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4727 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4728 }
4729 } else {
4730 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4731 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4732 }
4733 if (i == lasti || trans_only) {
4734 alu.last = 1;
4735 }
4736 r = r600_bytecode_add_alu(ctx->bc, &alu);
4737 if (r)
4738 return r;
4739 }
4740
4741 if (use_tmp) {
4742 /* move result from temp to dst */
4743 for (i = 0; i <= lasti; i++) {
4744 if (!(write_mask & (1 << i)))
4745 continue;
4746
4747 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4748 alu.op = ALU_OP1_MOV;
4749 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4750 alu.src[0].sel = ctx->temp_reg;
4751 alu.src[0].chan = i;
4752 alu.last = (i == lasti);
4753
4754 r = r600_bytecode_add_alu(ctx->bc, &alu);
4755 if (r)
4756 return r;
4757 }
4758 }
4759 return 0;
4760 }
4761
/* Per-channel ALU operation with sources in their natural order and without
 * the trans-only handling (tgsi_op2_s() with swap = 0, trans_only = 0). */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
4766
/* Per-channel ALU operation with the two source operands swapped
 * (tgsi_op2_s() with swap = 1, trans_only = 0). */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4771
/* Per-channel ALU operation emitted with the trans-only handling
 * (tgsi_op2_s() with swap = 0, trans_only = 1). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4776
4777 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4778 {
4779 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4780 struct r600_bytecode_alu alu;
4781 int i, r;
4782 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4783
4784 for (i = 0; i < lasti + 1; i++) {
4785
4786 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4787 continue;
4788 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4789 alu.op = ctx->inst_info->op;
4790
4791 alu.src[0].sel = V_SQ_ALU_SRC_0;
4792
4793 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4794
4795 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4796
4797 if (i == lasti) {
4798 alu.last = 1;
4799 }
4800 r = r600_bytecode_add_alu(ctx->bc, &alu);
4801 if (r)
4802 return r;
4803 }
4804 return 0;
4805
4806 }
4807
4808 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4809 {
4810 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4811 struct r600_bytecode_alu alu;
4812 int i, r;
4813 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4814
4815 for (i = 0; i < lasti + 1; i++) {
4816
4817 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4818 continue;
4819 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4820 alu.op = ALU_OP1_MOV;
4821
4822 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4823
4824 if (i == 1 || i == 3)
4825 r600_bytecode_src_toggle_neg(&alu.src[0]);
4826 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4827
4828 if (i == lasti) {
4829 alu.last = 1;
4830 }
4831 r = r600_bytecode_add_alu(ctx->bc, &alu);
4832 if (r)
4833 return r;
4834 }
4835 return 0;
4836
4837 }
4838
4839 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
4840 {
4841 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4842 struct r600_bytecode_alu alu;
4843 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4844 int i, j, r;
4845
4846 for (i = 0; i <= 3; i++) {
4847 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4848 alu.op = ctx->inst_info->op;
4849
4850 alu.dst.sel = ctx->temp_reg;
4851 alu.dst.chan = i;
4852 alu.dst.write = 1;
4853 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4854 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4855 }
4856
4857 if (i == 3)
4858 alu.last = 1;
4859
4860 r = r600_bytecode_add_alu(ctx->bc, &alu);
4861 if (r)
4862 return r;
4863 }
4864
4865 /* Replicate significand result across channels. */
4866 for (i = 0; i <= 3; i++) {
4867 if (!(write_mask & (1 << i)))
4868 continue;
4869
4870 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4871 alu.op = ALU_OP1_MOV;
4872 alu.src[0].chan = (i & 1) + 2;
4873 alu.src[0].sel = ctx->temp_reg;
4874
4875 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4876 alu.dst.write = 1;
4877 alu.last = 1;
4878 r = r600_bytecode_add_alu(ctx->bc, &alu);
4879 if (r)
4880 return r;
4881 }
4882
4883 for (i = 0; i <= 3; i++) {
4884 if (inst->Dst[1].Register.WriteMask & (1 << i)) {
4885 /* MOV third channels to writemask dst1 */
4886 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4887 alu.op = ALU_OP1_MOV;
4888 alu.src[0].chan = 1;
4889 alu.src[0].sel = ctx->temp_reg;
4890
4891 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
4892 alu.last = 1;
4893 r = r600_bytecode_add_alu(ctx->bc, &alu);
4894 if (r)
4895 return r;
4896 break;
4897 }
4898 }
4899 return 0;
4900 }
4901
4902
4903 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
4904 {
4905 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4906 struct r600_bytecode_alu alu;
4907 int i, c, r;
4908 int write_mask = inst->Dst[0].Register.WriteMask;
4909 int temp_reg = r600_get_temp(ctx);
4910
4911 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
4912 inst->Instruction.Opcode == TGSI_OPCODE_U2D);
4913
4914 for (c = 0; c < 2; c++) {
4915 int dchan = c * 2;
4916 if (write_mask & (0x3 << dchan)) {
4917 /* split into 24-bit int and 8-bit int */
4918 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4919 alu.op = ALU_OP2_AND_INT;
4920 alu.dst.sel = temp_reg;
4921 alu.dst.chan = dchan;
4922 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4923 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4924 alu.src[1].value = 0xffffff00;
4925 alu.dst.write = 1;
4926 r = r600_bytecode_add_alu(ctx->bc, &alu);
4927 if (r)
4928 return r;
4929
4930 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4931 alu.op = ALU_OP2_AND_INT;
4932 alu.dst.sel = temp_reg;
4933 alu.dst.chan = dchan + 1;
4934 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4935 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4936 alu.src[1].value = 0xff;
4937 alu.dst.write = 1;
4938 alu.last = 1;
4939 r = r600_bytecode_add_alu(ctx->bc, &alu);
4940 if (r)
4941 return r;
4942 }
4943 }
4944
4945 for (c = 0; c < 2; c++) {
4946 int dchan = c * 2;
4947 if (write_mask & (0x3 << dchan)) {
4948 for (i = dchan; i <= dchan + 1; i++) {
4949 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4950 alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;
4951
4952 alu.src[0].sel = temp_reg;
4953 alu.src[0].chan = i;
4954 alu.dst.sel = temp_reg;
4955 alu.dst.chan = i;
4956 alu.dst.write = 1;
4957 if (ctx->bc->chip_class == CAYMAN)
4958 alu.last = i == dchan + 1;
4959 else
4960 alu.last = 1; /* trans only ops on evergreen */
4961
4962 r = r600_bytecode_add_alu(ctx->bc, &alu);
4963 if (r)
4964 return r;
4965 }
4966 }
4967 }
4968
4969 for (c = 0; c < 2; c++) {
4970 int dchan = c * 2;
4971 if (write_mask & (0x3 << dchan)) {
4972 for (i = 0; i < 4; i++) {
4973 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4974 alu.op = ALU_OP1_FLT32_TO_FLT64;
4975
4976 alu.src[0].chan = dchan + (i / 2);
4977 if (i == 0 || i == 2)
4978 alu.src[0].sel = temp_reg;
4979 else {
4980 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4981 alu.src[0].value = 0x0;
4982 }
4983 alu.dst.sel = ctx->temp_reg;
4984 alu.dst.chan = i;
4985 alu.last = i == 3;
4986 alu.dst.write = 1;
4987
4988 r = r600_bytecode_add_alu(ctx->bc, &alu);
4989 if (r)
4990 return r;
4991 }
4992
4993 for (i = 0; i <= 1; i++) {
4994 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4995 alu.op = ALU_OP2_ADD_64;
4996
4997 alu.src[0].chan = fp64_switch(i);
4998 alu.src[0].sel = ctx->temp_reg;
4999
5000 alu.src[1].chan = fp64_switch(i + 2);
5001 alu.src[1].sel = ctx->temp_reg;
5002 tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
5003 alu.last = i == 1;
5004
5005 r = r600_bytecode_add_alu(ctx->bc, &alu);
5006 if (r)
5007 return r;
5008 }
5009 }
5010 }
5011
5012 return 0;
5013 }
5014
5015 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
5016 {
5017 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5018 struct r600_bytecode_alu alu;
5019 int i, r;
5020 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5021 int treg = r600_get_temp(ctx);
5022 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
5023 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
5024
5025 /* do a 64->32 into a temp register */
5026 r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
5027 if (r)
5028 return r;
5029
5030 for (i = 0; i <= lasti; i++) {
5031 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5032 continue;
5033 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5034 alu.op = ctx->inst_info->op;
5035
5036 alu.src[0].chan = i;
5037 alu.src[0].sel = treg;
5038 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5039 alu.last = (i == lasti);
5040
5041 r = r600_bytecode_add_alu(ctx->bc, &alu);
5042 if (r)
5043 return r;
5044 }
5045
5046 return 0;
5047 }
5048
5049 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
5050 unsigned op,
5051 int dst_reg,
5052 struct r600_shader_src *src,
5053 bool abs)
5054 {
5055 struct r600_bytecode_alu alu;
5056 const int last_slot = 3;
5057 int r;
5058
5059 /* these have to write the result to X/Y by the looks of it */
5060 for (int i = 0 ; i < last_slot; i++) {
5061 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5062 alu.op = op;
5063
5064 r600_bytecode_src(&alu.src[0], src, 1);
5065 r600_bytecode_src(&alu.src[1], src, 0);
5066
5067 if (abs)
5068 r600_bytecode_src_set_abs(&alu.src[1]);
5069
5070 alu.dst.sel = dst_reg;
5071 alu.dst.chan = i;
5072 alu.dst.write = (i == 0 || i == 1);
5073
5074 if (bc->chip_class != CAYMAN || i == last_slot - 1)
5075 alu.last = 1;
5076 r = r600_bytecode_add_alu(bc, &alu);
5077 if (r)
5078 return r;
5079 }
5080
5081 return 0;
5082 }
5083
5084 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
5085 {
5086 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5087 int i, r;
5088 struct r600_bytecode_alu alu;
5089 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5090 int t1 = ctx->temp_reg;
5091
5092 /* should only be one src regs */
5093 assert(inst->Instruction.NumSrcRegs == 1);
5094
5095 /* only support one double at a time */
5096 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5097 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5098
5099 r = cayman_emit_unary_double_raw(
5100 ctx->bc, ctx->inst_info->op, t1,
5101 &ctx->src[0],
5102 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
5103 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
5104 if (r)
5105 return r;
5106
5107 for (i = 0 ; i <= lasti; i++) {
5108 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5109 continue;
5110 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5111 alu.op = ALU_OP1_MOV;
5112 alu.src[0].sel = t1;
5113 alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
5114 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5115 alu.dst.write = 1;
5116 if (i == lasti)
5117 alu.last = 1;
5118 r = r600_bytecode_add_alu(ctx->bc, &alu);
5119 if (r)
5120 return r;
5121 }
5122 return 0;
5123 }
5124
5125 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5126 {
5127 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5128 int i, j, r;
5129 struct r600_bytecode_alu alu;
5130 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5131
5132 for (i = 0 ; i < last_slot; i++) {
5133 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5134 alu.op = ctx->inst_info->op;
5135 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5136 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5137
5138 /* RSQ should take the absolute value of src */
5139 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5140 r600_bytecode_src_set_abs(&alu.src[j]);
5141 }
5142 }
5143 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5144 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5145
5146 if (i == last_slot - 1)
5147 alu.last = 1;
5148 r = r600_bytecode_add_alu(ctx->bc, &alu);
5149 if (r)
5150 return r;
5151 }
5152 return 0;
5153 }
5154
5155 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
5156 {
5157 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5158 int i, j, k, r;
5159 struct r600_bytecode_alu alu;
5160 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5161 int t1 = ctx->temp_reg;
5162
5163 for (k = 0; k <= lasti; k++) {
5164 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
5165 continue;
5166
5167 for (i = 0 ; i < 4; i++) {
5168 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5169 alu.op = ctx->inst_info->op;
5170 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5171 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
5172 }
5173 alu.dst.sel = t1;
5174 alu.dst.chan = i;
5175 alu.dst.write = (i == k);
5176 if (i == 3)
5177 alu.last = 1;
5178 r = r600_bytecode_add_alu(ctx->bc, &alu);
5179 if (r)
5180 return r;
5181 }
5182 }
5183
5184 for (i = 0 ; i <= lasti; i++) {
5185 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5186 continue;
5187 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5188 alu.op = ALU_OP1_MOV;
5189 alu.src[0].sel = t1;
5190 alu.src[0].chan = i;
5191 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5192 alu.dst.write = 1;
5193 if (i == lasti)
5194 alu.last = 1;
5195 r = r600_bytecode_add_alu(ctx->bc, &alu);
5196 if (r)
5197 return r;
5198 }
5199
5200 return 0;
5201 }
5202
5203
5204 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
5205 {
5206 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5207 int i, j, k, r;
5208 struct r600_bytecode_alu alu;
5209 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5210 int t1 = ctx->temp_reg;
5211
5212 /* t1 would get overwritten below if we actually tried to
5213 * multiply two pairs of doubles at a time. */
5214 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5215 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5216
5217 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5218
5219 for (i = 0; i < 4; i++) {
5220 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5221 alu.op = ctx->inst_info->op;
5222 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5223 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
5224 }
5225 alu.dst.sel = t1;
5226 alu.dst.chan = i;
5227 alu.dst.write = 1;
5228 if (i == 3)
5229 alu.last = 1;
5230 r = r600_bytecode_add_alu(ctx->bc, &alu);
5231 if (r)
5232 return r;
5233 }
5234
5235 for (i = 0; i <= lasti; i++) {
5236 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5237 continue;
5238 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5239 alu.op = ALU_OP1_MOV;
5240 alu.src[0].sel = t1;
5241 alu.src[0].chan = i;
5242 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5243 alu.dst.write = 1;
5244 if (i == lasti)
5245 alu.last = 1;
5246 r = r600_bytecode_add_alu(ctx->bc, &alu);
5247 if (r)
5248 return r;
5249 }
5250
5251 return 0;
5252 }
5253
5254 /*
5255 * Emit RECIP_64 + MUL_64 to implement division.
5256 */
5257 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
5258 {
5259 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5260 int r;
5261 struct r600_bytecode_alu alu;
5262 int t1 = ctx->temp_reg;
5263 int k;
5264
5265 /* Only support one double at a time. This is the same constraint as
5266 * in DMUL lowering. */
5267 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5268 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5269
5270 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5271
5272 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
5273 if (r)
5274 return r;
5275
5276 for (int i = 0; i < 4; i++) {
5277 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5278 alu.op = ALU_OP2_MUL_64;
5279
5280 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
5281
5282 alu.src[1].sel = t1;
5283 alu.src[1].chan = (i == 3) ? 0 : 1;
5284
5285 alu.dst.sel = t1;
5286 alu.dst.chan = i;
5287 alu.dst.write = 1;
5288 if (i == 3)
5289 alu.last = 1;
5290 r = r600_bytecode_add_alu(ctx->bc, &alu);
5291 if (r)
5292 return r;
5293 }
5294
5295 for (int i = 0; i < 2; i++) {
5296 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5297 alu.op = ALU_OP1_MOV;
5298 alu.src[0].sel = t1;
5299 alu.src[0].chan = i;
5300 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
5301 alu.dst.write = 1;
5302 if (i == 1)
5303 alu.last = 1;
5304 r = r600_bytecode_add_alu(ctx->bc, &alu);
5305 if (r)
5306 return r;
5307 }
5308 return 0;
5309 }
5310
5311 /*
5312 * r600 - trunc to -PI..PI range
5313 * r700 - normalize by dividing by 2PI
5314 * see fdo bug 27901
5315 */
5316 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
5317 {
5318 int r;
5319 struct r600_bytecode_alu alu;
5320
5321 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5322 alu.op = ALU_OP3_MULADD;
5323 alu.is_op3 = 1;
5324
5325 alu.dst.chan = 0;
5326 alu.dst.sel = ctx->temp_reg;
5327 alu.dst.write = 1;
5328
5329 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5330
5331 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5332 alu.src[1].chan = 0;
5333 alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
5334 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5335 alu.src[2].chan = 0;
5336 alu.last = 1;
5337 r = r600_bytecode_add_alu(ctx->bc, &alu);
5338 if (r)
5339 return r;
5340
5341 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5342 alu.op = ALU_OP1_FRACT;
5343
5344 alu.dst.chan = 0;
5345 alu.dst.sel = ctx->temp_reg;
5346 alu.dst.write = 1;
5347
5348 alu.src[0].sel = ctx->temp_reg;
5349 alu.src[0].chan = 0;
5350 alu.last = 1;
5351 r = r600_bytecode_add_alu(ctx->bc, &alu);
5352 if (r)
5353 return r;
5354
5355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5356 alu.op = ALU_OP3_MULADD;
5357 alu.is_op3 = 1;
5358
5359 alu.dst.chan = 0;
5360 alu.dst.sel = ctx->temp_reg;
5361 alu.dst.write = 1;
5362
5363 alu.src[0].sel = ctx->temp_reg;
5364 alu.src[0].chan = 0;
5365
5366 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5367 alu.src[1].chan = 0;
5368 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5369 alu.src[2].chan = 0;
5370
5371 if (ctx->bc->chip_class == R600) {
5372 alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
5373 alu.src[2].value = u_bitcast_f2u(-M_PI);
5374 } else {
5375 alu.src[1].sel = V_SQ_ALU_SRC_1;
5376 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5377 alu.src[2].neg = 1;
5378 }
5379
5380 alu.last = 1;
5381 r = r600_bytecode_add_alu(ctx->bc, &alu);
5382 if (r)
5383 return r;
5384 return 0;
5385 }
5386
5387 static int cayman_trig(struct r600_shader_ctx *ctx)
5388 {
5389 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5390 struct r600_bytecode_alu alu;
5391 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5392 int i, r;
5393
5394 r = tgsi_setup_trig(ctx);
5395 if (r)
5396 return r;
5397
5398
5399 for (i = 0; i < last_slot; i++) {
5400 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5401 alu.op = ctx->inst_info->op;
5402 alu.dst.chan = i;
5403
5404 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5405 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5406
5407 alu.src[0].sel = ctx->temp_reg;
5408 alu.src[0].chan = 0;
5409 if (i == last_slot - 1)
5410 alu.last = 1;
5411 r = r600_bytecode_add_alu(ctx->bc, &alu);
5412 if (r)
5413 return r;
5414 }
5415 return 0;
5416 }
5417
5418 static int tgsi_trig(struct r600_shader_ctx *ctx)
5419 {
5420 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5421 struct r600_bytecode_alu alu;
5422 int i, r;
5423 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5424
5425 r = tgsi_setup_trig(ctx);
5426 if (r)
5427 return r;
5428
5429 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5430 alu.op = ctx->inst_info->op;
5431 alu.dst.chan = 0;
5432 alu.dst.sel = ctx->temp_reg;
5433 alu.dst.write = 1;
5434
5435 alu.src[0].sel = ctx->temp_reg;
5436 alu.src[0].chan = 0;
5437 alu.last = 1;
5438 r = r600_bytecode_add_alu(ctx->bc, &alu);
5439 if (r)
5440 return r;
5441
5442 /* replicate result */
5443 for (i = 0; i < lasti + 1; i++) {
5444 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5445 continue;
5446
5447 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5448 alu.op = ALU_OP1_MOV;
5449
5450 alu.src[0].sel = ctx->temp_reg;
5451 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5452 if (i == lasti)
5453 alu.last = 1;
5454 r = r600_bytecode_add_alu(ctx->bc, &alu);
5455 if (r)
5456 return r;
5457 }
5458 return 0;
5459 }
5460
5461 static int tgsi_kill(struct r600_shader_ctx *ctx)
5462 {
5463 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5464 struct r600_bytecode_alu alu;
5465 int i, r;
5466
5467 for (i = 0; i < 4; i++) {
5468 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5469 alu.op = ctx->inst_info->op;
5470
5471 alu.dst.chan = i;
5472
5473 alu.src[0].sel = V_SQ_ALU_SRC_0;
5474
5475 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5476 alu.src[1].sel = V_SQ_ALU_SRC_1;
5477 alu.src[1].neg = 1;
5478 } else {
5479 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5480 }
5481 if (i == 3) {
5482 alu.last = 1;
5483 }
5484 r = r600_bytecode_add_alu(ctx->bc, &alu);
5485 if (r)
5486 return r;
5487 }
5488
5489 /* kill must be last in ALU */
5490 ctx->bc->force_add_cf = 1;
5491 ctx->shader->uses_kill = TRUE;
5492 return 0;
5493 }
5494
5495 static int tgsi_lit(struct r600_shader_ctx *ctx)
5496 {
5497 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5498 struct r600_bytecode_alu alu;
5499 int r;
5500
5501 /* tmp.x = max(src.y, 0.0) */
5502 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5503 alu.op = ALU_OP2_MAX;
5504 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
5505 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5506 alu.src[1].chan = 1;
5507
5508 alu.dst.sel = ctx->temp_reg;
5509 alu.dst.chan = 0;
5510 alu.dst.write = 1;
5511
5512 alu.last = 1;
5513 r = r600_bytecode_add_alu(ctx->bc, &alu);
5514 if (r)
5515 return r;
5516
5517 if (inst->Dst[0].Register.WriteMask & (1 << 2))
5518 {
5519 int chan;
5520 int sel;
5521 unsigned i;
5522
5523 if (ctx->bc->chip_class == CAYMAN) {
5524 for (i = 0; i < 3; i++) {
5525 /* tmp.z = log(tmp.x) */
5526 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5527 alu.op = ALU_OP1_LOG_CLAMPED;
5528 alu.src[0].sel = ctx->temp_reg;
5529 alu.src[0].chan = 0;
5530 alu.dst.sel = ctx->temp_reg;
5531 alu.dst.chan = i;
5532 if (i == 2) {
5533 alu.dst.write = 1;
5534 alu.last = 1;
5535 } else
5536 alu.dst.write = 0;
5537
5538 r = r600_bytecode_add_alu(ctx->bc, &alu);
5539 if (r)
5540 return r;
5541 }
5542 } else {
5543 /* tmp.z = log(tmp.x) */
5544 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5545 alu.op = ALU_OP1_LOG_CLAMPED;
5546 alu.src[0].sel = ctx->temp_reg;
5547 alu.src[0].chan = 0;
5548 alu.dst.sel = ctx->temp_reg;
5549 alu.dst.chan = 2;
5550 alu.dst.write = 1;
5551 alu.last = 1;
5552 r = r600_bytecode_add_alu(ctx->bc, &alu);
5553 if (r)
5554 return r;
5555 }
5556
5557 chan = alu.dst.chan;
5558 sel = alu.dst.sel;
5559
5560 /* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
5561 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5562 alu.op = ALU_OP3_MUL_LIT;
5563 alu.src[0].sel = sel;
5564 alu.src[0].chan = chan;
5565 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
5566 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
5567 alu.dst.sel = ctx->temp_reg;
5568 alu.dst.chan = 0;
5569 alu.dst.write = 1;
5570 alu.is_op3 = 1;
5571 alu.last = 1;
5572 r = r600_bytecode_add_alu(ctx->bc, &alu);
5573 if (r)
5574 return r;
5575
5576 if (ctx->bc->chip_class == CAYMAN) {
5577 for (i = 0; i < 3; i++) {
5578 /* dst.z = exp(tmp.x) */
5579 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5580 alu.op = ALU_OP1_EXP_IEEE;
5581 alu.src[0].sel = ctx->temp_reg;
5582 alu.src[0].chan = 0;
5583 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5584 if (i == 2) {
5585 alu.dst.write = 1;
5586 alu.last = 1;
5587 } else
5588 alu.dst.write = 0;
5589 r = r600_bytecode_add_alu(ctx->bc, &alu);
5590 if (r)
5591 return r;
5592 }
5593 } else {
5594 /* dst.z = exp(tmp.x) */
5595 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5596 alu.op = ALU_OP1_EXP_IEEE;
5597 alu.src[0].sel = ctx->temp_reg;
5598 alu.src[0].chan = 0;
5599 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5600 alu.last = 1;
5601 r = r600_bytecode_add_alu(ctx->bc, &alu);
5602 if (r)
5603 return r;
5604 }
5605 }
5606
5607 /* dst.x, <- 1.0 */
5608 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5609 alu.op = ALU_OP1_MOV;
5610 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
5611 alu.src[0].chan = 0;
5612 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5613 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
5614 r = r600_bytecode_add_alu(ctx->bc, &alu);
5615 if (r)
5616 return r;
5617
5618 /* dst.y = max(src.x, 0.0) */
5619 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5620 alu.op = ALU_OP2_MAX;
5621 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5622 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5623 alu.src[1].chan = 0;
5624 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
5625 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
5626 r = r600_bytecode_add_alu(ctx->bc, &alu);
5627 if (r)
5628 return r;
5629
5630 /* dst.w, <- 1.0 */
5631 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5632 alu.op = ALU_OP1_MOV;
5633 alu.src[0].sel = V_SQ_ALU_SRC_1;
5634 alu.src[0].chan = 0;
5635 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
5636 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
5637 alu.last = 1;
5638 r = r600_bytecode_add_alu(ctx->bc, &alu);
5639 if (r)
5640 return r;
5641
5642 return 0;
5643 }
5644
5645 static int tgsi_rsq(struct r600_shader_ctx *ctx)
5646 {
5647 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5648 struct r600_bytecode_alu alu;
5649 int i, r;
5650
5651 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5652
5653 alu.op = ALU_OP1_RECIPSQRT_IEEE;
5654
5655 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5656 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5657 r600_bytecode_src_set_abs(&alu.src[i]);
5658 }
5659 alu.dst.sel = ctx->temp_reg;
5660 alu.dst.write = 1;
5661 alu.last = 1;
5662 r = r600_bytecode_add_alu(ctx->bc, &alu);
5663 if (r)
5664 return r;
5665 /* replicate result */
5666 return tgsi_helper_tempx_replicate(ctx);
5667 }
5668
5669 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5670 {
5671 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5672 struct r600_bytecode_alu alu;
5673 int i, r;
5674
5675 for (i = 0; i < 4; i++) {
5676 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5677 alu.src[0].sel = ctx->temp_reg;
5678 alu.op = ALU_OP1_MOV;
5679 alu.dst.chan = i;
5680 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5681 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5682 if (i == 3)
5683 alu.last = 1;
5684 r = r600_bytecode_add_alu(ctx->bc, &alu);
5685 if (r)
5686 return r;
5687 }
5688 return 0;
5689 }
5690
5691 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5692 {
5693 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5694 struct r600_bytecode_alu alu;
5695 int i, r;
5696
5697 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5698 alu.op = ctx->inst_info->op;
5699 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5700 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5701 }
5702 alu.dst.sel = ctx->temp_reg;
5703 alu.dst.write = 1;
5704 alu.last = 1;
5705 r = r600_bytecode_add_alu(ctx->bc, &alu);
5706 if (r)
5707 return r;
5708 /* replicate result */
5709 return tgsi_helper_tempx_replicate(ctx);
5710 }
5711
5712 static int cayman_pow(struct r600_shader_ctx *ctx)
5713 {
5714 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5715 int i, r;
5716 struct r600_bytecode_alu alu;
5717 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5718
5719 for (i = 0; i < 3; i++) {
5720 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5721 alu.op = ALU_OP1_LOG_IEEE;
5722 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5723 alu.dst.sel = ctx->temp_reg;
5724 alu.dst.chan = i;
5725 alu.dst.write = 1;
5726 if (i == 2)
5727 alu.last = 1;
5728 r = r600_bytecode_add_alu(ctx->bc, &alu);
5729 if (r)
5730 return r;
5731 }
5732
5733 /* b * LOG2(a) */
5734 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5735 alu.op = ALU_OP2_MUL;
5736 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5737 alu.src[1].sel = ctx->temp_reg;
5738 alu.dst.sel = ctx->temp_reg;
5739 alu.dst.write = 1;
5740 alu.last = 1;
5741 r = r600_bytecode_add_alu(ctx->bc, &alu);
5742 if (r)
5743 return r;
5744
5745 for (i = 0; i < last_slot; i++) {
5746 /* POW(a,b) = EXP2(b * LOG2(a))*/
5747 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5748 alu.op = ALU_OP1_EXP_IEEE;
5749 alu.src[0].sel = ctx->temp_reg;
5750
5751 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5752 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5753 if (i == last_slot - 1)
5754 alu.last = 1;
5755 r = r600_bytecode_add_alu(ctx->bc, &alu);
5756 if (r)
5757 return r;
5758 }
5759 return 0;
5760 }
5761
5762 static int tgsi_pow(struct r600_shader_ctx *ctx)
5763 {
5764 struct r600_bytecode_alu alu;
5765 int r;
5766
5767 /* LOG2(a) */
5768 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5769 alu.op = ALU_OP1_LOG_IEEE;
5770 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5771 alu.dst.sel = ctx->temp_reg;
5772 alu.dst.write = 1;
5773 alu.last = 1;
5774 r = r600_bytecode_add_alu(ctx->bc, &alu);
5775 if (r)
5776 return r;
5777 /* b * LOG2(a) */
5778 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5779 alu.op = ALU_OP2_MUL;
5780 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5781 alu.src[1].sel = ctx->temp_reg;
5782 alu.dst.sel = ctx->temp_reg;
5783 alu.dst.write = 1;
5784 alu.last = 1;
5785 r = r600_bytecode_add_alu(ctx->bc, &alu);
5786 if (r)
5787 return r;
5788 /* POW(a,b) = EXP2(b * LOG2(a))*/
5789 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5790 alu.op = ALU_OP1_EXP_IEEE;
5791 alu.src[0].sel = ctx->temp_reg;
5792 alu.dst.sel = ctx->temp_reg;
5793 alu.dst.write = 1;
5794 alu.last = 1;
5795 r = r600_bytecode_add_alu(ctx->bc, &alu);
5796 if (r)
5797 return r;
5798 return tgsi_helper_tempx_replicate(ctx);
5799 }
5800
5801 static int emit_mul_int_op(struct r600_bytecode *bc,
5802 struct r600_bytecode_alu *alu_src)
5803 {
5804 struct r600_bytecode_alu alu;
5805 int i, r;
5806 alu = *alu_src;
5807 if (bc->chip_class == CAYMAN) {
5808 for (i = 0; i < 4; i++) {
5809 alu.dst.chan = i;
5810 alu.dst.write = (i == alu_src->dst.chan);
5811 alu.last = (i == 3);
5812
5813 r = r600_bytecode_add_alu(bc, &alu);
5814 if (r)
5815 return r;
5816 }
5817 } else {
5818 alu.last = 1;
5819 r = r600_bytecode_add_alu(bc, &alu);
5820 if (r)
5821 return r;
5822 }
5823 return 0;
5824 }
5825
5826 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5827 {
5828 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5829 struct r600_bytecode_alu alu;
5830 int i, r, j;
5831 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5832 int lasti = tgsi_last_instruction(write_mask);
5833 int tmp0 = ctx->temp_reg;
5834 int tmp1 = r600_get_temp(ctx);
5835 int tmp2 = r600_get_temp(ctx);
5836 int tmp3 = r600_get_temp(ctx);
5837 int tmp4 = 0;
5838
5839 /* Use additional temp if dst register and src register are the same */
5840 if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
5841 inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
5842 tmp4 = r600_get_temp(ctx);
5843 }
5844
5845 /* Unsigned path:
5846 *
5847 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5848 *
5849 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
5850 * 2. tmp0.z = lo (tmp0.x * src2)
5851 * 3. tmp0.w = -tmp0.z
5852 * 4. tmp0.y = hi (tmp0.x * src2)
5853 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
5854 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
5855 * 7. tmp1.x = tmp0.x - tmp0.w
5856 * 8. tmp1.y = tmp0.x + tmp0.w
5857 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5858 * 10. tmp0.z = hi(tmp0.x * src1) = q
5859 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
5860 *
5861 * 12. tmp0.w = src1 - tmp0.y = r
5862 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
5863 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
5864 *
5865 * if DIV
5866 *
5867 * 15. tmp1.z = tmp0.z + 1 = q + 1
5868 * 16. tmp1.w = tmp0.z - 1 = q - 1
5869 *
5870 * else MOD
5871 *
5872 * 15. tmp1.z = tmp0.w - src2 = r - src2
5873 * 16. tmp1.w = tmp0.w + src2 = r + src2
5874 *
5875 * endif
5876 *
5877 * 17. tmp1.x = tmp1.x & tmp1.y
5878 *
5879 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5880 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5881 *
5882 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5883 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5884 *
5885 * Signed path:
5886 *
5887 * Same as unsigned, using abs values of the operands,
5888 * and fixing the sign of the result in the end.
5889 */
5890
5891 for (i = 0; i < 4; i++) {
5892 if (!(write_mask & (1<<i)))
5893 continue;
5894
5895 if (signed_op) {
5896
5897 /* tmp2.x = -src0 */
5898 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5899 alu.op = ALU_OP2_SUB_INT;
5900
5901 alu.dst.sel = tmp2;
5902 alu.dst.chan = 0;
5903 alu.dst.write = 1;
5904
5905 alu.src[0].sel = V_SQ_ALU_SRC_0;
5906
5907 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5908
5909 alu.last = 1;
5910 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5911 return r;
5912
5913 /* tmp2.y = -src1 */
5914 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5915 alu.op = ALU_OP2_SUB_INT;
5916
5917 alu.dst.sel = tmp2;
5918 alu.dst.chan = 1;
5919 alu.dst.write = 1;
5920
5921 alu.src[0].sel = V_SQ_ALU_SRC_0;
5922
5923 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5924
5925 alu.last = 1;
5926 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5927 return r;
5928
5929 /* tmp2.z sign bit is set if src0 and src2 signs are different */
5930 /* it will be a sign of the quotient */
5931 if (!mod) {
5932
5933 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5934 alu.op = ALU_OP2_XOR_INT;
5935
5936 alu.dst.sel = tmp2;
5937 alu.dst.chan = 2;
5938 alu.dst.write = 1;
5939
5940 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5941 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5942
5943 alu.last = 1;
5944 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5945 return r;
5946 }
5947
5948 /* tmp2.x = |src0| */
5949 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5950 alu.op = ALU_OP3_CNDGE_INT;
5951 alu.is_op3 = 1;
5952
5953 alu.dst.sel = tmp2;
5954 alu.dst.chan = 0;
5955 alu.dst.write = 1;
5956
5957 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5958 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5959 alu.src[2].sel = tmp2;
5960 alu.src[2].chan = 0;
5961
5962 alu.last = 1;
5963 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5964 return r;
5965
5966 /* tmp2.y = |src1| */
5967 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5968 alu.op = ALU_OP3_CNDGE_INT;
5969 alu.is_op3 = 1;
5970
5971 alu.dst.sel = tmp2;
5972 alu.dst.chan = 1;
5973 alu.dst.write = 1;
5974
5975 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5976 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5977 alu.src[2].sel = tmp2;
5978 alu.src[2].chan = 1;
5979
5980 alu.last = 1;
5981 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5982 return r;
5983
5984 }
5985
5986 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
5987 if (ctx->bc->chip_class == CAYMAN) {
5988 /* tmp3.x = u2f(src2) */
5989 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5990 alu.op = ALU_OP1_UINT_TO_FLT;
5991
5992 alu.dst.sel = tmp3;
5993 alu.dst.chan = 0;
5994 alu.dst.write = 1;
5995
5996 if (signed_op) {
5997 alu.src[0].sel = tmp2;
5998 alu.src[0].chan = 1;
5999 } else {
6000 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6001 }
6002
6003 alu.last = 1;
6004 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6005 return r;
6006
6007 /* tmp0.x = recip(tmp3.x) */
6008 for (j = 0 ; j < 3; j++) {
6009 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6010 alu.op = ALU_OP1_RECIP_IEEE;
6011
6012 alu.dst.sel = tmp0;
6013 alu.dst.chan = j;
6014 alu.dst.write = (j == 0);
6015
6016 alu.src[0].sel = tmp3;
6017 alu.src[0].chan = 0;
6018
6019 if (j == 2)
6020 alu.last = 1;
6021 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6022 return r;
6023 }
6024
6025 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6026 alu.op = ALU_OP2_MUL;
6027
6028 alu.src[0].sel = tmp0;
6029 alu.src[0].chan = 0;
6030
6031 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6032 alu.src[1].value = 0x4f800000;
6033
6034 alu.dst.sel = tmp3;
6035 alu.dst.write = 1;
6036 alu.last = 1;
6037 r = r600_bytecode_add_alu(ctx->bc, &alu);
6038 if (r)
6039 return r;
6040
6041 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6042 alu.op = ALU_OP1_FLT_TO_UINT;
6043
6044 alu.dst.sel = tmp0;
6045 alu.dst.chan = 0;
6046 alu.dst.write = 1;
6047
6048 alu.src[0].sel = tmp3;
6049 alu.src[0].chan = 0;
6050
6051 alu.last = 1;
6052 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6053 return r;
6054
6055 } else {
6056 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6057 alu.op = ALU_OP1_RECIP_UINT;
6058
6059 alu.dst.sel = tmp0;
6060 alu.dst.chan = 0;
6061 alu.dst.write = 1;
6062
6063 if (signed_op) {
6064 alu.src[0].sel = tmp2;
6065 alu.src[0].chan = 1;
6066 } else {
6067 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6068 }
6069
6070 alu.last = 1;
6071 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6072 return r;
6073 }
6074
6075 /* 2. tmp0.z = lo (tmp0.x * src2) */
6076 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6077 alu.op = ALU_OP2_MULLO_UINT;
6078
6079 alu.dst.sel = tmp0;
6080 alu.dst.chan = 2;
6081 alu.dst.write = 1;
6082
6083 alu.src[0].sel = tmp0;
6084 alu.src[0].chan = 0;
6085 if (signed_op) {
6086 alu.src[1].sel = tmp2;
6087 alu.src[1].chan = 1;
6088 } else {
6089 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6090 }
6091
6092 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6093 return r;
6094
6095 /* 3. tmp0.w = -tmp0.z */
6096 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6097 alu.op = ALU_OP2_SUB_INT;
6098
6099 alu.dst.sel = tmp0;
6100 alu.dst.chan = 3;
6101 alu.dst.write = 1;
6102
6103 alu.src[0].sel = V_SQ_ALU_SRC_0;
6104 alu.src[1].sel = tmp0;
6105 alu.src[1].chan = 2;
6106
6107 alu.last = 1;
6108 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6109 return r;
6110
6111 /* 4. tmp0.y = hi (tmp0.x * src2) */
6112 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6113 alu.op = ALU_OP2_MULHI_UINT;
6114
6115 alu.dst.sel = tmp0;
6116 alu.dst.chan = 1;
6117 alu.dst.write = 1;
6118
6119 alu.src[0].sel = tmp0;
6120 alu.src[0].chan = 0;
6121
6122 if (signed_op) {
6123 alu.src[1].sel = tmp2;
6124 alu.src[1].chan = 1;
6125 } else {
6126 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6127 }
6128
6129 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6130 return r;
6131
6132 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
6133 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6134 alu.op = ALU_OP3_CNDE_INT;
6135 alu.is_op3 = 1;
6136
6137 alu.dst.sel = tmp0;
6138 alu.dst.chan = 2;
6139 alu.dst.write = 1;
6140
6141 alu.src[0].sel = tmp0;
6142 alu.src[0].chan = 1;
6143 alu.src[1].sel = tmp0;
6144 alu.src[1].chan = 3;
6145 alu.src[2].sel = tmp0;
6146 alu.src[2].chan = 2;
6147
6148 alu.last = 1;
6149 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6150 return r;
6151
6152 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
6153 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6154 alu.op = ALU_OP2_MULHI_UINT;
6155
6156 alu.dst.sel = tmp0;
6157 alu.dst.chan = 3;
6158 alu.dst.write = 1;
6159
6160 alu.src[0].sel = tmp0;
6161 alu.src[0].chan = 2;
6162
6163 alu.src[1].sel = tmp0;
6164 alu.src[1].chan = 0;
6165
6166 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6167 return r;
6168
6169 /* 7. tmp1.x = tmp0.x - tmp0.w */
6170 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6171 alu.op = ALU_OP2_SUB_INT;
6172
6173 alu.dst.sel = tmp1;
6174 alu.dst.chan = 0;
6175 alu.dst.write = 1;
6176
6177 alu.src[0].sel = tmp0;
6178 alu.src[0].chan = 0;
6179 alu.src[1].sel = tmp0;
6180 alu.src[1].chan = 3;
6181
6182 alu.last = 1;
6183 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6184 return r;
6185
6186 /* 8. tmp1.y = tmp0.x + tmp0.w */
6187 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6188 alu.op = ALU_OP2_ADD_INT;
6189
6190 alu.dst.sel = tmp1;
6191 alu.dst.chan = 1;
6192 alu.dst.write = 1;
6193
6194 alu.src[0].sel = tmp0;
6195 alu.src[0].chan = 0;
6196 alu.src[1].sel = tmp0;
6197 alu.src[1].chan = 3;
6198
6199 alu.last = 1;
6200 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6201 return r;
6202
6203 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6204 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6205 alu.op = ALU_OP3_CNDE_INT;
6206 alu.is_op3 = 1;
6207
6208 alu.dst.sel = tmp0;
6209 alu.dst.chan = 0;
6210 alu.dst.write = 1;
6211
6212 alu.src[0].sel = tmp0;
6213 alu.src[0].chan = 1;
6214 alu.src[1].sel = tmp1;
6215 alu.src[1].chan = 1;
6216 alu.src[2].sel = tmp1;
6217 alu.src[2].chan = 0;
6218
6219 alu.last = 1;
6220 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6221 return r;
6222
6223 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
6224 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6225 alu.op = ALU_OP2_MULHI_UINT;
6226
6227 alu.dst.sel = tmp0;
6228 alu.dst.chan = 2;
6229 alu.dst.write = 1;
6230
6231 alu.src[0].sel = tmp0;
6232 alu.src[0].chan = 0;
6233
6234 if (signed_op) {
6235 alu.src[1].sel = tmp2;
6236 alu.src[1].chan = 0;
6237 } else {
6238 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6239 }
6240
6241 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6242 return r;
6243
6244 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
6245 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6246 alu.op = ALU_OP2_MULLO_UINT;
6247
6248 alu.dst.sel = tmp0;
6249 alu.dst.chan = 1;
6250 alu.dst.write = 1;
6251
6252 if (signed_op) {
6253 alu.src[0].sel = tmp2;
6254 alu.src[0].chan = 1;
6255 } else {
6256 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6257 }
6258
6259 alu.src[1].sel = tmp0;
6260 alu.src[1].chan = 2;
6261
6262 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6263 return r;
6264
6265 /* 12. tmp0.w = src1 - tmp0.y = r */
6266 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6267 alu.op = ALU_OP2_SUB_INT;
6268
6269 alu.dst.sel = tmp0;
6270 alu.dst.chan = 3;
6271 alu.dst.write = 1;
6272
6273 if (signed_op) {
6274 alu.src[0].sel = tmp2;
6275 alu.src[0].chan = 0;
6276 } else {
6277 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6278 }
6279
6280 alu.src[1].sel = tmp0;
6281 alu.src[1].chan = 1;
6282
6283 alu.last = 1;
6284 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6285 return r;
6286
6287 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
6288 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6289 alu.op = ALU_OP2_SETGE_UINT;
6290
6291 alu.dst.sel = tmp1;
6292 alu.dst.chan = 0;
6293 alu.dst.write = 1;
6294
6295 alu.src[0].sel = tmp0;
6296 alu.src[0].chan = 3;
6297 if (signed_op) {
6298 alu.src[1].sel = tmp2;
6299 alu.src[1].chan = 1;
6300 } else {
6301 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6302 }
6303
6304 alu.last = 1;
6305 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6306 return r;
6307
6308 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
6309 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6310 alu.op = ALU_OP2_SETGE_UINT;
6311
6312 alu.dst.sel = tmp1;
6313 alu.dst.chan = 1;
6314 alu.dst.write = 1;
6315
6316 if (signed_op) {
6317 alu.src[0].sel = tmp2;
6318 alu.src[0].chan = 0;
6319 } else {
6320 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6321 }
6322
6323 alu.src[1].sel = tmp0;
6324 alu.src[1].chan = 1;
6325
6326 alu.last = 1;
6327 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6328 return r;
6329
6330 if (mod) { /* UMOD */
6331
6332 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
6333 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6334 alu.op = ALU_OP2_SUB_INT;
6335
6336 alu.dst.sel = tmp1;
6337 alu.dst.chan = 2;
6338 alu.dst.write = 1;
6339
6340 alu.src[0].sel = tmp0;
6341 alu.src[0].chan = 3;
6342
6343 if (signed_op) {
6344 alu.src[1].sel = tmp2;
6345 alu.src[1].chan = 1;
6346 } else {
6347 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6348 }
6349
6350 alu.last = 1;
6351 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6352 return r;
6353
6354 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
6355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6356 alu.op = ALU_OP2_ADD_INT;
6357
6358 alu.dst.sel = tmp1;
6359 alu.dst.chan = 3;
6360 alu.dst.write = 1;
6361
6362 alu.src[0].sel = tmp0;
6363 alu.src[0].chan = 3;
6364 if (signed_op) {
6365 alu.src[1].sel = tmp2;
6366 alu.src[1].chan = 1;
6367 } else {
6368 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6369 }
6370
6371 alu.last = 1;
6372 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6373 return r;
6374
6375 } else { /* UDIV */
6376
6377 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
6378 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6379 alu.op = ALU_OP2_ADD_INT;
6380
6381 alu.dst.sel = tmp1;
6382 alu.dst.chan = 2;
6383 alu.dst.write = 1;
6384
6385 alu.src[0].sel = tmp0;
6386 alu.src[0].chan = 2;
6387 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6388
6389 alu.last = 1;
6390 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6391 return r;
6392
6393 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
6394 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6395 alu.op = ALU_OP2_ADD_INT;
6396
6397 alu.dst.sel = tmp1;
6398 alu.dst.chan = 3;
6399 alu.dst.write = 1;
6400
6401 alu.src[0].sel = tmp0;
6402 alu.src[0].chan = 2;
6403 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6404
6405 alu.last = 1;
6406 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6407 return r;
6408
6409 }
6410
6411 /* 17. tmp1.x = tmp1.x & tmp1.y */
6412 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6413 alu.op = ALU_OP2_AND_INT;
6414
6415 alu.dst.sel = tmp1;
6416 alu.dst.chan = 0;
6417 alu.dst.write = 1;
6418
6419 alu.src[0].sel = tmp1;
6420 alu.src[0].chan = 0;
6421 alu.src[1].sel = tmp1;
6422 alu.src[1].chan = 1;
6423
6424 alu.last = 1;
6425 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6426 return r;
6427
6428 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
6429 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
6430 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6431 alu.op = ALU_OP3_CNDE_INT;
6432 alu.is_op3 = 1;
6433
6434 alu.dst.sel = tmp0;
6435 alu.dst.chan = 2;
6436 alu.dst.write = 1;
6437
6438 alu.src[0].sel = tmp1;
6439 alu.src[0].chan = 0;
6440 alu.src[1].sel = tmp0;
6441 alu.src[1].chan = mod ? 3 : 2;
6442 alu.src[2].sel = tmp1;
6443 alu.src[2].chan = 2;
6444
6445 alu.last = 1;
6446 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6447 return r;
6448
6449 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6450 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6451 alu.op = ALU_OP3_CNDE_INT;
6452 alu.is_op3 = 1;
6453
6454 if (signed_op) {
6455 alu.dst.sel = tmp0;
6456 alu.dst.chan = 2;
6457 alu.dst.write = 1;
6458 } else {
6459 if (tmp4 > 0) {
6460 alu.dst.sel = tmp4;
6461 alu.dst.chan = i;
6462 alu.dst.write = 1;
6463 } else {
6464 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6465 }
6466 }
6467
6468 alu.src[0].sel = tmp1;
6469 alu.src[0].chan = 1;
6470 alu.src[1].sel = tmp1;
6471 alu.src[1].chan = 3;
6472 alu.src[2].sel = tmp0;
6473 alu.src[2].chan = 2;
6474
6475 alu.last = 1;
6476 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6477 return r;
6478
6479 if (signed_op) {
6480
6481 /* fix the sign of the result */
6482
6483 if (mod) {
6484
6485 /* tmp0.x = -tmp0.z */
6486 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6487 alu.op = ALU_OP2_SUB_INT;
6488
6489 alu.dst.sel = tmp0;
6490 alu.dst.chan = 0;
6491 alu.dst.write = 1;
6492
6493 alu.src[0].sel = V_SQ_ALU_SRC_0;
6494 alu.src[1].sel = tmp0;
6495 alu.src[1].chan = 2;
6496
6497 alu.last = 1;
6498 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6499 return r;
6500
6501 /* sign of the remainder is the same as the sign of src0 */
6502 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6503 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6504 alu.op = ALU_OP3_CNDGE_INT;
6505 alu.is_op3 = 1;
6506
6507 if (tmp4 > 0) {
6508 alu.dst.sel = tmp4;
6509 alu.dst.chan = i;
6510 alu.dst.write = 1;
6511 } else {
6512 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6513 }
6514
6515 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6516 alu.src[1].sel = tmp0;
6517 alu.src[1].chan = 2;
6518 alu.src[2].sel = tmp0;
6519 alu.src[2].chan = 0;
6520
6521 alu.last = 1;
6522 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6523 return r;
6524
6525 } else {
6526
6527 /* tmp0.x = -tmp0.z */
6528 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6529 alu.op = ALU_OP2_SUB_INT;
6530
6531 alu.dst.sel = tmp0;
6532 alu.dst.chan = 0;
6533 alu.dst.write = 1;
6534
6535 alu.src[0].sel = V_SQ_ALU_SRC_0;
6536 alu.src[1].sel = tmp0;
6537 alu.src[1].chan = 2;
6538
6539 alu.last = 1;
6540 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6541 return r;
6542
6543 /* fix the quotient sign (same as the sign of src0*src1) */
6544 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6545 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6546 alu.op = ALU_OP3_CNDGE_INT;
6547 alu.is_op3 = 1;
6548
6549 if (tmp4 > 0) {
6550 alu.dst.sel = tmp4;
6551 alu.dst.chan = i;
6552 alu.dst.write = 1;
6553 } else {
6554 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6555 }
6556
6557 alu.src[0].sel = tmp2;
6558 alu.src[0].chan = 2;
6559 alu.src[1].sel = tmp0;
6560 alu.src[1].chan = 2;
6561 alu.src[2].sel = tmp0;
6562 alu.src[2].chan = 0;
6563
6564 alu.last = 1;
6565 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6566 return r;
6567 }
6568 }
6569 }
6570
6571 if (tmp4 > 0) {
6572 for (i = 0; i <= lasti; ++i) {
6573 if (!(write_mask & (1<<i)))
6574 continue;
6575
6576 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6577 alu.op = ALU_OP1_MOV;
6578 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6579 alu.src[0].sel = tmp4;
6580 alu.src[0].chan = i;
6581
6582 if (i == lasti)
6583 alu.last = 1;
6584 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6585 return r;
6586 }
6587 }
6588
6589 return 0;
6590 }
6591
/* Unsigned integer division: tgsi_divmod() in quotient mode, unsigned operands. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int operands_signed = 0;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6596
/* Unsigned integer modulo: tgsi_divmod() in remainder mode, unsigned operands. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int operands_signed = 0;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6601
/* Signed integer division: tgsi_divmod() in quotient mode, signed operands. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int operands_signed = 1;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6606
/* Signed integer modulo: tgsi_divmod() in remainder mode, signed operands. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int operands_signed = 1;

	return tgsi_divmod(ctx, want_remainder, operands_signed);
}
6611
6612
6613 static int tgsi_f2i(struct r600_shader_ctx *ctx)
6614 {
6615 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6616 struct r600_bytecode_alu alu;
6617 int i, r;
6618 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6619 int last_inst = tgsi_last_instruction(write_mask);
6620
6621 for (i = 0; i < 4; i++) {
6622 if (!(write_mask & (1<<i)))
6623 continue;
6624
6625 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6626 alu.op = ALU_OP1_TRUNC;
6627
6628 alu.dst.sel = ctx->temp_reg;
6629 alu.dst.chan = i;
6630 alu.dst.write = 1;
6631
6632 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6633 if (i == last_inst)
6634 alu.last = 1;
6635 r = r600_bytecode_add_alu(ctx->bc, &alu);
6636 if (r)
6637 return r;
6638 }
6639
6640 for (i = 0; i < 4; i++) {
6641 if (!(write_mask & (1<<i)))
6642 continue;
6643
6644 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6645 alu.op = ctx->inst_info->op;
6646
6647 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6648
6649 alu.src[0].sel = ctx->temp_reg;
6650 alu.src[0].chan = i;
6651
6652 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6653 alu.last = 1;
6654 r = r600_bytecode_add_alu(ctx->bc, &alu);
6655 if (r)
6656 return r;
6657 }
6658
6659 return 0;
6660 }
6661
6662 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6663 {
6664 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6665 struct r600_bytecode_alu alu;
6666 int i, r;
6667 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6668 int last_inst = tgsi_last_instruction(write_mask);
6669
6670 /* tmp = -src */
6671 for (i = 0; i < 4; i++) {
6672 if (!(write_mask & (1<<i)))
6673 continue;
6674
6675 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6676 alu.op = ALU_OP2_SUB_INT;
6677
6678 alu.dst.sel = ctx->temp_reg;
6679 alu.dst.chan = i;
6680 alu.dst.write = 1;
6681
6682 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6683 alu.src[0].sel = V_SQ_ALU_SRC_0;
6684
6685 if (i == last_inst)
6686 alu.last = 1;
6687 r = r600_bytecode_add_alu(ctx->bc, &alu);
6688 if (r)
6689 return r;
6690 }
6691
6692 /* dst = (src >= 0 ? src : tmp) */
6693 for (i = 0; i < 4; i++) {
6694 if (!(write_mask & (1<<i)))
6695 continue;
6696
6697 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6698 alu.op = ALU_OP3_CNDGE_INT;
6699 alu.is_op3 = 1;
6700 alu.dst.write = 1;
6701
6702 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6703
6704 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6705 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6706 alu.src[2].sel = ctx->temp_reg;
6707 alu.src[2].chan = i;
6708
6709 if (i == last_inst)
6710 alu.last = 1;
6711 r = r600_bytecode_add_alu(ctx->bc, &alu);
6712 if (r)
6713 return r;
6714 }
6715 return 0;
6716 }
6717
6718 static int tgsi_issg(struct r600_shader_ctx *ctx)
6719 {
6720 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6721 struct r600_bytecode_alu alu;
6722 int i, r;
6723 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6724 int last_inst = tgsi_last_instruction(write_mask);
6725
6726 /* tmp = (src >= 0 ? src : -1) */
6727 for (i = 0; i < 4; i++) {
6728 if (!(write_mask & (1<<i)))
6729 continue;
6730
6731 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6732 alu.op = ALU_OP3_CNDGE_INT;
6733 alu.is_op3 = 1;
6734
6735 alu.dst.sel = ctx->temp_reg;
6736 alu.dst.chan = i;
6737 alu.dst.write = 1;
6738
6739 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6740 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6741 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6742
6743 if (i == last_inst)
6744 alu.last = 1;
6745 r = r600_bytecode_add_alu(ctx->bc, &alu);
6746 if (r)
6747 return r;
6748 }
6749
6750 /* dst = (tmp > 0 ? 1 : tmp) */
6751 for (i = 0; i < 4; i++) {
6752 if (!(write_mask & (1<<i)))
6753 continue;
6754
6755 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6756 alu.op = ALU_OP3_CNDGT_INT;
6757 alu.is_op3 = 1;
6758 alu.dst.write = 1;
6759
6760 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6761
6762 alu.src[0].sel = ctx->temp_reg;
6763 alu.src[0].chan = i;
6764
6765 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6766
6767 alu.src[2].sel = ctx->temp_reg;
6768 alu.src[2].chan = i;
6769
6770 if (i == last_inst)
6771 alu.last = 1;
6772 r = r600_bytecode_add_alu(ctx->bc, &alu);
6773 if (r)
6774 return r;
6775 }
6776 return 0;
6777 }
6778
6779
6780
6781 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6782 {
6783 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6784 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6785 int last_inst = tgsi_last_instruction(write_mask);
6786 struct r600_bytecode_alu alu;
6787 int i, r;
6788
6789 /* tmp = (src > 0 ? 1 : src) */
6790 for (i = 0; i <= last_inst; i++) {
6791 if (!(write_mask & (1 << i)))
6792 continue;
6793 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6794 alu.op = ALU_OP3_CNDGT;
6795 alu.is_op3 = 1;
6796
6797 alu.dst.sel = ctx->temp_reg;
6798 alu.dst.chan = i;
6799
6800 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6801 alu.src[1].sel = V_SQ_ALU_SRC_1;
6802 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6803
6804 if (i == last_inst)
6805 alu.last = 1;
6806 r = r600_bytecode_add_alu(ctx->bc, &alu);
6807 if (r)
6808 return r;
6809 }
6810
6811 /* dst = (-tmp > 0 ? -1 : tmp) */
6812 for (i = 0; i <= last_inst; i++) {
6813 if (!(write_mask & (1 << i)))
6814 continue;
6815 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6816 alu.op = ALU_OP3_CNDGT;
6817 alu.is_op3 = 1;
6818 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6819
6820 alu.src[0].sel = ctx->temp_reg;
6821 alu.src[0].chan = i;
6822 alu.src[0].neg = 1;
6823
6824 alu.src[1].sel = V_SQ_ALU_SRC_1;
6825 alu.src[1].neg = 1;
6826
6827 alu.src[2].sel = ctx->temp_reg;
6828 alu.src[2].chan = i;
6829
6830 if (i == last_inst)
6831 alu.last = 1;
6832 r = r600_bytecode_add_alu(ctx->bc, &alu);
6833 if (r)
6834 return r;
6835 }
6836 return 0;
6837 }
6838
6839 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6840 {
6841 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6842 struct r600_bytecode_alu alu;
6843 int i, r, t1, t2;
6844
6845 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6846 int last_inst = tgsi_last_instruction(write_mask);
6847
6848 t1 = r600_get_temp(ctx);
6849
6850 for (i = 0; i < 4; i++) {
6851 if (!(write_mask & (1<<i)))
6852 continue;
6853
6854 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6855 alu.op = ALU_OP2_SETGE_INT;
6856 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6857 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6858 alu.src[1].value = 32;
6859 alu.dst.sel = ctx->temp_reg;
6860 alu.dst.chan = i;
6861 alu.dst.write = 1;
6862 alu.last = i == last_inst;
6863 r = r600_bytecode_add_alu(ctx->bc, &alu);
6864 if (r)
6865 return r;
6866 }
6867
6868 for (i = 0; i < 4; i++) {
6869 if (!(write_mask & (1<<i)))
6870 continue;
6871
6872 /* create mask tmp */
6873 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6874 alu.op = ALU_OP2_BFM_INT;
6875 alu.dst.sel = t1;
6876 alu.dst.chan = i;
6877 alu.dst.write = 1;
6878 alu.last = i == last_inst;
6879
6880 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6881 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6882
6883 r = r600_bytecode_add_alu(ctx->bc, &alu);
6884 if (r)
6885 return r;
6886 }
6887
6888 t2 = r600_get_temp(ctx);
6889
6890 for (i = 0; i < 4; i++) {
6891 if (!(write_mask & (1<<i)))
6892 continue;
6893
6894 /* shift insert left */
6895 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6896 alu.op = ALU_OP2_LSHL_INT;
6897 alu.dst.sel = t2;
6898 alu.dst.chan = i;
6899 alu.dst.write = 1;
6900 alu.last = i == last_inst;
6901
6902 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6903 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6904
6905 r = r600_bytecode_add_alu(ctx->bc, &alu);
6906 if (r)
6907 return r;
6908 }
6909
6910 for (i = 0; i < 4; i++) {
6911 if (!(write_mask & (1<<i)))
6912 continue;
6913
6914 /* actual bitfield insert */
6915 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6916 alu.op = ALU_OP3_BFI_INT;
6917 alu.is_op3 = 1;
6918 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6919 alu.dst.chan = i;
6920 alu.dst.write = 1;
6921 alu.last = i == last_inst;
6922
6923 alu.src[0].sel = t1;
6924 alu.src[0].chan = i;
6925 alu.src[1].sel = t2;
6926 alu.src[1].chan = i;
6927 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6928
6929 r = r600_bytecode_add_alu(ctx->bc, &alu);
6930 if (r)
6931 return r;
6932 }
6933
6934 for (i = 0; i < 4; i++) {
6935 if (!(write_mask & (1<<i)))
6936 continue;
6937 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6938 alu.op = ALU_OP3_CNDE_INT;
6939 alu.is_op3 = 1;
6940 alu.src[0].sel = ctx->temp_reg;
6941 alu.src[0].chan = i;
6942 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6943
6944 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6945
6946 alu.src[1].sel = alu.dst.sel;
6947 alu.src[1].chan = i;
6948
6949 alu.last = i == last_inst;
6950 r = r600_bytecode_add_alu(ctx->bc, &alu);
6951 if (r)
6952 return r;
6953 }
6954 return 0;
6955 }
6956
6957 static int tgsi_msb(struct r600_shader_ctx *ctx)
6958 {
6959 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6960 struct r600_bytecode_alu alu;
6961 int i, r, t1, t2;
6962
6963 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6964 int last_inst = tgsi_last_instruction(write_mask);
6965
6966 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6967 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6968
6969 t1 = ctx->temp_reg;
6970
6971 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6972 for (i = 0; i < 4; i++) {
6973 if (!(write_mask & (1<<i)))
6974 continue;
6975
6976 /* t1 = FFBH_INT / FFBH_UINT */
6977 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6978 alu.op = ctx->inst_info->op;
6979 alu.dst.sel = t1;
6980 alu.dst.chan = i;
6981 alu.dst.write = 1;
6982 alu.last = i == last_inst;
6983
6984 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6985
6986 r = r600_bytecode_add_alu(ctx->bc, &alu);
6987 if (r)
6988 return r;
6989 }
6990
6991 t2 = r600_get_temp(ctx);
6992
6993 for (i = 0; i < 4; i++) {
6994 if (!(write_mask & (1<<i)))
6995 continue;
6996
6997 /* t2 = 31 - t1 */
6998 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6999 alu.op = ALU_OP2_SUB_INT;
7000 alu.dst.sel = t2;
7001 alu.dst.chan = i;
7002 alu.dst.write = 1;
7003 alu.last = i == last_inst;
7004
7005 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
7006 alu.src[0].value = 31;
7007 alu.src[1].sel = t1;
7008 alu.src[1].chan = i;
7009
7010 r = r600_bytecode_add_alu(ctx->bc, &alu);
7011 if (r)
7012 return r;
7013 }
7014
7015 for (i = 0; i < 4; i++) {
7016 if (!(write_mask & (1<<i)))
7017 continue;
7018
7019 /* result = t1 >= 0 ? t2 : t1 */
7020 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7021 alu.op = ALU_OP3_CNDGE_INT;
7022 alu.is_op3 = 1;
7023 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7024 alu.dst.chan = i;
7025 alu.dst.write = 1;
7026 alu.last = i == last_inst;
7027
7028 alu.src[0].sel = t1;
7029 alu.src[0].chan = i;
7030 alu.src[1].sel = t2;
7031 alu.src[1].chan = i;
7032 alu.src[2].sel = t1;
7033 alu.src[2].chan = i;
7034
7035 r = r600_bytecode_add_alu(ctx->bc, &alu);
7036 if (r)
7037 return r;
7038 }
7039
7040 return 0;
7041 }
7042
7043 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
7044 {
7045 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7046 struct r600_bytecode_alu alu;
7047 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
7048 unsigned location;
7049 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
7050
7051 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
7052
7053 /* Interpolators have been marked for use already by allocate_system_value_inputs */
7054 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7055 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7056 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
7057 }
7058 else {
7059 location = TGSI_INTERPOLATE_LOC_CENTROID;
7060 ctx->shader->input[input].uses_interpolate_at_centroid = 1;
7061 }
7062
7063 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
7064 if (k < 0)
7065 k = 0;
7066 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
7067 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
7068
7069 /* NOTE: currently offset is not perspective correct */
7070 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7071 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7072 int sample_gpr = -1;
7073 int gradientsH, gradientsV;
7074 struct r600_bytecode_tex tex;
7075
7076 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7077 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
7078 }
7079
7080 gradientsH = r600_get_temp(ctx);
7081 gradientsV = r600_get_temp(ctx);
7082 for (i = 0; i < 2; i++) {
7083 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7084 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
7085 tex.src_gpr = interp_gpr;
7086 tex.src_sel_x = interp_base_chan + 0;
7087 tex.src_sel_y = interp_base_chan + 1;
7088 tex.src_sel_z = 0;
7089 tex.src_sel_w = 0;
7090 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
7091 tex.dst_sel_x = 0;
7092 tex.dst_sel_y = 1;
7093 tex.dst_sel_z = 7;
7094 tex.dst_sel_w = 7;
7095 tex.inst_mod = 1; // Use per pixel gradient calculation
7096 tex.sampler_id = 0;
7097 tex.resource_id = tex.sampler_id;
7098 r = r600_bytecode_add_tex(ctx->bc, &tex);
7099 if (r)
7100 return r;
7101 }
7102
7103 for (i = 0; i < 2; i++) {
7104 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7105 alu.op = ALU_OP3_MULADD;
7106 alu.is_op3 = 1;
7107 alu.src[0].sel = gradientsH;
7108 alu.src[0].chan = i;
7109 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7110 alu.src[1].sel = sample_gpr;
7111 alu.src[1].chan = 2;
7112 }
7113 else {
7114 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
7115 }
7116 alu.src[2].sel = interp_gpr;
7117 alu.src[2].chan = interp_base_chan + i;
7118 alu.dst.sel = ctx->temp_reg;
7119 alu.dst.chan = i;
7120 alu.last = i == 1;
7121
7122 r = r600_bytecode_add_alu(ctx->bc, &alu);
7123 if (r)
7124 return r;
7125 }
7126
7127 for (i = 0; i < 2; i++) {
7128 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7129 alu.op = ALU_OP3_MULADD;
7130 alu.is_op3 = 1;
7131 alu.src[0].sel = gradientsV;
7132 alu.src[0].chan = i;
7133 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7134 alu.src[1].sel = sample_gpr;
7135 alu.src[1].chan = 3;
7136 }
7137 else {
7138 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
7139 }
7140 alu.src[2].sel = ctx->temp_reg;
7141 alu.src[2].chan = i;
7142 alu.dst.sel = ctx->temp_reg;
7143 alu.dst.chan = i;
7144 alu.last = i == 1;
7145
7146 r = r600_bytecode_add_alu(ctx->bc, &alu);
7147 if (r)
7148 return r;
7149 }
7150 }
7151
7152 tmp = r600_get_temp(ctx);
7153 for (i = 0; i < 8; i++) {
7154 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7155 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
7156
7157 alu.dst.sel = tmp;
7158 if ((i > 1 && i < 6)) {
7159 alu.dst.write = 1;
7160 }
7161 else {
7162 alu.dst.write = 0;
7163 }
7164 alu.dst.chan = i % 4;
7165
7166 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
7167 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7168 alu.src[0].sel = ctx->temp_reg;
7169 alu.src[0].chan = 1 - (i % 2);
7170 } else {
7171 alu.src[0].sel = interp_gpr;
7172 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
7173 }
7174 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
7175 alu.src[1].chan = 0;
7176
7177 alu.last = i % 4 == 3;
7178 alu.bank_swizzle_force = SQ_ALU_VEC_210;
7179
7180 r = r600_bytecode_add_alu(ctx->bc, &alu);
7181 if (r)
7182 return r;
7183 }
7184
7185 // INTERP can't swizzle dst
7186 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7187 for (i = 0; i <= lasti; i++) {
7188 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7189 continue;
7190
7191 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7192 alu.op = ALU_OP1_MOV;
7193 alu.src[0].sel = tmp;
7194 alu.src[0].chan = ctx->src[0].swizzle[i];
7195 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7196 alu.dst.write = 1;
7197 alu.last = i == lasti;
7198 r = r600_bytecode_add_alu(ctx->bc, &alu);
7199 if (r)
7200 return r;
7201 }
7202
7203 return 0;
7204 }
7205
7206
7207 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7208 {
7209 struct r600_bytecode_alu alu;
7210 int i, r;
7211
7212 for (i = 0; i < 4; i++) {
7213 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7214 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7215 alu.op = ALU_OP0_NOP;
7216 alu.dst.chan = i;
7217 } else {
7218 alu.op = ALU_OP1_MOV;
7219 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7220 alu.src[0].sel = ctx->temp_reg;
7221 alu.src[0].chan = i;
7222 }
7223 if (i == 3) {
7224 alu.last = 1;
7225 }
7226 r = r600_bytecode_add_alu(ctx->bc, &alu);
7227 if (r)
7228 return r;
7229 }
7230 return 0;
7231 }
7232
7233 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
7234 unsigned writemask,
7235 struct r600_bytecode_alu_src *bc_src,
7236 const struct r600_shader_src *shader_src)
7237 {
7238 struct r600_bytecode_alu alu;
7239 int i, r;
7240 int lasti = tgsi_last_instruction(writemask);
7241 int temp_reg = 0;
7242
7243 r600_bytecode_src(&bc_src[0], shader_src, 0);
7244 r600_bytecode_src(&bc_src[1], shader_src, 1);
7245 r600_bytecode_src(&bc_src[2], shader_src, 2);
7246 r600_bytecode_src(&bc_src[3], shader_src, 3);
7247
7248 if (bc_src->abs) {
7249 temp_reg = r600_get_temp(ctx);
7250
7251 for (i = 0; i < lasti + 1; i++) {
7252 if (!(writemask & (1 << i)))
7253 continue;
7254 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7255 alu.op = ALU_OP1_MOV;
7256 alu.dst.sel = temp_reg;
7257 alu.dst.chan = i;
7258 alu.dst.write = 1;
7259 alu.src[0] = bc_src[i];
7260 if (i == lasti) {
7261 alu.last = 1;
7262 }
7263 r = r600_bytecode_add_alu(ctx->bc, &alu);
7264 if (r)
7265 return r;
7266 memset(&bc_src[i], 0, sizeof(*bc_src));
7267 bc_src[i].sel = temp_reg;
7268 bc_src[i].chan = i;
7269 }
7270 }
7271 return 0;
7272 }
7273
7274 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
7275 {
7276 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7277 struct r600_bytecode_alu alu;
7278 struct r600_bytecode_alu_src srcs[4][4];
7279 int i, j, r;
7280 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7281 unsigned op = ctx->inst_info->op;
7282
7283 if (op == ALU_OP3_MULADD_IEEE &&
7284 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7285 op = ALU_OP3_MULADD;
7286
7287 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7288 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
7289 srcs[j], &ctx->src[j]);
7290 if (r)
7291 return r;
7292 }
7293
7294 for (i = 0; i < lasti + 1; i++) {
7295 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7296 continue;
7297
7298 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7299 alu.op = op;
7300 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7301 alu.src[j] = srcs[j][i];
7302 }
7303
7304 if (dst == -1) {
7305 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7306 } else {
7307 alu.dst.sel = dst;
7308 }
7309 alu.dst.chan = i;
7310 alu.dst.write = 1;
7311 alu.is_op3 = 1;
7312 if (i == lasti) {
7313 alu.last = 1;
7314 }
7315 r = r600_bytecode_add_alu(ctx->bc, &alu);
7316 if (r)
7317 return r;
7318 }
7319 return 0;
7320 }
7321
/* Three-operand ALU instruction written to the TGSI destination register. */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	/* -1 tells tgsi_op3_dst() to use the instruction's own destination */
	const int use_tgsi_dst = -1;

	return tgsi_op3_dst(ctx, use_tgsi_dst);
}
7326
7327 static int tgsi_dp(struct r600_shader_ctx *ctx)
7328 {
7329 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7330 struct r600_bytecode_alu alu;
7331 int i, j, r;
7332 unsigned op = ctx->inst_info->op;
7333 if (op == ALU_OP2_DOT4_IEEE &&
7334 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7335 op = ALU_OP2_DOT4;
7336
7337 for (i = 0; i < 4; i++) {
7338 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7339 alu.op = op;
7340 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7341 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7342 }
7343
7344 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7345 alu.dst.chan = i;
7346 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7347 /* handle some special cases */
7348 switch (inst->Instruction.Opcode) {
7349 case TGSI_OPCODE_DP2:
7350 if (i > 1) {
7351 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7352 alu.src[0].chan = alu.src[1].chan = 0;
7353 }
7354 break;
7355 case TGSI_OPCODE_DP3:
7356 if (i > 2) {
7357 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7358 alu.src[0].chan = alu.src[1].chan = 0;
7359 }
7360 break;
7361 default:
7362 break;
7363 }
7364 if (i == 3) {
7365 alu.last = 1;
7366 }
7367 r = r600_bytecode_add_alu(ctx->bc, &alu);
7368 if (r)
7369 return r;
7370 }
7371 return 0;
7372 }
7373
7374 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7375 unsigned index)
7376 {
7377 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7378 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7379 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7380 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7381 ctx->src[index].neg || ctx->src[index].abs ||
7382 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7383 }
7384
7385 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
7386 unsigned index)
7387 {
7388 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7389 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
7390 }
7391
7392 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
7393 {
7394 struct r600_bytecode_vtx vtx;
7395 struct r600_bytecode_alu alu;
7396 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7397 int src_gpr, r, i;
7398 int id = tgsi_tex_get_src_gpr(ctx, 1);
7399 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7400
7401 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7402 if (src_requires_loading) {
7403 for (i = 0; i < 4; i++) {
7404 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7405 alu.op = ALU_OP1_MOV;
7406 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7407 alu.dst.sel = ctx->temp_reg;
7408 alu.dst.chan = i;
7409 if (i == 3)
7410 alu.last = 1;
7411 alu.dst.write = 1;
7412 r = r600_bytecode_add_alu(ctx->bc, &alu);
7413 if (r)
7414 return r;
7415 }
7416 src_gpr = ctx->temp_reg;
7417 }
7418
7419 memset(&vtx, 0, sizeof(vtx));
7420 vtx.op = FETCH_OP_VFETCH;
7421 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
7422 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7423 vtx.src_gpr = src_gpr;
7424 vtx.mega_fetch_count = 16;
7425 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7426 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7427 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
7428 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
7429 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
7430 vtx.use_const_fields = 1;
7431 vtx.buffer_index_mode = sampler_index_mode;
7432
7433 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
7434 return r;
7435
7436 if (ctx->bc->chip_class >= EVERGREEN)
7437 return 0;
7438
7439 for (i = 0; i < 4; i++) {
7440 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7441 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7442 continue;
7443
7444 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7445 alu.op = ALU_OP2_AND_INT;
7446
7447 alu.dst.chan = i;
7448 alu.dst.sel = vtx.dst_gpr;
7449 alu.dst.write = 1;
7450
7451 alu.src[0].sel = vtx.dst_gpr;
7452 alu.src[0].chan = i;
7453
7454 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
7455 alu.src[1].sel += (id * 2);
7456 alu.src[1].chan = i % 4;
7457 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7458
7459 if (i == lasti)
7460 alu.last = 1;
7461 r = r600_bytecode_add_alu(ctx->bc, &alu);
7462 if (r)
7463 return r;
7464 }
7465
7466 if (inst->Dst[0].Register.WriteMask & 3) {
7467 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7468 alu.op = ALU_OP2_OR_INT;
7469
7470 alu.dst.chan = 3;
7471 alu.dst.sel = vtx.dst_gpr;
7472 alu.dst.write = 1;
7473
7474 alu.src[0].sel = vtx.dst_gpr;
7475 alu.src[0].chan = 3;
7476
7477 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
7478 alu.src[1].chan = 0;
7479 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7480
7481 alu.last = 1;
7482 r = r600_bytecode_add_alu(ctx->bc, &alu);
7483 if (r)
7484 return r;
7485 }
7486 return 0;
7487 }
7488
7489 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
7490 {
7491 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7492 int r;
7493 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
7494 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7495
7496 if (ctx->bc->chip_class < EVERGREEN) {
7497 struct r600_bytecode_alu alu;
7498 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7499 alu.op = ALU_OP1_MOV;
7500 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7501 /* r600 we have them at channel 2 of the second dword */
7502 alu.src[0].sel += (id * 2) + 1;
7503 alu.src[0].chan = 1;
7504 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7505 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
7506 alu.last = 1;
7507 r = r600_bytecode_add_alu(ctx->bc, &alu);
7508 if (r)
7509 return r;
7510 return 0;
7511 } else {
7512 struct r600_bytecode_vtx vtx;
7513 memset(&vtx, 0, sizeof(vtx));
7514 vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
7515 vtx.buffer_id = id + eg_buffer_base;
7516 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7517 vtx.src_gpr = 0;
7518 vtx.mega_fetch_count = 16; /* no idea here really... */
7519 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7520 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7521 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */
7522 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */
7523 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7; /* SEL_W */
7524 vtx.data_format = FMT_32_32_32_32;
7525 vtx.buffer_index_mode = sampler_index_mode;
7526
7527 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7528 return r;
7529 return 0;
7530 }
7531 }
7532
7533
7534 static int tgsi_tex(struct r600_shader_ctx *ctx)
7535 {
7536 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7537 struct r600_bytecode_tex tex;
7538 struct r600_bytecode_tex grad_offs[3];
7539 struct r600_bytecode_alu alu;
7540 unsigned src_gpr;
7541 int r, i, j, n_grad_offs = 0;
7542 int opcode;
7543 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7544 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7545 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7546 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7547
7548 bool txf_add_offsets = inst->Texture.NumOffsets &&
7549 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7550 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7551
7552 /* Texture fetch instructions can only use gprs as source.
7553 * Also they cannot negate the source or take the absolute value */
7554 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7555 tgsi_tex_src_requires_loading(ctx, 0)) ||
7556 read_compressed_msaa || txf_add_offsets;
7557
7558 boolean src_loaded = FALSE;
7559 unsigned sampler_src_reg = 1;
7560 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7561 boolean has_txq_cube_array_z = false;
7562 unsigned sampler_index_mode;
7563 int array_index_offset_channel = -1;
7564
7565 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7566 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7567 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7568 if (inst->Dst[0].Register.WriteMask & 4) {
7569 ctx->shader->has_txq_cube_array_z_comp = true;
7570 has_txq_cube_array_z = true;
7571 }
7572
7573 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7574 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7575 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7576 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7577 sampler_src_reg = 2;
7578
7579 /* TGSI moves the sampler to src reg 3 for TXD */
7580 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7581 sampler_src_reg = 3;
7582
7583 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7584
7585 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7586
7587 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7588 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7589 if (ctx->bc->chip_class < EVERGREEN)
7590 ctx->shader->uses_tex_buffers = true;
7591 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7592 }
7593 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7594 if (ctx->bc->chip_class < EVERGREEN)
7595 ctx->shader->uses_tex_buffers = true;
7596 return do_vtx_fetch_inst(ctx, src_requires_loading);
7597 }
7598 }
7599
7600 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7601 int out_chan;
7602 /* Add perspective divide */
7603 if (ctx->bc->chip_class == CAYMAN) {
7604 out_chan = 2;
7605 for (i = 0; i < 3; i++) {
7606 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7607 alu.op = ALU_OP1_RECIP_IEEE;
7608 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7609
7610 alu.dst.sel = ctx->temp_reg;
7611 alu.dst.chan = i;
7612 if (i == 2)
7613 alu.last = 1;
7614 if (out_chan == i)
7615 alu.dst.write = 1;
7616 r = r600_bytecode_add_alu(ctx->bc, &alu);
7617 if (r)
7618 return r;
7619 }
7620
7621 } else {
7622 out_chan = 3;
7623 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7624 alu.op = ALU_OP1_RECIP_IEEE;
7625 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7626
7627 alu.dst.sel = ctx->temp_reg;
7628 alu.dst.chan = out_chan;
7629 alu.last = 1;
7630 alu.dst.write = 1;
7631 r = r600_bytecode_add_alu(ctx->bc, &alu);
7632 if (r)
7633 return r;
7634 }
7635
7636 for (i = 0; i < 3; i++) {
7637 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7638 alu.op = ALU_OP2_MUL;
7639 alu.src[0].sel = ctx->temp_reg;
7640 alu.src[0].chan = out_chan;
7641 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7642 alu.dst.sel = ctx->temp_reg;
7643 alu.dst.chan = i;
7644 alu.dst.write = 1;
7645 r = r600_bytecode_add_alu(ctx->bc, &alu);
7646 if (r)
7647 return r;
7648 }
7649 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7650 alu.op = ALU_OP1_MOV;
7651 alu.src[0].sel = V_SQ_ALU_SRC_1;
7652 alu.src[0].chan = 0;
7653 alu.dst.sel = ctx->temp_reg;
7654 alu.dst.chan = 3;
7655 alu.last = 1;
7656 alu.dst.write = 1;
7657 r = r600_bytecode_add_alu(ctx->bc, &alu);
7658 if (r)
7659 return r;
7660 src_loaded = TRUE;
7661 src_gpr = ctx->temp_reg;
7662 }
7663
7664
7665 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7666 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7667 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7668 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7669 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7670
7671 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7672 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7673
7674 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7675 for (i = 0; i < 4; i++) {
7676 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7677 alu.op = ALU_OP2_CUBE;
7678 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7679 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7680 alu.dst.sel = ctx->temp_reg;
7681 alu.dst.chan = i;
7682 if (i == 3)
7683 alu.last = 1;
7684 alu.dst.write = 1;
7685 r = r600_bytecode_add_alu(ctx->bc, &alu);
7686 if (r)
7687 return r;
7688 }
7689
7690 /* tmp1.z = RCP_e(|tmp1.z|) */
7691 if (ctx->bc->chip_class == CAYMAN) {
7692 for (i = 0; i < 3; i++) {
7693 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7694 alu.op = ALU_OP1_RECIP_IEEE;
7695 alu.src[0].sel = ctx->temp_reg;
7696 alu.src[0].chan = 2;
7697 alu.src[0].abs = 1;
7698 alu.dst.sel = ctx->temp_reg;
7699 alu.dst.chan = i;
7700 if (i == 2)
7701 alu.dst.write = 1;
7702 if (i == 2)
7703 alu.last = 1;
7704 r = r600_bytecode_add_alu(ctx->bc, &alu);
7705 if (r)
7706 return r;
7707 }
7708 } else {
7709 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7710 alu.op = ALU_OP1_RECIP_IEEE;
7711 alu.src[0].sel = ctx->temp_reg;
7712 alu.src[0].chan = 2;
7713 alu.src[0].abs = 1;
7714 alu.dst.sel = ctx->temp_reg;
7715 alu.dst.chan = 2;
7716 alu.dst.write = 1;
7717 alu.last = 1;
7718 r = r600_bytecode_add_alu(ctx->bc, &alu);
7719 if (r)
7720 return r;
7721 }
7722
7723 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7724 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7725 * muladd has no writemask, have to use another temp
7726 */
7727 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7728 alu.op = ALU_OP3_MULADD;
7729 alu.is_op3 = 1;
7730
7731 alu.src[0].sel = ctx->temp_reg;
7732 alu.src[0].chan = 0;
7733 alu.src[1].sel = ctx->temp_reg;
7734 alu.src[1].chan = 2;
7735
7736 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7737 alu.src[2].chan = 0;
7738 alu.src[2].value = u_bitcast_f2u(1.5f);
7739
7740 alu.dst.sel = ctx->temp_reg;
7741 alu.dst.chan = 0;
7742 alu.dst.write = 1;
7743
7744 r = r600_bytecode_add_alu(ctx->bc, &alu);
7745 if (r)
7746 return r;
7747
7748 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7749 alu.op = ALU_OP3_MULADD;
7750 alu.is_op3 = 1;
7751
7752 alu.src[0].sel = ctx->temp_reg;
7753 alu.src[0].chan = 1;
7754 alu.src[1].sel = ctx->temp_reg;
7755 alu.src[1].chan = 2;
7756
7757 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7758 alu.src[2].chan = 0;
7759 alu.src[2].value = u_bitcast_f2u(1.5f);
7760
7761 alu.dst.sel = ctx->temp_reg;
7762 alu.dst.chan = 1;
7763 alu.dst.write = 1;
7764
7765 alu.last = 1;
7766 r = r600_bytecode_add_alu(ctx->bc, &alu);
7767 if (r)
7768 return r;
7769 /* write initial compare value into Z component
7770 - W src 0 for shadow cube
7771 - X src 1 for shadow cube array */
7772 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7773 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7774 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7775 alu.op = ALU_OP1_MOV;
7776 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7777 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7778 else
7779 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7780 alu.dst.sel = ctx->temp_reg;
7781 alu.dst.chan = 2;
7782 alu.dst.write = 1;
7783 alu.last = 1;
7784 r = r600_bytecode_add_alu(ctx->bc, &alu);
7785 if (r)
7786 return r;
7787 }
7788
7789 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7790 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7791 if (ctx->bc->chip_class >= EVERGREEN) {
7792 int mytmp = r600_get_temp(ctx);
7793 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7794 alu.op = ALU_OP1_MOV;
7795 alu.src[0].sel = ctx->temp_reg;
7796 alu.src[0].chan = 3;
7797 alu.dst.sel = mytmp;
7798 alu.dst.chan = 0;
7799 alu.dst.write = 1;
7800 alu.last = 1;
7801 r = r600_bytecode_add_alu(ctx->bc, &alu);
7802 if (r)
7803 return r;
7804
7805 /* Evaluate the array index according to floor(idx + 0.5). This
7806 * needs to be done before merging the face select value, because
7807 * otherwise the fractional part of the array index will interfere
7808 * with the face select value */
7809 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7810 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7811 alu.op = ALU_OP1_RNDNE;
7812 alu.dst.sel = ctx->temp_reg;
7813 alu.dst.chan = 3;
7814 alu.dst.write = 1;
7815 alu.last = 1;
7816 r = r600_bytecode_add_alu(ctx->bc, &alu);
7817 if (r)
7818 return r;
7819
7820 /* Because the array slice index and the cube face index are merged
7821 * into one value we have to make sure the array slice index is >= 0,
7822 * otherwise the face selection will fail */
7823 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7824 alu.op = ALU_OP2_MAX;
7825 alu.src[0].sel = ctx->temp_reg;
7826 alu.src[0].chan = 3;
7827 alu.src[1].sel = V_SQ_ALU_SRC_0;
7828 alu.dst.sel = ctx->temp_reg;
7829 alu.dst.chan = 3;
7830 alu.dst.write = 1;
7831 alu.last = 1;
7832 r = r600_bytecode_add_alu(ctx->bc, &alu);
7833 if (r)
7834 return r;
7835
7836 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7837 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7838 alu.op = ALU_OP3_MULADD;
7839 alu.is_op3 = 1;
7840 alu.src[0].sel = ctx->temp_reg;
7841 alu.src[0].chan = 3;
7842 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7843 alu.src[1].chan = 0;
7844 alu.src[1].value = u_bitcast_f2u(8.0f);
7845 alu.src[2].sel = mytmp;
7846 alu.src[2].chan = 0;
7847 alu.dst.sel = ctx->temp_reg;
7848 alu.dst.chan = 3;
7849 alu.dst.write = 1;
7850 alu.last = 1;
7851 r = r600_bytecode_add_alu(ctx->bc, &alu);
7852 if (r)
7853 return r;
7854 } else if (ctx->bc->chip_class < EVERGREEN) {
7855 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7856 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7857 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7858 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7859 tex.src_gpr = r600_get_temp(ctx);
7860 tex.src_sel_x = 0;
7861 tex.src_sel_y = 0;
7862 tex.src_sel_z = 0;
7863 tex.src_sel_w = 0;
7864 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7865 tex.coord_type_x = 1;
7866 tex.coord_type_y = 1;
7867 tex.coord_type_z = 1;
7868 tex.coord_type_w = 1;
7869 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7870 alu.op = ALU_OP1_MOV;
7871 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7872 alu.dst.sel = tex.src_gpr;
7873 alu.dst.chan = 0;
7874 alu.last = 1;
7875 alu.dst.write = 1;
7876 r = r600_bytecode_add_alu(ctx->bc, &alu);
7877 if (r)
7878 return r;
7879
7880 r = r600_bytecode_add_tex(ctx->bc, &tex);
7881 if (r)
7882 return r;
7883 }
7884
7885 }
7886
7887 /* for cube forms of lod and bias we need to route things */
7888 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7889 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7890 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7891 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7892 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7893 alu.op = ALU_OP1_MOV;
7894 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7895 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7896 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7897 else
7898 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7899 alu.dst.sel = ctx->temp_reg;
7900 alu.dst.chan = 2;
7901 alu.last = 1;
7902 alu.dst.write = 1;
7903 r = r600_bytecode_add_alu(ctx->bc, &alu);
7904 if (r)
7905 return r;
7906 }
7907
7908 src_loaded = TRUE;
7909 src_gpr = ctx->temp_reg;
7910 }
7911
7912 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7913 int temp_h = 0, temp_v = 0;
7914 int start_val = 0;
7915
7916 /* if we've already loaded the src (i.e. CUBE don't reload it). */
7917 if (src_loaded == TRUE)
7918 start_val = 1;
7919 else
7920 src_loaded = TRUE;
7921 for (i = start_val; i < 3; i++) {
7922 int treg = r600_get_temp(ctx);
7923
7924 if (i == 0)
7925 src_gpr = treg;
7926 else if (i == 1)
7927 temp_h = treg;
7928 else
7929 temp_v = treg;
7930
7931 for (j = 0; j < 4; j++) {
7932 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7933 alu.op = ALU_OP1_MOV;
7934 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7935 alu.dst.sel = treg;
7936 alu.dst.chan = j;
7937 if (j == 3)
7938 alu.last = 1;
7939 alu.dst.write = 1;
7940 r = r600_bytecode_add_alu(ctx->bc, &alu);
7941 if (r)
7942 return r;
7943 }
7944 }
7945 for (i = 1; i < 3; i++) {
7946 /* set gradients h/v */
7947 struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
7948 memset(t, 0, sizeof(struct r600_bytecode_tex));
7949 t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7950 FETCH_OP_SET_GRADIENTS_V;
7951 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7952 t->sampler_index_mode = sampler_index_mode;
7953 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
7954 t->resource_index_mode = sampler_index_mode;
7955
7956 t->src_gpr = (i == 1) ? temp_h : temp_v;
7957 t->src_sel_x = 0;
7958 t->src_sel_y = 1;
7959 t->src_sel_z = 2;
7960 t->src_sel_w = 3;
7961
7962 t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7963 t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
7964 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7965 t->coord_type_x = 1;
7966 t->coord_type_y = 1;
7967 t->coord_type_z = 1;
7968 t->coord_type_w = 1;
7969 }
7970 }
7971 }
7972
7973 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7974 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
7975 * incorrectly forces nearest filtering if the texture format is integer.
7976 * The only effect it has on Gather4, which always returns 4 texels for
7977 * bilinear filtering, is that the final coordinates are off by 0.5 of
7978 * the texel size.
7979 *
7980 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7981 * or (0.5 / size) from the normalized coordinates.
7982 */
7983 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
7984 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
7985 int treg = r600_get_temp(ctx);
7986
7987 /* mov array and comparison oordinate to temp_reg if needed */
7988 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7989 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7990 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
7991 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
7992 for (i = 2; i <= end; i++) {
7993 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7994 alu.op = ALU_OP1_MOV;
7995 alu.dst.sel = ctx->temp_reg;
7996 alu.dst.chan = i;
7997 alu.dst.write = 1;
7998 alu.last = (i == end);
7999 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8000 r = r600_bytecode_add_alu(ctx->bc, &alu);
8001 if (r)
8002 return r;
8003 }
8004 }
8005
8006 if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
8007 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
8008 for (i = 0; i < 2; i++) {
8009 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8010 alu.op = ALU_OP2_ADD;
8011 alu.dst.sel = ctx->temp_reg;
8012 alu.dst.chan = i;
8013 alu.dst.write = 1;
8014 alu.last = i == 1;
8015 if (src_loaded) {
8016 alu.src[0].sel = ctx->temp_reg;
8017 alu.src[0].chan = i;
8018 } else
8019 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8020 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8021 alu.src[1].neg = 1;
8022 r = r600_bytecode_add_alu(ctx->bc, &alu);
8023 if (r)
8024 return r;
8025 }
8026 } else {
8027 /* execute a TXQ */
8028 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8029 tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
8030 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8031 tex.sampler_index_mode = sampler_index_mode;
8032 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8033 tex.resource_index_mode = sampler_index_mode;
8034 tex.dst_gpr = treg;
8035 tex.src_sel_x = 4;
8036 tex.src_sel_y = 4;
8037 tex.src_sel_z = 4;
8038 tex.src_sel_w = 4;
8039 tex.dst_sel_x = 0;
8040 tex.dst_sel_y = 1;
8041 tex.dst_sel_z = 7;
8042 tex.dst_sel_w = 7;
8043 r = r600_bytecode_add_tex(ctx->bc, &tex);
8044 if (r)
8045 return r;
8046
8047 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
8048 if (ctx->bc->chip_class == CAYMAN) {
8049 /* */
8050 for (i = 0; i < 2; i++) {
8051 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8052 alu.op = ALU_OP1_INT_TO_FLT;
8053 alu.dst.sel = treg;
8054 alu.dst.chan = i;
8055 alu.dst.write = 1;
8056 alu.src[0].sel = treg;
8057 alu.src[0].chan = i;
8058 alu.last = (i == 1) ? 1 : 0;
8059 r = r600_bytecode_add_alu(ctx->bc, &alu);
8060 if (r)
8061 return r;
8062 }
8063 for (j = 0; j < 2; j++) {
8064 for (i = 0; i < 3; i++) {
8065 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8066 alu.op = ALU_OP1_RECIP_IEEE;
8067 alu.src[0].sel = treg;
8068 alu.src[0].chan = j;
8069 alu.dst.sel = treg;
8070 alu.dst.chan = i;
8071 if (i == 2)
8072 alu.last = 1;
8073 if (i == j)
8074 alu.dst.write = 1;
8075 r = r600_bytecode_add_alu(ctx->bc, &alu);
8076 if (r)
8077 return r;
8078 }
8079 }
8080 } else {
8081 for (i = 0; i < 2; i++) {
8082 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8083 alu.op = ALU_OP1_INT_TO_FLT;
8084 alu.dst.sel = treg;
8085 alu.dst.chan = i;
8086 alu.dst.write = 1;
8087 alu.src[0].sel = treg;
8088 alu.src[0].chan = i;
8089 alu.last = 1;
8090 r = r600_bytecode_add_alu(ctx->bc, &alu);
8091 if (r)
8092 return r;
8093 }
8094 for (i = 0; i < 2; i++) {
8095 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8096 alu.op = ALU_OP1_RECIP_IEEE;
8097 alu.src[0].sel = treg;
8098 alu.src[0].chan = i;
8099 alu.dst.sel = treg;
8100 alu.dst.chan = i;
8101 alu.last = 1;
8102 alu.dst.write = 1;
8103 r = r600_bytecode_add_alu(ctx->bc, &alu);
8104 if (r)
8105 return r;
8106 }
8107 }
8108 for (i = 0; i < 2; i++) {
8109 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8110 alu.op = ALU_OP3_MULADD;
8111 alu.is_op3 = 1;
8112 alu.dst.sel = ctx->temp_reg;
8113 alu.dst.chan = i;
8114 alu.dst.write = 1;
8115 alu.last = i == 1;
8116 alu.src[0].sel = treg;
8117 alu.src[0].chan = i;
8118 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
8119 alu.src[1].neg = 1;
8120 if (src_loaded) {
8121 alu.src[2].sel = ctx->temp_reg;
8122 alu.src[2].chan = i;
8123 } else
8124 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
8125 r = r600_bytecode_add_alu(ctx->bc, &alu);
8126 if (r)
8127 return r;
8128 }
8129 }
8130 src_loaded = TRUE;
8131 src_gpr = ctx->temp_reg;
8132 }
8133 }
8134
8135 if (src_requires_loading && !src_loaded) {
8136 for (i = 0; i < 4; i++) {
8137 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8138 alu.op = ALU_OP1_MOV;
8139 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8140 alu.dst.sel = ctx->temp_reg;
8141 alu.dst.chan = i;
8142 if (i == 3)
8143 alu.last = 1;
8144 alu.dst.write = 1;
8145 r = r600_bytecode_add_alu(ctx->bc, &alu);
8146 if (r)
8147 return r;
8148 }
8149 src_loaded = TRUE;
8150 src_gpr = ctx->temp_reg;
8151 }
8152
8153 /* get offset values */
8154 if (inst->Texture.NumOffsets) {
8155 assert(inst->Texture.NumOffsets == 1);
8156
8157 /* The texture offset feature doesn't work with the TXF instruction
8158 * and must be emulated by adding the offset to the texture coordinates. */
8159 if (txf_add_offsets) {
8160 const struct tgsi_texture_offset *off = inst->TexOffsets;
8161
8162 switch (inst->Texture.Texture) {
8163 case TGSI_TEXTURE_3D:
8164 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8165 alu.op = ALU_OP2_ADD_INT;
8166 alu.src[0].sel = src_gpr;
8167 alu.src[0].chan = 2;
8168 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8169 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
8170 alu.dst.sel = src_gpr;
8171 alu.dst.chan = 2;
8172 alu.dst.write = 1;
8173 alu.last = 1;
8174 r = r600_bytecode_add_alu(ctx->bc, &alu);
8175 if (r)
8176 return r;
8177 /* fall through */
8178
8179 case TGSI_TEXTURE_2D:
8180 case TGSI_TEXTURE_SHADOW2D:
8181 case TGSI_TEXTURE_RECT:
8182 case TGSI_TEXTURE_SHADOWRECT:
8183 case TGSI_TEXTURE_2D_ARRAY:
8184 case TGSI_TEXTURE_SHADOW2D_ARRAY:
8185 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8186 alu.op = ALU_OP2_ADD_INT;
8187 alu.src[0].sel = src_gpr;
8188 alu.src[0].chan = 1;
8189 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8190 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
8191 alu.dst.sel = src_gpr;
8192 alu.dst.chan = 1;
8193 alu.dst.write = 1;
8194 alu.last = 1;
8195 r = r600_bytecode_add_alu(ctx->bc, &alu);
8196 if (r)
8197 return r;
8198 /* fall through */
8199
8200 case TGSI_TEXTURE_1D:
8201 case TGSI_TEXTURE_SHADOW1D:
8202 case TGSI_TEXTURE_1D_ARRAY:
8203 case TGSI_TEXTURE_SHADOW1D_ARRAY:
8204 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8205 alu.op = ALU_OP2_ADD_INT;
8206 alu.src[0].sel = src_gpr;
8207 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8208 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8209 alu.dst.sel = src_gpr;
8210 alu.dst.write = 1;
8211 alu.last = 1;
8212 r = r600_bytecode_add_alu(ctx->bc, &alu);
8213 if (r)
8214 return r;
8215 break;
8216 /* texture offsets do not apply to other texture targets */
8217 }
8218 } else {
8219 switch (inst->Texture.Texture) {
8220 case TGSI_TEXTURE_3D:
8221 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8222 /* fallthrough */
8223 case TGSI_TEXTURE_2D:
8224 case TGSI_TEXTURE_SHADOW2D:
8225 case TGSI_TEXTURE_RECT:
8226 case TGSI_TEXTURE_SHADOWRECT:
8227 case TGSI_TEXTURE_2D_ARRAY:
8228 case TGSI_TEXTURE_SHADOW2D_ARRAY:
8229 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8230 /* fallthrough */
8231 case TGSI_TEXTURE_1D:
8232 case TGSI_TEXTURE_SHADOW1D:
8233 case TGSI_TEXTURE_1D_ARRAY:
8234 case TGSI_TEXTURE_SHADOW1D_ARRAY:
8235 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8236 }
8237 }
8238 }
8239
8240 /* Obtain the sample index for reading a compressed MSAA color texture.
8241 * To read the FMASK, we use the ldfptr instruction, which tells us
8242 * where the samples are stored.
8243 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8244 * which is the identity mapping. Each nibble says which physical sample
8245 * should be fetched to get that sample.
8246 *
8247 * Assume src.z contains the sample index. It should be modified like this:
8248 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
8249 * Then fetch the texel with src.
8250 */
8251 if (read_compressed_msaa) {
8252 unsigned sample_chan = 3;
8253 unsigned temp = r600_get_temp(ctx);
8254 assert(src_loaded);
8255
8256 /* temp.w = ldfptr() */
8257 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8258 tex.op = FETCH_OP_LD;
8259 tex.inst_mod = 1; /* to indicate this is ldfptr */
8260 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8261 tex.sampler_index_mode = sampler_index_mode;
8262 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8263 tex.resource_index_mode = sampler_index_mode;
8264 tex.src_gpr = src_gpr;
8265 tex.dst_gpr = temp;
8266 tex.dst_sel_x = 7; /* mask out these components */
8267 tex.dst_sel_y = 7;
8268 tex.dst_sel_z = 7;
8269 tex.dst_sel_w = 0; /* store X */
8270 tex.src_sel_x = 0;
8271 tex.src_sel_y = 1;
8272 tex.src_sel_z = 2;
8273 tex.src_sel_w = 3;
8274 tex.offset_x = offset_x;
8275 tex.offset_y = offset_y;
8276 tex.offset_z = offset_z;
8277 r = r600_bytecode_add_tex(ctx->bc, &tex);
8278 if (r)
8279 return r;
8280
8281 /* temp.x = sample_index*4 */
8282 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8283 alu.op = ALU_OP2_MULLO_INT;
8284 alu.src[0].sel = src_gpr;
8285 alu.src[0].chan = sample_chan;
8286 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8287 alu.src[1].value = 4;
8288 alu.dst.sel = temp;
8289 alu.dst.chan = 0;
8290 alu.dst.write = 1;
8291 r = emit_mul_int_op(ctx->bc, &alu);
8292 if (r)
8293 return r;
8294
8295 /* sample_index = temp.w >> temp.x */
8296 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8297 alu.op = ALU_OP2_LSHR_INT;
8298 alu.src[0].sel = temp;
8299 alu.src[0].chan = 3;
8300 alu.src[1].sel = temp;
8301 alu.src[1].chan = 0;
8302 alu.dst.sel = src_gpr;
8303 alu.dst.chan = sample_chan;
8304 alu.dst.write = 1;
8305 alu.last = 1;
8306 r = r600_bytecode_add_alu(ctx->bc, &alu);
8307 if (r)
8308 return r;
8309
8310 /* sample_index & 0xF */
8311 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8312 alu.op = ALU_OP2_AND_INT;
8313 alu.src[0].sel = src_gpr;
8314 alu.src[0].chan = sample_chan;
8315 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8316 alu.src[1].value = 0xF;
8317 alu.dst.sel = src_gpr;
8318 alu.dst.chan = sample_chan;
8319 alu.dst.write = 1;
8320 alu.last = 1;
8321 r = r600_bytecode_add_alu(ctx->bc, &alu);
8322 if (r)
8323 return r;
8324 #if 0
8325 /* visualize the FMASK */
8326 for (i = 0; i < 4; i++) {
8327 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8328 alu.op = ALU_OP1_INT_TO_FLT;
8329 alu.src[0].sel = src_gpr;
8330 alu.src[0].chan = sample_chan;
8331 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8332 alu.dst.chan = i;
8333 alu.dst.write = 1;
8334 alu.last = 1;
8335 r = r600_bytecode_add_alu(ctx->bc, &alu);
8336 if (r)
8337 return r;
8338 }
8339 return 0;
8340 #endif
8341 }
8342
8343 /* does this shader want a num layers from TXQ for a cube array? */
8344 if (has_txq_cube_array_z) {
8345 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8346
8347 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8348 alu.op = ALU_OP1_MOV;
8349
8350 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8351 if (ctx->bc->chip_class >= EVERGREEN) {
8352 /* with eg each dword is number of cubes */
8353 alu.src[0].sel += id / 4;
8354 alu.src[0].chan = id % 4;
8355 } else {
8356 /* r600 we have them at channel 2 of the second dword */
8357 alu.src[0].sel += (id * 2) + 1;
8358 alu.src[0].chan = 2;
8359 }
8360 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8361 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8362 alu.last = 1;
8363 r = r600_bytecode_add_alu(ctx->bc, &alu);
8364 if (r)
8365 return r;
8366 /* disable writemask from texture instruction */
8367 inst->Dst[0].Register.WriteMask &= ~4;
8368 }
8369
8370 opcode = ctx->inst_info->op;
8371 if (opcode == FETCH_OP_GATHER4 &&
8372 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8373 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8374 struct r600_bytecode_tex *t;
8375 opcode = FETCH_OP_GATHER4_O;
8376
8377 /* GATHER4_O/GATHER4_C_O use offset values loaded by
8378 SET_TEXTURE_OFFSETS instruction. The immediate offset values
8379 encoded in the instruction are ignored. */
8380 t = &grad_offs[n_grad_offs++];
8381 memset(t, 0, sizeof(struct r600_bytecode_tex));
8382 t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
8383 t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8384 t->sampler_index_mode = sampler_index_mode;
8385 t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
8386 t->resource_index_mode = sampler_index_mode;
8387
8388 t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8389 t->src_sel_x = inst->TexOffsets[0].SwizzleX;
8390 t->src_sel_y = inst->TexOffsets[0].SwizzleY;
8391 if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8392 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
8393 /* make sure array index selector is 0, this is just a safety
8394 * precausion because TGSI seems to emit something strange here */
8395 t->src_sel_z = 4;
8396 else
8397 t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
8398
8399 t->src_sel_w = 4;
8400
8401 t->dst_sel_x = 7;
8402 t->dst_sel_y = 7;
8403 t->dst_sel_z = 7;
8404 t->dst_sel_w = 7;
8405 }
8406
8407 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8408 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8409 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8410 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8411 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8412 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8413 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8414 switch (opcode) {
8415 case FETCH_OP_SAMPLE:
8416 opcode = FETCH_OP_SAMPLE_C;
8417 break;
8418 case FETCH_OP_SAMPLE_L:
8419 opcode = FETCH_OP_SAMPLE_C_L;
8420 break;
8421 case FETCH_OP_SAMPLE_LB:
8422 opcode = FETCH_OP_SAMPLE_C_LB;
8423 break;
8424 case FETCH_OP_SAMPLE_G:
8425 opcode = FETCH_OP_SAMPLE_C_G;
8426 break;
8427 /* Texture gather variants */
8428 case FETCH_OP_GATHER4:
8429 opcode = FETCH_OP_GATHER4_C;
8430 break;
8431 case FETCH_OP_GATHER4_O:
8432 opcode = FETCH_OP_GATHER4_C_O;
8433 break;
8434 }
8435 }
8436
8437 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8438 tex.op = opcode;
8439
8440 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8441 tex.sampler_index_mode = sampler_index_mode;
8442 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8443 tex.resource_index_mode = sampler_index_mode;
8444 tex.src_gpr = src_gpr;
8445 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8446
8447 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8448 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8449 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8450 }
8451
8452 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8453 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8454 tex.inst_mod = texture_component_select;
8455
8456 if (ctx->bc->chip_class == CAYMAN) {
8457 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8458 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8459 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8460 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8461 } else {
8462 /* GATHER4 result order is different from TGSI TG4 */
8463 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8464 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8465 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8466 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8467 }
8468 }
8469 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8470 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8471 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8472 tex.dst_sel_z = 7;
8473 tex.dst_sel_w = 7;
8474 }
8475 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8476 tex.dst_sel_x = 3;
8477 tex.dst_sel_y = 7;
8478 tex.dst_sel_z = 7;
8479 tex.dst_sel_w = 7;
8480 }
8481 else {
8482 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8483 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8484 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8485 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8486 }
8487
8488
8489 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8490 tex.src_sel_x = 4;
8491 tex.src_sel_y = 4;
8492 tex.src_sel_z = 4;
8493 tex.src_sel_w = 4;
8494 } else if (src_loaded) {
8495 tex.src_sel_x = 0;
8496 tex.src_sel_y = 1;
8497 tex.src_sel_z = 2;
8498 tex.src_sel_w = 3;
8499 } else {
8500 tex.src_sel_x = ctx->src[0].swizzle[0];
8501 tex.src_sel_y = ctx->src[0].swizzle[1];
8502 tex.src_sel_z = ctx->src[0].swizzle[2];
8503 tex.src_sel_w = ctx->src[0].swizzle[3];
8504 tex.src_rel = ctx->src[0].rel;
8505 }
8506
8507 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8508 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8509 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8510 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8511 tex.src_sel_x = 1;
8512 tex.src_sel_y = 0;
8513 tex.src_sel_z = 3;
8514 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8515 }
8516
8517 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8518 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8519 tex.coord_type_x = 1;
8520 tex.coord_type_y = 1;
8521 }
8522 tex.coord_type_z = 1;
8523 tex.coord_type_w = 1;
8524
8525 tex.offset_x = offset_x;
8526 tex.offset_y = offset_y;
8527 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8528 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8529 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8530 tex.offset_z = 0;
8531 }
8532 else {
8533 tex.offset_z = offset_z;
8534 }
8535
8536 /* Put the depth for comparison in W.
8537 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8538 * Some instructions expect the depth in Z. */
8539 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8540 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8541 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8542 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8543 opcode != FETCH_OP_SAMPLE_C_L &&
8544 opcode != FETCH_OP_SAMPLE_C_LB) {
8545 tex.src_sel_w = tex.src_sel_z;
8546 }
8547
8548 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8549 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8550 if (opcode == FETCH_OP_SAMPLE_C_L ||
8551 opcode == FETCH_OP_SAMPLE_C_LB) {
8552 /* the array index is read from Y */
8553 tex.coord_type_y = 0;
8554 array_index_offset_channel = tex.src_sel_y;
8555 } else {
8556 /* the array index is read from Z */
8557 tex.coord_type_z = 0;
8558 tex.src_sel_z = tex.src_sel_y;
8559 array_index_offset_channel = tex.src_sel_z;
8560 }
8561 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8562 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
8563 tex.coord_type_z = 0;
8564 array_index_offset_channel = tex.src_sel_z;
8565 } else if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8566 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8567 (ctx->bc->chip_class >= EVERGREEN))
8568 /* the array index is read from Z, coordinate will be corrected elsewhere */
8569 tex.coord_type_z = 0;
8570
8571 /* We have array access to 1D or 2D ARRAY, the coordinates are not int ->
8572 * evaluate the array index */
8573 if (array_index_offset_channel >= 0 &&
8574 opcode != FETCH_OP_LD &&
8575 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
8576 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8577 alu.src[0].sel = tex.src_gpr;
8578 alu.src[0].chan = array_index_offset_channel;
8579 alu.src[0].rel = tex.src_rel;
8580 alu.op = ALU_OP1_RNDNE;
8581 alu.dst.sel = tex.src_gpr;
8582 alu.dst.chan = array_index_offset_channel;
8583 alu.dst.rel = tex.src_rel;
8584 alu.dst.write = 1;
8585 alu.last = 1;
8586 r = r600_bytecode_add_alu(ctx->bc, &alu);
8587 if (r)
8588 return r;
8589 }
8590
8591 /* mask unused source components */
8592 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8593 switch (inst->Texture.Texture) {
8594 case TGSI_TEXTURE_2D:
8595 case TGSI_TEXTURE_RECT:
8596 tex.src_sel_z = 7;
8597 tex.src_sel_w = 7;
8598 break;
8599 case TGSI_TEXTURE_1D_ARRAY:
8600 tex.src_sel_y = 7;
8601 tex.src_sel_w = 7;
8602 break;
8603 case TGSI_TEXTURE_1D:
8604 tex.src_sel_y = 7;
8605 tex.src_sel_z = 7;
8606 tex.src_sel_w = 7;
8607 break;
8608 }
8609 }
8610
8611 /* Emit set gradient and offset instructions. */
8612 for (i = 0; i < n_grad_offs; ++i) {
8613 r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
8614 if (r)
8615 return r;
8616 }
8617
8618 r = r600_bytecode_add_tex(ctx->bc, &tex);
8619 if (r)
8620 return r;
8621
8622 /* add shadow ambient support - gallium doesn't do it yet */
8623 return 0;
8624 }
8625
8626 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8627 struct tgsi_full_src_register *src)
8628 {
8629 unsigned i;
8630
8631 if (src->Register.Indirect) {
8632 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8633 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
8634 return ctx->shader->atomics[i].hw_idx;
8635 }
8636 } else {
8637 uint32_t index = src->Register.Index;
8638 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8639 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8640 continue;
8641 if (index > ctx->shader->atomics[i].end)
8642 continue;
8643 if (index < ctx->shader->atomics[i].start)
8644 continue;
8645 uint32_t offset = (index - ctx->shader->atomics[i].start);
8646 return ctx->shader->atomics[i].hw_idx + offset;
8647 }
8648 }
8649 assert(0);
8650 return -1;
8651 }
8652
8653 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
8654 int *uav_id_p, int *uav_index_mode_p)
8655 {
8656 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8657 int uav_id, uav_index_mode = 0;
8658 int r;
8659 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8660
8661 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
8662
8663 if (inst->Src[0].Register.Indirect) {
8664 if (is_cm) {
8665 struct r600_bytecode_alu alu;
8666 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8667 alu.op = ALU_OP2_LSHL_INT;
8668 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
8669 alu.src[0].chan = 0;
8670 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8671 alu.src[1].value = 2;
8672 alu.dst.sel = ctx->temp_reg;
8673 alu.dst.chan = 0;
8674 alu.dst.write = 1;
8675 alu.last = 1;
8676 r = r600_bytecode_add_alu(ctx->bc, &alu);
8677 if (r)
8678 return r;
8679
8680 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8681 ctx->temp_reg, 0,
8682 ctx->temp_reg, 0,
8683 V_SQ_ALU_SRC_LITERAL, uav_id * 4);
8684 if (r)
8685 return r;
8686 } else
8687 uav_index_mode = 2;
8688 } else if (is_cm) {
8689 r = single_alu_op2(ctx, ALU_OP1_MOV,
8690 ctx->temp_reg, 0,
8691 V_SQ_ALU_SRC_LITERAL, uav_id * 4,
8692 0, 0);
8693 if (r)
8694 return r;
8695 }
8696 *uav_id_p = uav_id;
8697 *uav_index_mode_p = uav_index_mode;
8698 return 0;
8699 }
8700
8701 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
8702 {
8703 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8704 int r;
8705 struct r600_bytecode_gds gds;
8706 int uav_id = 0;
8707 int uav_index_mode = 0;
8708 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8709
8710 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8711 if (r)
8712 return r;
8713
8714 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8715 gds.op = FETCH_OP_GDS_READ_RET;
8716 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8717 gds.uav_id = is_cm ? 0 : uav_id;
8718 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8719 gds.src_gpr = ctx->temp_reg;
8720 gds.src_sel_x = (is_cm) ? 0 : 4;
8721 gds.src_sel_y = 4;
8722 gds.src_sel_z = 4;
8723 gds.dst_sel_x = 0;
8724 gds.dst_sel_y = 7;
8725 gds.dst_sel_z = 7;
8726 gds.dst_sel_w = 7;
8727 gds.src_gpr2 = 0;
8728 gds.alloc_consume = !is_cm;
8729 r = r600_bytecode_add_gds(ctx->bc, &gds);
8730 if (r)
8731 return r;
8732
8733 ctx->bc->cf_last->vpm = 1;
8734 return 0;
8735 }
8736
8737 /* this fixes up 1D arrays properly */
8738 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8739 {
8740 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8741 int r, i;
8742 struct r600_bytecode_alu alu;
8743 int temp_reg = r600_get_temp(ctx);
8744
8745 for (i = 0; i < 4; i++) {
8746 bool def_val = true, write_zero = false;
8747 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8748 alu.op = ALU_OP1_MOV;
8749 alu.dst.sel = temp_reg;
8750 alu.dst.chan = i;
8751
8752 switch (inst->Memory.Texture) {
8753 case TGSI_TEXTURE_BUFFER:
8754 case TGSI_TEXTURE_1D:
8755 if (i == 1 || i == 2 || i == 3) {
8756 write_zero = true;
8757 }
8758 break;
8759 case TGSI_TEXTURE_1D_ARRAY:
8760 if (i == 1 || i == 3)
8761 write_zero = true;
8762 else if (i == 2) {
8763 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8764 def_val = false;
8765 }
8766 break;
8767 case TGSI_TEXTURE_2D:
8768 if (i == 2 || i == 3)
8769 write_zero = true;
8770 break;
8771 default:
8772 if (i == 3)
8773 write_zero = true;
8774 break;
8775 }
8776
8777 if (write_zero) {
8778 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8779 alu.src[0].value = 0;
8780 } else if (def_val) {
8781 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8782 }
8783
8784 if (i == 3)
8785 alu.last = 1;
8786 alu.dst.write = 1;
8787 r = r600_bytecode_add_alu(ctx->bc, &alu);
8788 if (r)
8789 return r;
8790 }
8791 *idx_gpr = temp_reg;
8792 return 0;
8793 }
8794
8795 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8796 int temp_reg)
8797 {
8798 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8799 int r;
8800 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8801 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8802 r = single_alu_op2(ctx, ALU_OP1_MOV,
8803 temp_reg, 0,
8804 V_SQ_ALU_SRC_LITERAL, value >> 2,
8805 0, 0);
8806 if (r)
8807 return r;
8808 } else {
8809 struct r600_bytecode_alu alu;
8810 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8811 alu.op = ALU_OP2_LSHR_INT;
8812 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8813 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8814 alu.src[1].value = 2;
8815 alu.dst.sel = temp_reg;
8816 alu.dst.write = 1;
8817 alu.last = 1;
8818 r = r600_bytecode_add_alu(ctx->bc, &alu);
8819 if (r)
8820 return r;
8821 }
8822 return 0;
8823 }
8824
8825 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8826 {
8827 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8828 /* have to work out the offset into the RAT immediate return buffer */
8829 struct r600_bytecode_vtx vtx;
8830 struct r600_bytecode_cf *cf;
8831 int r;
8832 int temp_reg = r600_get_temp(ctx);
8833 unsigned rat_index_mode;
8834 unsigned base;
8835
8836 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8837 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8838
8839 r = load_buffer_coord(ctx, 1, temp_reg);
8840 if (r)
8841 return r;
8842 ctx->bc->cf_last->barrier = 1;
8843 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8844 vtx.op = FETCH_OP_VFETCH;
8845 vtx.buffer_id = inst->Src[0].Register.Index + base;
8846 vtx.buffer_index_mode = rat_index_mode;
8847 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8848 vtx.src_gpr = temp_reg;
8849 vtx.src_sel_x = 0;
8850 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8851 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
8852 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
8853 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
8854 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
8855 vtx.num_format_all = 1;
8856 vtx.format_comp_all = 1;
8857 vtx.srf_mode_all = 0;
8858
8859 if (inst->Dst[0].Register.WriteMask & 8) {
8860 vtx.data_format = FMT_32_32_32_32;
8861 vtx.use_const_fields = 0;
8862 } else if (inst->Dst[0].Register.WriteMask & 4) {
8863 vtx.data_format = FMT_32_32_32;
8864 vtx.use_const_fields = 0;
8865 } else if (inst->Dst[0].Register.WriteMask & 2) {
8866 vtx.data_format = FMT_32_32;
8867 vtx.use_const_fields = 0;
8868 } else {
8869 vtx.data_format = FMT_32;
8870 vtx.use_const_fields = 0;
8871 }
8872
8873 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8874 if (r)
8875 return r;
8876 cf = ctx->bc->cf_last;
8877 cf->barrier = 1;
8878 return 0;
8879 }
8880
8881 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8882 {
8883 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8884 /* have to work out the offset into the RAT immediate return buffer */
8885 struct r600_bytecode_vtx vtx;
8886 struct r600_bytecode_cf *cf;
8887 int r;
8888 int idx_gpr;
8889 unsigned format, num_format, format_comp, endian;
8890 const struct util_format_description *desc;
8891 unsigned rat_index_mode;
8892 unsigned immed_base;
8893
8894 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8895
8896 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8897 r = load_index_src(ctx, 1, &idx_gpr);
8898 if (r)
8899 return r;
8900
8901 if (rat_index_mode)
8902 egcm_load_index_reg(ctx->bc, 1, false);
8903
8904 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8905 cf = ctx->bc->cf_last;
8906
8907 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8908 cf->rat.inst = V_RAT_INST_NOP_RTN;
8909 cf->rat.index_mode = rat_index_mode;
8910 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8911 cf->output.gpr = ctx->thread_id_gpr;
8912 cf->output.index_gpr = idx_gpr;
8913 cf->output.comp_mask = 0xf;
8914 cf->output.burst_count = 1;
8915 cf->vpm = 1;
8916 cf->barrier = 1;
8917 cf->mark = 1;
8918 cf->output.elem_size = 0;
8919
8920 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8921 cf = ctx->bc->cf_last;
8922 cf->barrier = 1;
8923
8924 desc = util_format_description(inst->Memory.Format);
8925 r600_vertex_data_type(inst->Memory.Format,
8926 &format, &num_format, &format_comp, &endian);
8927 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8928 vtx.op = FETCH_OP_VFETCH;
8929 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8930 vtx.buffer_index_mode = rat_index_mode;
8931 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8932 vtx.src_gpr = ctx->thread_id_gpr;
8933 vtx.src_sel_x = 1;
8934 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8935 vtx.dst_sel_x = desc->swizzle[0];
8936 vtx.dst_sel_y = desc->swizzle[1];
8937 vtx.dst_sel_z = desc->swizzle[2];
8938 vtx.dst_sel_w = desc->swizzle[3];
8939 vtx.srf_mode_all = 1;
8940 vtx.data_format = format;
8941 vtx.num_format_all = num_format;
8942 vtx.format_comp_all = format_comp;
8943 vtx.endian = endian;
8944 vtx.offset = 0;
8945 vtx.mega_fetch_count = 3;
8946 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8947 if (r)
8948 return r;
8949 cf = ctx->bc->cf_last;
8950 cf->barrier = 1;
8951 return 0;
8952 }
8953
8954 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8955 {
8956 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8957 struct r600_bytecode_alu alu;
8958 int r;
8959 int temp_reg = r600_get_temp(ctx);
8960
8961 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8962 alu.op = ALU_OP1_MOV;
8963 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8964 alu.dst.sel = temp_reg;
8965 alu.dst.write = 1;
8966 alu.last = 1;
8967 r = r600_bytecode_add_alu(ctx->bc, &alu);
8968 if (r)
8969 return r;
8970
8971 r = do_lds_fetch_values(ctx, temp_reg,
8972 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8973 if (r)
8974 return r;
8975 return 0;
8976 }
8977
8978 static int tgsi_load(struct r600_shader_ctx *ctx)
8979 {
8980 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8981 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8982 return tgsi_load_rat(ctx);
8983 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8984 return tgsi_load_gds(ctx);
8985 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8986 return tgsi_load_buffer(ctx);
8987 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8988 return tgsi_load_lds(ctx);
8989 return 0;
8990 }
8991
8992 static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
8993 {
8994 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8995 struct r600_bytecode_cf *cf;
8996 int r, i;
8997 unsigned rat_index_mode;
8998 int lasti;
8999 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);
9000
9001 r = load_buffer_coord(ctx, 0, treg2);
9002 if (r)
9003 return r;
9004
9005 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9006 if (rat_index_mode)
9007 egcm_load_index_reg(ctx->bc, 1, false);
9008
9009 for (i = 0; i <= 3; i++) {
9010 struct r600_bytecode_alu alu;
9011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9012 alu.op = ALU_OP1_MOV;
9013 alu.dst.sel = temp_reg;
9014 alu.dst.chan = i;
9015 alu.src[0].sel = V_SQ_ALU_SRC_0;
9016 alu.last = (i == 3);
9017 alu.dst.write = 1;
9018 r = r600_bytecode_add_alu(ctx->bc, &alu);
9019 if (r)
9020 return r;
9021 }
9022
9023 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9024 for (i = 0; i <= lasti; i++) {
9025 struct r600_bytecode_alu alu;
9026 if (!((1 << i) & inst->Dst[0].Register.WriteMask))
9027 continue;
9028
9029 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
9030 temp_reg, 0,
9031 treg2, 0,
9032 V_SQ_ALU_SRC_LITERAL, i);
9033 if (r)
9034 return r;
9035
9036 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9037 alu.op = ALU_OP1_MOV;
9038 alu.dst.sel = ctx->temp_reg;
9039 alu.dst.chan = 0;
9040
9041 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9042 alu.last = 1;
9043 alu.dst.write = 1;
9044 r = r600_bytecode_add_alu(ctx->bc, &alu);
9045 if (r)
9046 return r;
9047
9048 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9049 cf = ctx->bc->cf_last;
9050
9051 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
9052 cf->rat.inst = V_RAT_INST_STORE_TYPED;
9053 cf->rat.index_mode = rat_index_mode;
9054 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
9055 cf->output.gpr = ctx->temp_reg;
9056 cf->output.index_gpr = temp_reg;
9057 cf->output.comp_mask = 1;
9058 cf->output.burst_count = 1;
9059 cf->vpm = 1;
9060 cf->barrier = 1;
9061 cf->output.elem_size = 0;
9062 }
9063 return 0;
9064 }
9065
9066 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
9067 {
9068 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9069 struct r600_bytecode_cf *cf;
9070 bool src_requires_loading = false;
9071 int val_gpr, idx_gpr;
9072 int r, i;
9073 unsigned rat_index_mode;
9074
9075 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9076
9077 r = load_index_src(ctx, 0, &idx_gpr);
9078 if (r)
9079 return r;
9080
9081 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
9082 src_requires_loading = true;
9083
9084 if (src_requires_loading) {
9085 struct r600_bytecode_alu alu;
9086 for (i = 0; i < 4; i++) {
9087 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9088 alu.op = ALU_OP1_MOV;
9089 alu.dst.sel = ctx->temp_reg;
9090 alu.dst.chan = i;
9091
9092 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9093 if (i == 3)
9094 alu.last = 1;
9095 alu.dst.write = 1;
9096 r = r600_bytecode_add_alu(ctx->bc, &alu);
9097 if (r)
9098 return r;
9099 }
9100 val_gpr = ctx->temp_reg;
9101 } else
9102 val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
9103 if (rat_index_mode)
9104 egcm_load_index_reg(ctx->bc, 1, false);
9105
9106 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9107 cf = ctx->bc->cf_last;
9108
9109 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
9110 cf->rat.inst = V_RAT_INST_STORE_TYPED;
9111 cf->rat.index_mode = rat_index_mode;
9112 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
9113 cf->output.gpr = val_gpr;
9114 cf->output.index_gpr = idx_gpr;
9115 cf->output.comp_mask = 0xf;
9116 cf->output.burst_count = 1;
9117 cf->vpm = 1;
9118 cf->barrier = 1;
9119 cf->output.elem_size = 0;
9120 return 0;
9121 }
9122
9123 static int tgsi_store_lds(struct r600_shader_ctx *ctx)
9124 {
9125 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9126 struct r600_bytecode_alu alu;
9127 int r, i, lasti;
9128 int write_mask = inst->Dst[0].Register.WriteMask;
9129 int temp_reg = r600_get_temp(ctx);
9130
9131 /* LDS write */
9132 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9133 alu.op = ALU_OP1_MOV;
9134 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9135 alu.dst.sel = temp_reg;
9136 alu.dst.write = 1;
9137 alu.last = 1;
9138 r = r600_bytecode_add_alu(ctx->bc, &alu);
9139 if (r)
9140 return r;
9141
9142 lasti = tgsi_last_instruction(write_mask);
9143 for (i = 1; i <= lasti; i++) {
9144 if (!(write_mask & (1 << i)))
9145 continue;
9146 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
9147 temp_reg, i,
9148 temp_reg, 0,
9149 V_SQ_ALU_SRC_LITERAL, 4 * i);
9150 if (r)
9151 return r;
9152 }
9153 for (i = 0; i <= lasti; i++) {
9154 if (!(write_mask & (1 << i)))
9155 continue;
9156
9157 if ((i == 0 && ((write_mask & 3) == 3)) ||
9158 (i == 2 && ((write_mask & 0xc) == 0xc))) {
9159 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9160 alu.op = LDS_OP3_LDS_WRITE_REL;
9161
9162 alu.src[0].sel = temp_reg;
9163 alu.src[0].chan = i;
9164 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9165 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
9166 alu.last = 1;
9167 alu.is_lds_idx_op = true;
9168 alu.lds_idx = 1;
9169 r = r600_bytecode_add_alu(ctx->bc, &alu);
9170 if (r)
9171 return r;
9172 i += 1;
9173 continue;
9174 }
9175 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9176 alu.op = LDS_OP2_LDS_WRITE;
9177
9178 alu.src[0].sel = temp_reg;
9179 alu.src[0].chan = i;
9180 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9181
9182 alu.last = 1;
9183 alu.is_lds_idx_op = true;
9184
9185 r = r600_bytecode_add_alu(ctx->bc, &alu);
9186 if (r)
9187 return r;
9188 }
9189 return 0;
9190 }
9191
9192 static int tgsi_store(struct r600_shader_ctx *ctx)
9193 {
9194 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9195 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
9196 return tgsi_store_buffer_rat(ctx);
9197 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
9198 return tgsi_store_lds(ctx);
9199 else
9200 return tgsi_store_rat(ctx);
9201 }
9202
9203 static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
9204 {
9205 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9206 /* have to work out the offset into the RAT immediate return buffer */
9207 struct r600_bytecode_alu alu;
9208 struct r600_bytecode_vtx vtx;
9209 struct r600_bytecode_cf *cf;
9210 int r;
9211 int idx_gpr;
9212 unsigned format, num_format, format_comp, endian;
9213 const struct util_format_description *desc;
9214 unsigned rat_index_mode;
9215 unsigned immed_base;
9216 unsigned rat_base;
9217
9218 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
9219 rat_base = ctx->shader->rat_base;
9220
9221 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
9222 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9223 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9224
9225 r = load_buffer_coord(ctx, 1, ctx->temp_reg);
9226 if (r)
9227 return r;
9228 idx_gpr = ctx->temp_reg;
9229 } else {
9230 r = load_index_src(ctx, 1, &idx_gpr);
9231 if (r)
9232 return r;
9233 }
9234
9235 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9236
9237 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
9238 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9239 alu.op = ALU_OP1_MOV;
9240 alu.dst.sel = ctx->thread_id_gpr;
9241 alu.dst.chan = 0;
9242 alu.dst.write = 1;
9243 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9244 alu.last = 1;
9245 r = r600_bytecode_add_alu(ctx->bc, &alu);
9246 if (r)
9247 return r;
9248
9249 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9250 alu.op = ALU_OP1_MOV;
9251 alu.dst.sel = ctx->thread_id_gpr;
9252 if (ctx->bc->chip_class == CAYMAN)
9253 alu.dst.chan = 2;
9254 else
9255 alu.dst.chan = 3;
9256 alu.dst.write = 1;
9257 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9258 alu.last = 1;
9259 r = r600_bytecode_add_alu(ctx->bc, &alu);
9260 if (r)
9261 return r;
9262 } else {
9263 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9264 alu.op = ALU_OP1_MOV;
9265 alu.dst.sel = ctx->thread_id_gpr;
9266 alu.dst.chan = 0;
9267 alu.dst.write = 1;
9268 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9269 alu.last = 1;
9270 r = r600_bytecode_add_alu(ctx->bc, &alu);
9271 if (r)
9272 return r;
9273 }
9274
9275 if (rat_index_mode)
9276 egcm_load_index_reg(ctx->bc, 1, false);
9277 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9278 cf = ctx->bc->cf_last;
9279
9280 cf->rat.id = rat_base + inst->Src[0].Register.Index;
9281 cf->rat.inst = ctx->inst_info->op;
9282 cf->rat.index_mode = rat_index_mode;
9283 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
9284 cf->output.gpr = ctx->thread_id_gpr;
9285 cf->output.index_gpr = idx_gpr;
9286 cf->output.comp_mask = 0xf;
9287 cf->output.burst_count = 1;
9288 cf->vpm = 1;
9289 cf->barrier = 1;
9290 cf->mark = 1;
9291 cf->output.elem_size = 0;
9292 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
9293 cf = ctx->bc->cf_last;
9294 cf->barrier = 1;
9295 cf->cf_addr = 1;
9296
9297 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
9298 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
9299 desc = util_format_description(inst->Memory.Format);
9300 r600_vertex_data_type(inst->Memory.Format,
9301 &format, &num_format, &format_comp, &endian);
9302 vtx.dst_sel_x = desc->swizzle[0];
9303 } else {
9304 format = FMT_32;
9305 num_format = 1;
9306 format_comp = 0;
9307 endian = 0;
9308 vtx.dst_sel_x = 0;
9309 }
9310 vtx.op = FETCH_OP_VFETCH;
9311 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
9312 vtx.buffer_index_mode = rat_index_mode;
9313 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
9314 vtx.src_gpr = ctx->thread_id_gpr;
9315 vtx.src_sel_x = 1;
9316 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9317 vtx.dst_sel_y = 7;
9318 vtx.dst_sel_z = 7;
9319 vtx.dst_sel_w = 7;
9320 vtx.use_const_fields = 0;
9321 vtx.srf_mode_all = 1;
9322 vtx.data_format = format;
9323 vtx.num_format_all = num_format;
9324 vtx.format_comp_all = format_comp;
9325 vtx.endian = endian;
9326 vtx.offset = 0;
9327 vtx.mega_fetch_count = 0xf;
9328 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
9329 if (r)
9330 return r;
9331 cf = ctx->bc->cf_last;
9332 cf->vpm = 1;
9333 cf->barrier = 1;
9334 return 0;
9335 }
9336
9337 static int get_gds_op(int opcode)
9338 {
9339 switch (opcode) {
9340 case TGSI_OPCODE_ATOMUADD:
9341 return FETCH_OP_GDS_ADD_RET;
9342 case TGSI_OPCODE_ATOMAND:
9343 return FETCH_OP_GDS_AND_RET;
9344 case TGSI_OPCODE_ATOMOR:
9345 return FETCH_OP_GDS_OR_RET;
9346 case TGSI_OPCODE_ATOMXOR:
9347 return FETCH_OP_GDS_XOR_RET;
9348 case TGSI_OPCODE_ATOMUMIN:
9349 return FETCH_OP_GDS_MIN_UINT_RET;
9350 case TGSI_OPCODE_ATOMUMAX:
9351 return FETCH_OP_GDS_MAX_UINT_RET;
9352 case TGSI_OPCODE_ATOMXCHG:
9353 return FETCH_OP_GDS_XCHG_RET;
9354 case TGSI_OPCODE_ATOMCAS:
9355 return FETCH_OP_GDS_CMP_XCHG_RET;
9356 default:
9357 return -1;
9358 }
9359 }
9360
9361 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
9362 {
9363 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9364 struct r600_bytecode_gds gds;
9365 struct r600_bytecode_alu alu;
9366 int gds_op = get_gds_op(inst->Instruction.Opcode);
9367 int r;
9368 int uav_id = 0;
9369 int uav_index_mode = 0;
9370 bool is_cm = (ctx->bc->chip_class == CAYMAN);
9371
9372 if (gds_op == -1) {
9373 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
9374 return -1;
9375 }
9376
9377 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
9378 if (r)
9379 return r;
9380
9381 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
9382 if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
9383 int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
9384 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9385 alu.op = ALU_OP1_MOV;
9386 alu.dst.sel = ctx->temp_reg;
9387 alu.dst.chan = is_cm ? 2 : 1;
9388 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9389 alu.src[0].value = value;
9390 alu.last = 1;
9391 alu.dst.write = 1;
9392 r = r600_bytecode_add_alu(ctx->bc, &alu);
9393 if (r)
9394 return r;
9395 } else {
9396 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9397 alu.op = ALU_OP1_MOV;
9398 alu.dst.sel = ctx->temp_reg;
9399 alu.dst.chan = is_cm ? 2 : 1;
9400 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9401 alu.last = 1;
9402 alu.dst.write = 1;
9403 r = r600_bytecode_add_alu(ctx->bc, &alu);
9404 if (r)
9405 return r;
9406 }
9407 }
9408 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
9409 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
9410 int abs_value = abs(value);
9411 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
9412 gds_op = FETCH_OP_GDS_SUB_RET;
9413 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9414 alu.op = ALU_OP1_MOV;
9415 alu.dst.sel = ctx->temp_reg;
9416 alu.dst.chan = is_cm ? 1 : 0;
9417 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9418 alu.src[0].value = abs_value;
9419 alu.last = 1;
9420 alu.dst.write = 1;
9421 r = r600_bytecode_add_alu(ctx->bc, &alu);
9422 if (r)
9423 return r;
9424 } else {
9425 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9426 alu.op = ALU_OP1_MOV;
9427 alu.dst.sel = ctx->temp_reg;
9428 alu.dst.chan = is_cm ? 1 : 0;
9429 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9430 alu.last = 1;
9431 alu.dst.write = 1;
9432 r = r600_bytecode_add_alu(ctx->bc, &alu);
9433 if (r)
9434 return r;
9435 }
9436
9437
9438 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
9439 gds.op = gds_op;
9440 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9441 gds.uav_id = is_cm ? 0 : uav_id;
9442 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
9443 gds.src_gpr = ctx->temp_reg;
9444 gds.src_gpr2 = 0;
9445 gds.src_sel_x = is_cm ? 0 : 4;
9446 gds.src_sel_y = is_cm ? 1 : 0;
9447 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
9448 gds.src_sel_z = is_cm ? 2 : 1;
9449 else
9450 gds.src_sel_z = 7;
9451 gds.dst_sel_x = 0;
9452 gds.dst_sel_y = 7;
9453 gds.dst_sel_z = 7;
9454 gds.dst_sel_w = 7;
9455 gds.alloc_consume = !is_cm;
9456
9457 r = r600_bytecode_add_gds(ctx->bc, &gds);
9458 if (r)
9459 return r;
9460 ctx->bc->cf_last->vpm = 1;
9461 return 0;
9462 }
9463
9464 static int get_lds_op(int opcode)
9465 {
9466 switch (opcode) {
9467 case TGSI_OPCODE_ATOMUADD:
9468 return LDS_OP2_LDS_ADD_RET;
9469 case TGSI_OPCODE_ATOMAND:
9470 return LDS_OP2_LDS_AND_RET;
9471 case TGSI_OPCODE_ATOMOR:
9472 return LDS_OP2_LDS_OR_RET;
9473 case TGSI_OPCODE_ATOMXOR:
9474 return LDS_OP2_LDS_XOR_RET;
9475 case TGSI_OPCODE_ATOMUMIN:
9476 return LDS_OP2_LDS_MIN_UINT_RET;
9477 case TGSI_OPCODE_ATOMUMAX:
9478 return LDS_OP2_LDS_MAX_UINT_RET;
9479 case TGSI_OPCODE_ATOMIMIN:
9480 return LDS_OP2_LDS_MIN_INT_RET;
9481 case TGSI_OPCODE_ATOMIMAX:
9482 return LDS_OP2_LDS_MAX_INT_RET;
9483 case TGSI_OPCODE_ATOMXCHG:
9484 return LDS_OP2_LDS_XCHG_RET;
9485 case TGSI_OPCODE_ATOMCAS:
9486 return LDS_OP3_LDS_CMP_XCHG_RET;
9487 default:
9488 return -1;
9489 }
9490 }
9491
9492 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
9493 {
9494 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9495 int lds_op = get_lds_op(inst->Instruction.Opcode);
9496 int r;
9497
9498 struct r600_bytecode_alu alu;
9499 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9500 alu.op = lds_op;
9501 alu.is_lds_idx_op = true;
9502 alu.last = 1;
9503 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
9504 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
9505 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
9506 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
9507 else
9508 alu.src[2].sel = V_SQ_ALU_SRC_0;
9509 r = r600_bytecode_add_alu(ctx->bc, &alu);
9510 if (r)
9511 return r;
9512
9513 /* then read from LDS_OQ_A_POP */
9514 memset(&alu, 0, sizeof(alu));
9515
9516 alu.op = ALU_OP1_MOV;
9517 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
9518 alu.src[0].chan = 0;
9519 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
9520 alu.dst.write = 1;
9521 alu.last = 1;
9522 r = r600_bytecode_add_alu(ctx->bc, &alu);
9523 if (r)
9524 return r;
9525
9526 return 0;
9527 }
9528
9529 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9530 {
9531 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9532 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9533 return tgsi_atomic_op_rat(ctx);
9534 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9535 return tgsi_atomic_op_gds(ctx);
9536 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9537 return tgsi_atomic_op_rat(ctx);
9538 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9539 return tgsi_atomic_op_lds(ctx);
9540 return 0;
9541 }
9542
9543 static int tgsi_resq(struct r600_shader_ctx *ctx)
9544 {
9545 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9546 unsigned sampler_index_mode;
9547 struct r600_bytecode_tex tex;
9548 int r;
9549 boolean has_txq_cube_array_z = false;
9550
9551 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
9552 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
9553 if (ctx->bc->chip_class < EVERGREEN)
9554 ctx->shader->uses_tex_buffers = true;
9555 unsigned eg_buffer_base = 0;
9556 eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
9557 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9558 eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9559 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
9560 }
9561
9562 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
9563 inst->Dst[0].Register.WriteMask & 4) {
9564 ctx->shader->has_txq_cube_array_z_comp = true;
9565 has_txq_cube_array_z = true;
9566 }
9567
9568 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9569 if (sampler_index_mode)
9570 egcm_load_index_reg(ctx->bc, 1, false);
9571
9572
9573 /* does this shader want a num layers from TXQ for a cube array? */
9574 if (has_txq_cube_array_z) {
9575 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
9576 struct r600_bytecode_alu alu;
9577
9578 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9579 alu.op = ALU_OP1_MOV;
9580
9581 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
9582 /* with eg each dword is either number of cubes */
9583 alu.src[0].sel += id / 4;
9584 alu.src[0].chan = id % 4;
9585 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
9586 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
9587 alu.last = 1;
9588 r = r600_bytecode_add_alu(ctx->bc, &alu);
9589 if (r)
9590 return r;
9591 /* disable writemask from texture instruction */
9592 inst->Dst[0].Register.WriteMask &= ~4;
9593 }
9594 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
9595 tex.op = ctx->inst_info->op;
9596 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
9597 tex.sampler_index_mode = sampler_index_mode;
9598 tex.resource_id = tex.sampler_id;
9599 tex.resource_index_mode = sampler_index_mode;
9600 tex.src_sel_x = 4;
9601 tex.src_sel_y = 4;
9602 tex.src_sel_z = 4;
9603 tex.src_sel_w = 4;
9604 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
9605 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
9606 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
9607 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
9608 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9609 r = r600_bytecode_add_tex(ctx->bc, &tex);
9610 if (r)
9611 return r;
9612
9613 return 0;
9614 }
9615
9616 static int tgsi_lrp(struct r600_shader_ctx *ctx)
9617 {
9618 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9619 struct r600_bytecode_alu alu;
9620 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9621 struct r600_bytecode_alu_src srcs[2][4];
9622 unsigned i;
9623 int r;
9624
9625 /* optimize if it's just an equal balance */
9626 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
9627 for (i = 0; i < lasti + 1; i++) {
9628 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9629 continue;
9630
9631 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9632 alu.op = ALU_OP2_ADD;
9633 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9634 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9635 alu.omod = 3;
9636 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9637 alu.dst.chan = i;
9638 if (i == lasti) {
9639 alu.last = 1;
9640 }
9641 r = r600_bytecode_add_alu(ctx->bc, &alu);
9642 if (r)
9643 return r;
9644 }
9645 return 0;
9646 }
9647
9648 /* 1 - src0 */
9649 for (i = 0; i < lasti + 1; i++) {
9650 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9651 continue;
9652
9653 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9654 alu.op = ALU_OP2_ADD;
9655 alu.src[0].sel = V_SQ_ALU_SRC_1;
9656 alu.src[0].chan = 0;
9657 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
9658 r600_bytecode_src_toggle_neg(&alu.src[1]);
9659 alu.dst.sel = ctx->temp_reg;
9660 alu.dst.chan = i;
9661 if (i == lasti) {
9662 alu.last = 1;
9663 }
9664 alu.dst.write = 1;
9665 r = r600_bytecode_add_alu(ctx->bc, &alu);
9666 if (r)
9667 return r;
9668 }
9669
9670 /* (1 - src0) * src2 */
9671 for (i = 0; i < lasti + 1; i++) {
9672 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9673 continue;
9674
9675 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9676 alu.op = ALU_OP2_MUL;
9677 alu.src[0].sel = ctx->temp_reg;
9678 alu.src[0].chan = i;
9679 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9680 alu.dst.sel = ctx->temp_reg;
9681 alu.dst.chan = i;
9682 if (i == lasti) {
9683 alu.last = 1;
9684 }
9685 alu.dst.write = 1;
9686 r = r600_bytecode_add_alu(ctx->bc, &alu);
9687 if (r)
9688 return r;
9689 }
9690
9691 /* src0 * src1 + (1 - src0) * src2 */
9692
9693 for (i = 0; i < 2; i++) {
9694 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
9695 srcs[i], &ctx->src[i]);
9696 if (r)
9697 return r;
9698 }
9699
9700 for (i = 0; i < lasti + 1; i++) {
9701 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9702 continue;
9703
9704 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9705 alu.op = ALU_OP3_MULADD;
9706 alu.is_op3 = 1;
9707 alu.src[0] = srcs[0][i];
9708 alu.src[1] = srcs[1][i];
9709 alu.src[2].sel = ctx->temp_reg;
9710 alu.src[2].chan = i;
9711
9712 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9713 alu.dst.chan = i;
9714 if (i == lasti) {
9715 alu.last = 1;
9716 }
9717 r = r600_bytecode_add_alu(ctx->bc, &alu);
9718 if (r)
9719 return r;
9720 }
9721 return 0;
9722 }
9723
9724 static int tgsi_cmp(struct r600_shader_ctx *ctx)
9725 {
9726 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9727 struct r600_bytecode_alu alu;
9728 int i, r, j;
9729 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9730 struct r600_bytecode_alu_src srcs[3][4];
9731
9732 unsigned op;
9733
9734 if (ctx->src[0].abs && ctx->src[0].neg) {
9735 op = ALU_OP3_CNDE;
9736 ctx->src[0].abs = 0;
9737 ctx->src[0].neg = 0;
9738 } else {
9739 op = ALU_OP3_CNDGE;
9740 }
9741
9742 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
9743 r = tgsi_make_src_for_op3(ctx, inst->Dst[0].Register.WriteMask,
9744 srcs[j], &ctx->src[j]);
9745 if (r)
9746 return r;
9747 }
9748
9749 for (i = 0; i < lasti + 1; i++) {
9750 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9751 continue;
9752
9753 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9754 alu.op = op;
9755 alu.src[0] = srcs[0][i];
9756 alu.src[1] = srcs[2][i];
9757 alu.src[2] = srcs[1][i];
9758
9759 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9760 alu.dst.chan = i;
9761 alu.dst.write = 1;
9762 alu.is_op3 = 1;
9763 if (i == lasti)
9764 alu.last = 1;
9765 r = r600_bytecode_add_alu(ctx->bc, &alu);
9766 if (r)
9767 return r;
9768 }
9769 return 0;
9770 }
9771
9772 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9773 {
9774 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9775 struct r600_bytecode_alu alu;
9776 int i, r;
9777 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9778
9779 for (i = 0; i < lasti + 1; i++) {
9780 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9781 continue;
9782
9783 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9784 alu.op = ALU_OP3_CNDE_INT;
9785 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9786 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9787 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9788 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9789 alu.dst.chan = i;
9790 alu.dst.write = 1;
9791 alu.is_op3 = 1;
9792 if (i == lasti)
9793 alu.last = 1;
9794 r = r600_bytecode_add_alu(ctx->bc, &alu);
9795 if (r)
9796 return r;
9797 }
9798 return 0;
9799 }
9800
9801 static int tgsi_exp(struct r600_shader_ctx *ctx)
9802 {
9803 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9804 struct r600_bytecode_alu alu;
9805 int r;
9806 unsigned i;
9807
9808 /* result.x = 2^floor(src); */
9809 if (inst->Dst[0].Register.WriteMask & 1) {
9810 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9811
9812 alu.op = ALU_OP1_FLOOR;
9813 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9814
9815 alu.dst.sel = ctx->temp_reg;
9816 alu.dst.chan = 0;
9817 alu.dst.write = 1;
9818 alu.last = 1;
9819 r = r600_bytecode_add_alu(ctx->bc, &alu);
9820 if (r)
9821 return r;
9822
9823 if (ctx->bc->chip_class == CAYMAN) {
9824 for (i = 0; i < 3; i++) {
9825 alu.op = ALU_OP1_EXP_IEEE;
9826 alu.src[0].sel = ctx->temp_reg;
9827 alu.src[0].chan = 0;
9828
9829 alu.dst.sel = ctx->temp_reg;
9830 alu.dst.chan = i;
9831 alu.dst.write = i == 0;
9832 alu.last = i == 2;
9833 r = r600_bytecode_add_alu(ctx->bc, &alu);
9834 if (r)
9835 return r;
9836 }
9837 } else {
9838 alu.op = ALU_OP1_EXP_IEEE;
9839 alu.src[0].sel = ctx->temp_reg;
9840 alu.src[0].chan = 0;
9841
9842 alu.dst.sel = ctx->temp_reg;
9843 alu.dst.chan = 0;
9844 alu.dst.write = 1;
9845 alu.last = 1;
9846 r = r600_bytecode_add_alu(ctx->bc, &alu);
9847 if (r)
9848 return r;
9849 }
9850 }
9851
9852 /* result.y = tmp - floor(tmp); */
9853 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9854 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9855
9856 alu.op = ALU_OP1_FRACT;
9857 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9858
9859 alu.dst.sel = ctx->temp_reg;
9860 #if 0
9861 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9862 if (r)
9863 return r;
9864 #endif
9865 alu.dst.write = 1;
9866 alu.dst.chan = 1;
9867
9868 alu.last = 1;
9869
9870 r = r600_bytecode_add_alu(ctx->bc, &alu);
9871 if (r)
9872 return r;
9873 }
9874
9875 /* result.z = RoughApprox2ToX(tmp);*/
9876 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9877 if (ctx->bc->chip_class == CAYMAN) {
9878 for (i = 0; i < 3; i++) {
9879 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9880 alu.op = ALU_OP1_EXP_IEEE;
9881 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9882
9883 alu.dst.sel = ctx->temp_reg;
9884 alu.dst.chan = i;
9885 if (i == 2) {
9886 alu.dst.write = 1;
9887 alu.last = 1;
9888 }
9889
9890 r = r600_bytecode_add_alu(ctx->bc, &alu);
9891 if (r)
9892 return r;
9893 }
9894 } else {
9895 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9896 alu.op = ALU_OP1_EXP_IEEE;
9897 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9898
9899 alu.dst.sel = ctx->temp_reg;
9900 alu.dst.write = 1;
9901 alu.dst.chan = 2;
9902
9903 alu.last = 1;
9904
9905 r = r600_bytecode_add_alu(ctx->bc, &alu);
9906 if (r)
9907 return r;
9908 }
9909 }
9910
9911 /* result.w = 1.0;*/
9912 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9913 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9914
9915 alu.op = ALU_OP1_MOV;
9916 alu.src[0].sel = V_SQ_ALU_SRC_1;
9917 alu.src[0].chan = 0;
9918
9919 alu.dst.sel = ctx->temp_reg;
9920 alu.dst.chan = 3;
9921 alu.dst.write = 1;
9922 alu.last = 1;
9923 r = r600_bytecode_add_alu(ctx->bc, &alu);
9924 if (r)
9925 return r;
9926 }
9927 return tgsi_helper_copy(ctx, inst);
9928 }
9929
9930 static int tgsi_log(struct r600_shader_ctx *ctx)
9931 {
9932 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9933 struct r600_bytecode_alu alu;
9934 int r;
9935 unsigned i;
9936
9937 /* result.x = floor(log2(|src|)); */
9938 if (inst->Dst[0].Register.WriteMask & 1) {
9939 if (ctx->bc->chip_class == CAYMAN) {
9940 for (i = 0; i < 3; i++) {
9941 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9942
9943 alu.op = ALU_OP1_LOG_IEEE;
9944 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9945 r600_bytecode_src_set_abs(&alu.src[0]);
9946
9947 alu.dst.sel = ctx->temp_reg;
9948 alu.dst.chan = i;
9949 if (i == 0)
9950 alu.dst.write = 1;
9951 if (i == 2)
9952 alu.last = 1;
9953 r = r600_bytecode_add_alu(ctx->bc, &alu);
9954 if (r)
9955 return r;
9956 }
9957
9958 } else {
9959 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9960
9961 alu.op = ALU_OP1_LOG_IEEE;
9962 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9963 r600_bytecode_src_set_abs(&alu.src[0]);
9964
9965 alu.dst.sel = ctx->temp_reg;
9966 alu.dst.chan = 0;
9967 alu.dst.write = 1;
9968 alu.last = 1;
9969 r = r600_bytecode_add_alu(ctx->bc, &alu);
9970 if (r)
9971 return r;
9972 }
9973
9974 alu.op = ALU_OP1_FLOOR;
9975 alu.src[0].sel = ctx->temp_reg;
9976 alu.src[0].chan = 0;
9977
9978 alu.dst.sel = ctx->temp_reg;
9979 alu.dst.chan = 0;
9980 alu.dst.write = 1;
9981 alu.last = 1;
9982
9983 r = r600_bytecode_add_alu(ctx->bc, &alu);
9984 if (r)
9985 return r;
9986 }
9987
9988 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9989 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9990
9991 if (ctx->bc->chip_class == CAYMAN) {
9992 for (i = 0; i < 3; i++) {
9993 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9994
9995 alu.op = ALU_OP1_LOG_IEEE;
9996 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9997 r600_bytecode_src_set_abs(&alu.src[0]);
9998
9999 alu.dst.sel = ctx->temp_reg;
10000 alu.dst.chan = i;
10001 if (i == 1)
10002 alu.dst.write = 1;
10003 if (i == 2)
10004 alu.last = 1;
10005
10006 r = r600_bytecode_add_alu(ctx->bc, &alu);
10007 if (r)
10008 return r;
10009 }
10010 } else {
10011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10012
10013 alu.op = ALU_OP1_LOG_IEEE;
10014 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10015 r600_bytecode_src_set_abs(&alu.src[0]);
10016
10017 alu.dst.sel = ctx->temp_reg;
10018 alu.dst.chan = 1;
10019 alu.dst.write = 1;
10020 alu.last = 1;
10021
10022 r = r600_bytecode_add_alu(ctx->bc, &alu);
10023 if (r)
10024 return r;
10025 }
10026
10027 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10028
10029 alu.op = ALU_OP1_FLOOR;
10030 alu.src[0].sel = ctx->temp_reg;
10031 alu.src[0].chan = 1;
10032
10033 alu.dst.sel = ctx->temp_reg;
10034 alu.dst.chan = 1;
10035 alu.dst.write = 1;
10036 alu.last = 1;
10037
10038 r = r600_bytecode_add_alu(ctx->bc, &alu);
10039 if (r)
10040 return r;
10041
10042 if (ctx->bc->chip_class == CAYMAN) {
10043 for (i = 0; i < 3; i++) {
10044 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10045 alu.op = ALU_OP1_EXP_IEEE;
10046 alu.src[0].sel = ctx->temp_reg;
10047 alu.src[0].chan = 1;
10048
10049 alu.dst.sel = ctx->temp_reg;
10050 alu.dst.chan = i;
10051 if (i == 1)
10052 alu.dst.write = 1;
10053 if (i == 2)
10054 alu.last = 1;
10055
10056 r = r600_bytecode_add_alu(ctx->bc, &alu);
10057 if (r)
10058 return r;
10059 }
10060 } else {
10061 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10062 alu.op = ALU_OP1_EXP_IEEE;
10063 alu.src[0].sel = ctx->temp_reg;
10064 alu.src[0].chan = 1;
10065
10066 alu.dst.sel = ctx->temp_reg;
10067 alu.dst.chan = 1;
10068 alu.dst.write = 1;
10069 alu.last = 1;
10070
10071 r = r600_bytecode_add_alu(ctx->bc, &alu);
10072 if (r)
10073 return r;
10074 }
10075
10076 if (ctx->bc->chip_class == CAYMAN) {
10077 for (i = 0; i < 3; i++) {
10078 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10079 alu.op = ALU_OP1_RECIP_IEEE;
10080 alu.src[0].sel = ctx->temp_reg;
10081 alu.src[0].chan = 1;
10082
10083 alu.dst.sel = ctx->temp_reg;
10084 alu.dst.chan = i;
10085 if (i == 1)
10086 alu.dst.write = 1;
10087 if (i == 2)
10088 alu.last = 1;
10089
10090 r = r600_bytecode_add_alu(ctx->bc, &alu);
10091 if (r)
10092 return r;
10093 }
10094 } else {
10095 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10096 alu.op = ALU_OP1_RECIP_IEEE;
10097 alu.src[0].sel = ctx->temp_reg;
10098 alu.src[0].chan = 1;
10099
10100 alu.dst.sel = ctx->temp_reg;
10101 alu.dst.chan = 1;
10102 alu.dst.write = 1;
10103 alu.last = 1;
10104
10105 r = r600_bytecode_add_alu(ctx->bc, &alu);
10106 if (r)
10107 return r;
10108 }
10109
10110 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10111
10112 alu.op = ALU_OP2_MUL;
10113
10114 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10115 r600_bytecode_src_set_abs(&alu.src[0]);
10116
10117 alu.src[1].sel = ctx->temp_reg;
10118 alu.src[1].chan = 1;
10119
10120 alu.dst.sel = ctx->temp_reg;
10121 alu.dst.chan = 1;
10122 alu.dst.write = 1;
10123 alu.last = 1;
10124
10125 r = r600_bytecode_add_alu(ctx->bc, &alu);
10126 if (r)
10127 return r;
10128 }
10129
10130 /* result.z = log2(|src|);*/
10131 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
10132 if (ctx->bc->chip_class == CAYMAN) {
10133 for (i = 0; i < 3; i++) {
10134 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10135
10136 alu.op = ALU_OP1_LOG_IEEE;
10137 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10138 r600_bytecode_src_set_abs(&alu.src[0]);
10139
10140 alu.dst.sel = ctx->temp_reg;
10141 if (i == 2)
10142 alu.dst.write = 1;
10143 alu.dst.chan = i;
10144 if (i == 2)
10145 alu.last = 1;
10146
10147 r = r600_bytecode_add_alu(ctx->bc, &alu);
10148 if (r)
10149 return r;
10150 }
10151 } else {
10152 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10153
10154 alu.op = ALU_OP1_LOG_IEEE;
10155 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10156 r600_bytecode_src_set_abs(&alu.src[0]);
10157
10158 alu.dst.sel = ctx->temp_reg;
10159 alu.dst.write = 1;
10160 alu.dst.chan = 2;
10161 alu.last = 1;
10162
10163 r = r600_bytecode_add_alu(ctx->bc, &alu);
10164 if (r)
10165 return r;
10166 }
10167 }
10168
10169 /* result.w = 1.0; */
10170 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
10171 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10172
10173 alu.op = ALU_OP1_MOV;
10174 alu.src[0].sel = V_SQ_ALU_SRC_1;
10175 alu.src[0].chan = 0;
10176
10177 alu.dst.sel = ctx->temp_reg;
10178 alu.dst.chan = 3;
10179 alu.dst.write = 1;
10180 alu.last = 1;
10181
10182 r = r600_bytecode_add_alu(ctx->bc, &alu);
10183 if (r)
10184 return r;
10185 }
10186
10187 return tgsi_helper_copy(ctx, inst);
10188 }
10189
10190 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
10191 {
10192 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10193 struct r600_bytecode_alu alu;
10194 int r;
10195 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10196 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
10197
10198 assert(inst->Dst[0].Register.Index < 3);
10199 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10200
10201 switch (inst->Instruction.Opcode) {
10202 case TGSI_OPCODE_ARL:
10203 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
10204 break;
10205 case TGSI_OPCODE_ARR:
10206 alu.op = ALU_OP1_FLT_TO_INT;
10207 break;
10208 case TGSI_OPCODE_UARL:
10209 alu.op = ALU_OP1_MOV;
10210 break;
10211 default:
10212 assert(0);
10213 return -1;
10214 }
10215
10216 for (i = 0; i <= lasti; ++i) {
10217 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10218 continue;
10219 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10220 alu.last = i == lasti;
10221 alu.dst.sel = reg;
10222 alu.dst.chan = i;
10223 alu.dst.write = 1;
10224 r = r600_bytecode_add_alu(ctx->bc, &alu);
10225 if (r)
10226 return r;
10227 }
10228
10229 if (inst->Dst[0].Register.Index > 0)
10230 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
10231 else
10232 ctx->bc->ar_loaded = 0;
10233
10234 return 0;
10235 }
10236 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
10237 {
10238 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10239 struct r600_bytecode_alu alu;
10240 int r;
10241 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10242
10243 switch (inst->Instruction.Opcode) {
10244 case TGSI_OPCODE_ARL:
10245 memset(&alu, 0, sizeof(alu));
10246 alu.op = ALU_OP1_FLOOR;
10247 alu.dst.sel = ctx->bc->ar_reg;
10248 alu.dst.write = 1;
10249 for (i = 0; i <= lasti; ++i) {
10250 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10251 alu.dst.chan = i;
10252 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10253 alu.last = i == lasti;
10254 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10255 return r;
10256 }
10257 }
10258
10259 memset(&alu, 0, sizeof(alu));
10260 alu.op = ALU_OP1_FLT_TO_INT;
10261 alu.src[0].sel = ctx->bc->ar_reg;
10262 alu.dst.sel = ctx->bc->ar_reg;
10263 alu.dst.write = 1;
10264 /* FLT_TO_INT is trans-only on r600/r700 */
10265 alu.last = TRUE;
10266 for (i = 0; i <= lasti; ++i) {
10267 alu.dst.chan = i;
10268 alu.src[0].chan = i;
10269 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10270 return r;
10271 }
10272 break;
10273 case TGSI_OPCODE_ARR:
10274 memset(&alu, 0, sizeof(alu));
10275 alu.op = ALU_OP1_FLT_TO_INT;
10276 alu.dst.sel = ctx->bc->ar_reg;
10277 alu.dst.write = 1;
10278 /* FLT_TO_INT is trans-only on r600/r700 */
10279 alu.last = TRUE;
10280 for (i = 0; i <= lasti; ++i) {
10281 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10282 alu.dst.chan = i;
10283 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10284 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10285 return r;
10286 }
10287 }
10288 break;
10289 case TGSI_OPCODE_UARL:
10290 memset(&alu, 0, sizeof(alu));
10291 alu.op = ALU_OP1_MOV;
10292 alu.dst.sel = ctx->bc->ar_reg;
10293 alu.dst.write = 1;
10294 for (i = 0; i <= lasti; ++i) {
10295 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10296 alu.dst.chan = i;
10297 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10298 alu.last = i == lasti;
10299 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10300 return r;
10301 }
10302 }
10303 break;
10304 default:
10305 assert(0);
10306 return -1;
10307 }
10308
10309 ctx->bc->ar_loaded = 0;
10310 return 0;
10311 }
10312
10313 static int tgsi_opdst(struct r600_shader_ctx *ctx)
10314 {
10315 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10316 struct r600_bytecode_alu alu;
10317 int i, r = 0;
10318
10319 for (i = 0; i < 4; i++) {
10320 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10321
10322 alu.op = ALU_OP2_MUL;
10323 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10324
10325 if (i == 0 || i == 3) {
10326 alu.src[0].sel = V_SQ_ALU_SRC_1;
10327 } else {
10328 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10329 }
10330
10331 if (i == 0 || i == 2) {
10332 alu.src[1].sel = V_SQ_ALU_SRC_1;
10333 } else {
10334 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10335 }
10336 if (i == 3)
10337 alu.last = 1;
10338 r = r600_bytecode_add_alu(ctx->bc, &alu);
10339 if (r)
10340 return r;
10341 }
10342 return 0;
10343 }
10344
10345 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10346 struct r600_bytecode_alu_src *src)
10347 {
10348 struct r600_bytecode_alu alu;
10349 int r;
10350
10351 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10352 alu.op = opcode;
10353 alu.execute_mask = 1;
10354 alu.update_pred = 1;
10355
10356 alu.dst.sel = ctx->temp_reg;
10357 alu.dst.write = 1;
10358 alu.dst.chan = 0;
10359
10360 alu.src[0] = *src;
10361 alu.src[1].sel = V_SQ_ALU_SRC_0;
10362 alu.src[1].chan = 0;
10363
10364 alu.last = 1;
10365
10366 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10367 if (r)
10368 return r;
10369 return 0;
10370 }
10371
10372 static int pops(struct r600_shader_ctx *ctx, int pops)
10373 {
10374 unsigned force_pop = ctx->bc->force_add_cf;
10375
10376 if (!force_pop) {
10377 int alu_pop = 3;
10378 if (ctx->bc->cf_last) {
10379 if (ctx->bc->cf_last->op == CF_OP_ALU)
10380 alu_pop = 0;
10381 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
10382 alu_pop = 1;
10383 }
10384 alu_pop += pops;
10385 if (alu_pop == 1) {
10386 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
10387 ctx->bc->force_add_cf = 1;
10388 } else if (alu_pop == 2) {
10389 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
10390 ctx->bc->force_add_cf = 1;
10391 } else {
10392 force_pop = 1;
10393 }
10394 }
10395
10396 if (force_pop) {
10397 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
10398 ctx->bc->cf_last->pop_count = pops;
10399 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10400 }
10401
10402 return 0;
10403 }
10404
10405 static inline int callstack_update_max_depth(struct r600_shader_ctx *ctx,
10406 unsigned reason)
10407 {
10408 struct r600_stack_info *stack = &ctx->bc->stack;
10409 unsigned elements;
10410 int entries;
10411
10412 unsigned entry_size = stack->entry_size;
10413
10414 elements = (stack->loop + stack->push_wqm ) * entry_size;
10415 elements += stack->push;
10416
10417 switch (ctx->bc->chip_class) {
10418 case R600:
10419 case R700:
10420 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
10421 * the stack must be reserved to hold the current active/continue
10422 * masks */
10423 if (reason == FC_PUSH_VPM || stack->push > 0) {
10424 elements += 2;
10425 }
10426 break;
10427
10428 case CAYMAN:
10429 /* r9xx: any stack operation on empty stack consumes 2 additional
10430 * elements */
10431 elements += 2;
10432
10433 /* fallthrough */
10434 /* FIXME: do the two elements added above cover the cases for the
10435 * r8xx+ below? */
10436
10437 case EVERGREEN:
10438 /* r8xx+: 2 extra elements are not always required, but one extra
10439 * element must be added for each of the following cases:
10440 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
10441 * stack usage.
10442 * (Currently we don't use ALU_ELSE_AFTER.)
10443 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
10444 * PUSH instruction executed.
10445 *
10446 * NOTE: it seems we also need to reserve additional element in some
10447 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
10448 * then STACK_SIZE should be 2 instead of 1 */
10449 if (reason == FC_PUSH_VPM || stack->push > 0) {
10450 elements += 1;
10451 }
10452 break;
10453
10454 default:
10455 assert(0);
10456 break;
10457 }
10458
10459 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
10460 * for all chips, so we use 4 in the final formula, not the real entry_size
10461 * for the chip */
10462 entry_size = 4;
10463
10464 entries = (elements + (entry_size - 1)) / entry_size;
10465
10466 if (entries > stack->max_entries)
10467 stack->max_entries = entries;
10468 return elements;
10469 }
10470
10471 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10472 {
10473 switch(reason) {
10474 case FC_PUSH_VPM:
10475 --ctx->bc->stack.push;
10476 assert(ctx->bc->stack.push >= 0);
10477 break;
10478 case FC_PUSH_WQM:
10479 --ctx->bc->stack.push_wqm;
10480 assert(ctx->bc->stack.push_wqm >= 0);
10481 break;
10482 case FC_LOOP:
10483 --ctx->bc->stack.loop;
10484 assert(ctx->bc->stack.loop >= 0);
10485 break;
10486 default:
10487 assert(0);
10488 break;
10489 }
10490 }
10491
10492 static inline int callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10493 {
10494 switch (reason) {
10495 case FC_PUSH_VPM:
10496 ++ctx->bc->stack.push;
10497 break;
10498 case FC_PUSH_WQM:
10499 ++ctx->bc->stack.push_wqm;
10500 break;
10501 case FC_LOOP:
10502 ++ctx->bc->stack.loop;
10503 break;
10504 default:
10505 assert(0);
10506 }
10507
10508 return callstack_update_max_depth(ctx, reason);
10509 }
10510
10511 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10512 {
10513 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10514
10515 sp->mid = realloc((void *)sp->mid,
10516 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10517 sp->mid[sp->num_mid] = ctx->bc->cf_last;
10518 sp->num_mid++;
10519 }
10520
10521 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10522 {
10523 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10524 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10525 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10526 ctx->bc->fc_sp++;
10527 }
10528
10529 static void fc_poplevel(struct r600_shader_ctx *ctx)
10530 {
10531 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10532 free(sp->mid);
10533 sp->mid = NULL;
10534 sp->num_mid = 0;
10535 sp->start = NULL;
10536 sp->type = 0;
10537 ctx->bc->fc_sp--;
10538 }
10539
#if 0
/*
 * Disabled, unfinished helpers for return / break-on-flag handling.
 * Nothing in this region is compiled; it is kept for reference only.
 */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
10587
10588 static int emit_if(struct r600_shader_ctx *ctx, int opcode,
10589 struct r600_bytecode_alu_src *src)
10590 {
10591 int alu_type = CF_OP_ALU_PUSH_BEFORE;
10592 bool needs_workaround = false;
10593 int elems = callstack_push(ctx, FC_PUSH_VPM);
10594
10595 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1)
10596 needs_workaround = true;
10597
10598 if (ctx->bc->chip_class == EVERGREEN && ctx_needs_stack_workaround_8xx(ctx)) {
10599 unsigned dmod1 = (elems - 1) % ctx->bc->stack.entry_size;
10600 unsigned dmod2 = (elems) % ctx->bc->stack.entry_size;
10601
10602 if (elems && (!dmod1 || !dmod2))
10603 needs_workaround = true;
10604 }
10605
10606 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
10607 * LOOP_STARTxxx for nested loops may put the branch stack into a state
10608 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
10609 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
10610 if (needs_workaround) {
10611 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
10612 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10613 alu_type = CF_OP_ALU;
10614 }
10615
10616 emit_logic_pred(ctx, opcode, alu_type, src);
10617
10618 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
10619
10620 fc_pushlevel(ctx, FC_IF);
10621
10622 return 0;
10623 }
10624
10625 static int tgsi_if(struct r600_shader_ctx *ctx)
10626 {
10627 struct r600_bytecode_alu_src alu_src;
10628 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10629
10630 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10631 }
10632
10633 static int tgsi_uif(struct r600_shader_ctx *ctx)
10634 {
10635 struct r600_bytecode_alu_src alu_src;
10636 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10637 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10638 }
10639
10640 static int tgsi_else(struct r600_shader_ctx *ctx)
10641 {
10642 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10643 ctx->bc->cf_last->pop_count = 1;
10644
10645 fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10646 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10647 return 0;
10648 }
10649
10650 static int tgsi_endif(struct r600_shader_ctx *ctx)
10651 {
10652 int offset = 2;
10653 pops(ctx, 1);
10654 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
10655 R600_ERR("if/endif unbalanced in shader\n");
10656 return -1;
10657 }
10658
10659 /* ALU_EXTENDED needs 4 DWords instead of two, adjust jump target offset accordingly */
10660 if (ctx->bc->cf_last->eg_alu_extended)
10661 offset += 2;
10662
10663 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
10664 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + offset;
10665 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
10666 } else {
10667 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + offset;
10668 }
10669 fc_poplevel(ctx);
10670
10671 callstack_pop(ctx, FC_PUSH_VPM);
10672 return 0;
10673 }
10674
10675 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10676 {
10677 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10678 * limited to 4096 iterations, like the other LOOP_* instructions. */
10679 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10680
10681 fc_pushlevel(ctx, FC_LOOP);
10682
10683 /* check stack depth */
10684 callstack_push(ctx, FC_LOOP);
10685 return 0;
10686 }
10687
10688 static int tgsi_endloop(struct r600_shader_ctx *ctx)
10689 {
10690 int i;
10691
10692 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10693
10694 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10695 R600_ERR("loop/endloop in shader code are not paired.\n");
10696 return -EINVAL;
10697 }
10698
10699 /* fixup loop pointers - from r600isa
10700 LOOP END points to CF after LOOP START,
10701 LOOP START point to CF after LOOP END
10702 BRK/CONT point to LOOP END CF
10703 */
10704 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10705
10706 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10707
10708 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10709 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10710 }
10711 /* XXX add LOOPRET support */
10712 fc_poplevel(ctx);
10713 callstack_pop(ctx, FC_LOOP);
10714 return 0;
10715 }
10716
10717 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10718 {
10719 unsigned int fscp;
10720
10721 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10722 {
10723 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10724 break;
10725 }
10726
10727 if (fscp == 0) {
10728 R600_ERR("Break not inside loop/endloop pair\n");
10729 return -EINVAL;
10730 }
10731
10732 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10733
10734 fc_set_mid(ctx, fscp - 1);
10735
10736 return 0;
10737 }
10738
10739 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
10740 {
10741 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10742 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
10743 int r;
10744
10745 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10746 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
10747
10748 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10749 if (!r) {
10750 ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
10751 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10752 return emit_inc_ring_offset(ctx, stream, TRUE);
10753 }
10754 return r;
10755 }
10756
10757 static int tgsi_umad(struct r600_shader_ctx *ctx)
10758 {
10759 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10760 struct r600_bytecode_alu alu;
10761 int i, j, r;
10762 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10763
10764 /* src0 * src1 */
10765 for (i = 0; i < lasti + 1; i++) {
10766 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10767 continue;
10768
10769 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10770
10771 alu.dst.chan = i;
10772 alu.dst.sel = ctx->temp_reg;
10773 alu.dst.write = 1;
10774
10775 alu.op = ALU_OP2_MULLO_UINT;
10776 for (j = 0; j < 2; j++) {
10777 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10778 }
10779
10780 alu.last = 1;
10781 r = emit_mul_int_op(ctx->bc, &alu);
10782 if (r)
10783 return r;
10784 }
10785
10786
10787 for (i = 0; i < lasti + 1; i++) {
10788 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10789 continue;
10790
10791 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10792 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10793
10794 alu.op = ALU_OP2_ADD_INT;
10795
10796 alu.src[0].sel = ctx->temp_reg;
10797 alu.src[0].chan = i;
10798
10799 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10800 if (i == lasti) {
10801 alu.last = 1;
10802 }
10803 r = r600_bytecode_add_alu(ctx->bc, &alu);
10804 if (r)
10805 return r;
10806 }
10807 return 0;
10808 }
10809
10810 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
10811 {
10812 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10813 struct r600_bytecode_alu alu;
10814 int r, i;
10815 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10816
10817 /* temp.xy = f32_to_f16(src) */
10818 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10819 alu.op = ALU_OP1_FLT32_TO_FLT16;
10820 alu.dst.chan = 0;
10821 alu.dst.sel = ctx->temp_reg;
10822 alu.dst.write = 1;
10823 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10824 r = r600_bytecode_add_alu(ctx->bc, &alu);
10825 if (r)
10826 return r;
10827 alu.dst.chan = 1;
10828 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10829 alu.last = 1;
10830 r = r600_bytecode_add_alu(ctx->bc, &alu);
10831 if (r)
10832 return r;
10833
10834 /* dst.x = temp.y * 0x10000 + temp.x */
10835 for (i = 0; i < lasti + 1; i++) {
10836 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10837 continue;
10838
10839 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10840 alu.op = ALU_OP3_MULADD_UINT24;
10841 alu.is_op3 = 1;
10842 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10843 alu.last = i == lasti;
10844 alu.src[0].sel = ctx->temp_reg;
10845 alu.src[0].chan = 1;
10846 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10847 alu.src[1].value = 0x10000;
10848 alu.src[2].sel = ctx->temp_reg;
10849 alu.src[2].chan = 0;
10850 r = r600_bytecode_add_alu(ctx->bc, &alu);
10851 if (r)
10852 return r;
10853 }
10854
10855 return 0;
10856 }
10857
10858 static int tgsi_up2h(struct r600_shader_ctx *ctx)
10859 {
10860 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10861 struct r600_bytecode_alu alu;
10862 int r, i;
10863 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10864
10865 /* temp.x = src.x */
10866 /* note: no need to mask out the high bits */
10867 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10868 alu.op = ALU_OP1_MOV;
10869 alu.dst.chan = 0;
10870 alu.dst.sel = ctx->temp_reg;
10871 alu.dst.write = 1;
10872 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10873 r = r600_bytecode_add_alu(ctx->bc, &alu);
10874 if (r)
10875 return r;
10876
10877 /* temp.y = src.x >> 16 */
10878 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10879 alu.op = ALU_OP2_LSHR_INT;
10880 alu.dst.chan = 1;
10881 alu.dst.sel = ctx->temp_reg;
10882 alu.dst.write = 1;
10883 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10884 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10885 alu.src[1].value = 16;
10886 alu.last = 1;
10887 r = r600_bytecode_add_alu(ctx->bc, &alu);
10888 if (r)
10889 return r;
10890
10891 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10892 for (i = 0; i < lasti + 1; i++) {
10893 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10894 continue;
10895 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10896 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10897 alu.op = ALU_OP1_FLT16_TO_FLT32;
10898 alu.src[0].sel = ctx->temp_reg;
10899 alu.src[0].chan = i % 2;
10900 alu.last = i == lasti;
10901 r = r600_bytecode_add_alu(ctx->bc, &alu);
10902 if (r)
10903 return r;
10904 }
10905
10906 return 0;
10907 }
10908
10909 static int tgsi_bfe(struct r600_shader_ctx *ctx)
10910 {
10911 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10912 struct r600_bytecode_alu alu;
10913 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10914 int r, i;
10915 int dst = -1;
10916
10917 if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
10918 inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
10919 (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
10920 inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
10921 dst = r600_get_temp(ctx);
10922
10923 r = tgsi_op3_dst(ctx, dst);
10924 if (r)
10925 return r;
10926
10927 for (i = 0; i < lasti + 1; i++) {
10928 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10929 alu.op = ALU_OP2_SETGE_INT;
10930 r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
10931 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10932 alu.src[1].value = 32;
10933 alu.dst.sel = ctx->temp_reg;
10934 alu.dst.chan = i;
10935 alu.dst.write = 1;
10936 if (i == lasti)
10937 alu.last = 1;
10938 r = r600_bytecode_add_alu(ctx->bc, &alu);
10939 if (r)
10940 return r;
10941 }
10942
10943 for (i = 0; i < lasti + 1; i++) {
10944 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10945 alu.op = ALU_OP3_CNDE_INT;
10946 alu.is_op3 = 1;
10947 alu.src[0].sel = ctx->temp_reg;
10948 alu.src[0].chan = i;
10949
10950 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10951 if (dst != -1)
10952 alu.src[1].sel = dst;
10953 else
10954 alu.src[1].sel = alu.dst.sel;
10955 alu.src[1].chan = i;
10956 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
10957 alu.dst.write = 1;
10958 if (i == lasti)
10959 alu.last = 1;
10960 r = r600_bytecode_add_alu(ctx->bc, &alu);
10961 if (r)
10962 return r;
10963 }
10964
10965 return 0;
10966 }
10967
10968 static int tgsi_clock(struct r600_shader_ctx *ctx)
10969 {
10970 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10971 struct r600_bytecode_alu alu;
10972 int r;
10973
10974 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10975 alu.op = ALU_OP1_MOV;
10976 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10977 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10978 r = r600_bytecode_add_alu(ctx->bc, &alu);
10979 if (r)
10980 return r;
10981 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10982 alu.op = ALU_OP1_MOV;
10983 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10984 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10985 alu.last = 1;
10986 r = r600_bytecode_add_alu(ctx->bc, &alu);
10987 if (r)
10988 return r;
10989 return 0;
10990 }
10991
10992 static int emit_u64add(struct r600_shader_ctx *ctx, int op,
10993 int treg,
10994 int src0_sel, int src0_chan,
10995 int src1_sel, int src1_chan)
10996 {
10997 struct r600_bytecode_alu alu;
10998 int r;
10999 int opc;
11000
11001 if (op == ALU_OP2_ADD_INT)
11002 opc = ALU_OP2_ADDC_UINT;
11003 else
11004 opc = ALU_OP2_SUBB_UINT;
11005
11006 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11007 alu.op = op; ;
11008 alu.dst.sel = treg;
11009 alu.dst.chan = 0;
11010 alu.dst.write = 1;
11011 alu.src[0].sel = src0_sel;
11012 alu.src[0].chan = src0_chan + 0;
11013 alu.src[1].sel = src1_sel;
11014 alu.src[1].chan = src1_chan + 0;
11015 alu.src[1].neg = 0;
11016 r = r600_bytecode_add_alu(ctx->bc, &alu);
11017 if (r)
11018 return r;
11019
11020 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11021 alu.op = op;
11022 alu.dst.sel = treg;
11023 alu.dst.chan = 1;
11024 alu.dst.write = 1;
11025 alu.src[0].sel = src0_sel;
11026 alu.src[0].chan = src0_chan + 1;
11027 alu.src[1].sel = src1_sel;
11028 alu.src[1].chan = src1_chan + 1;
11029 alu.src[1].neg = 0;
11030 r = r600_bytecode_add_alu(ctx->bc, &alu);
11031 if (r)
11032 return r;
11033
11034 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11035 alu.op = opc;
11036 alu.dst.sel = treg;
11037 alu.dst.chan = 2;
11038 alu.dst.write = 1;
11039 alu.last = 1;
11040 alu.src[0].sel = src0_sel;
11041 alu.src[0].chan = src0_chan + 0;
11042 alu.src[1].sel = src1_sel;
11043 alu.src[1].chan = src1_chan + 0;
11044 alu.src[1].neg = 0;
11045 r = r600_bytecode_add_alu(ctx->bc, &alu);
11046 if (r)
11047 return r;
11048
11049 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11050 alu.op = op;
11051 alu.dst.sel = treg;
11052 alu.dst.chan = 1;
11053 alu.dst.write = 1;
11054 alu.src[0].sel = treg;
11055 alu.src[0].chan = 1;
11056 alu.src[1].sel = treg;
11057 alu.src[1].chan = 2;
11058 alu.last = 1;
11059 r = r600_bytecode_add_alu(ctx->bc, &alu);
11060 if (r)
11061 return r;
11062 return 0;
11063 }
11064
11065 static int egcm_u64add(struct r600_shader_ctx *ctx)
11066 {
11067 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11068 struct r600_bytecode_alu alu;
11069 int r;
11070 int treg = ctx->temp_reg;
11071 int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
11072
11073 if (ctx->src[1].neg) {
11074 op = ALU_OP2_SUB_INT;
11075 opc = ALU_OP2_SUBB_UINT;
11076 }
11077 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11078 alu.op = op; ;
11079 alu.dst.sel = treg;
11080 alu.dst.chan = 0;
11081 alu.dst.write = 1;
11082 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11083 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11084 alu.src[1].neg = 0;
11085 r = r600_bytecode_add_alu(ctx->bc, &alu);
11086 if (r)
11087 return r;
11088
11089 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11090 alu.op = op;
11091 alu.dst.sel = treg;
11092 alu.dst.chan = 1;
11093 alu.dst.write = 1;
11094 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11095 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11096 alu.src[1].neg = 0;
11097 r = r600_bytecode_add_alu(ctx->bc, &alu);
11098 if (r)
11099 return r;
11100
11101 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11102 alu.op = opc ;
11103 alu.dst.sel = treg;
11104 alu.dst.chan = 2;
11105 alu.dst.write = 1;
11106 alu.last = 1;
11107 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11108 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11109 alu.src[1].neg = 0;
11110 r = r600_bytecode_add_alu(ctx->bc, &alu);
11111 if (r)
11112 return r;
11113
11114 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11115 alu.op = op;
11116 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11117 alu.src[0].sel = treg;
11118 alu.src[0].chan = 1;
11119 alu.src[1].sel = treg;
11120 alu.src[1].chan = 2;
11121 alu.last = 1;
11122 r = r600_bytecode_add_alu(ctx->bc, &alu);
11123 if (r)
11124 return r;
11125 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11126 alu.op = ALU_OP1_MOV;
11127 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11128 alu.src[0].sel = treg;
11129 alu.src[0].chan = 0;
11130 alu.last = 1;
11131 r = r600_bytecode_add_alu(ctx->bc, &alu);
11132 if (r)
11133 return r;
11134 return 0;
11135 }
11136
11137 /* result.y = mul_high a, b
11138 result.x = mul a,b
11139 result.y += a.x * b.y + a.y * b.x;
11140 */
11141 static int egcm_u64mul(struct r600_shader_ctx *ctx)
11142 {
11143 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11144 struct r600_bytecode_alu alu;
11145 int r;
11146 int treg = ctx->temp_reg;
11147
11148 /* temp.x = mul_lo a.x, b.x */
11149 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11150 alu.op = ALU_OP2_MULLO_UINT;
11151 alu.dst.sel = treg;
11152 alu.dst.chan = 0;
11153 alu.dst.write = 1;
11154 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11155 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11156 r = emit_mul_int_op(ctx->bc, &alu);
11157 if (r)
11158 return r;
11159
11160 /* temp.y = mul_hi a.x, b.x */
11161 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11162 alu.op = ALU_OP2_MULHI_UINT;
11163 alu.dst.sel = treg;
11164 alu.dst.chan = 1;
11165 alu.dst.write = 1;
11166 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11167 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11168 r = emit_mul_int_op(ctx->bc, &alu);
11169 if (r)
11170 return r;
11171
11172 /* temp.z = mul a.x, b.y */
11173 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11174 alu.op = ALU_OP2_MULLO_UINT;
11175 alu.dst.sel = treg;
11176 alu.dst.chan = 2;
11177 alu.dst.write = 1;
11178 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11179 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11180 r = emit_mul_int_op(ctx->bc, &alu);
11181 if (r)
11182 return r;
11183
11184 /* temp.w = mul a.y, b.x */
11185 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11186 alu.op = ALU_OP2_MULLO_UINT;
11187 alu.dst.sel = treg;
11188 alu.dst.chan = 3;
11189 alu.dst.write = 1;
11190 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11191 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11192 r = emit_mul_int_op(ctx->bc, &alu);
11193 if (r)
11194 return r;
11195
11196 /* temp.z = temp.z + temp.w */
11197 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11198 alu.op = ALU_OP2_ADD_INT;
11199 alu.dst.sel = treg;
11200 alu.dst.chan = 2;
11201 alu.dst.write = 1;
11202 alu.src[0].sel = treg;
11203 alu.src[0].chan = 2;
11204 alu.src[1].sel = treg;
11205 alu.src[1].chan = 3;
11206 alu.last = 1;
11207 r = r600_bytecode_add_alu(ctx->bc, &alu);
11208 if (r)
11209 return r;
11210
11211 /* temp.y = temp.y + temp.z */
11212 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11213 alu.op = ALU_OP2_ADD_INT;
11214 alu.dst.sel = treg;
11215 alu.dst.chan = 1;
11216 alu.dst.write = 1;
11217 alu.src[0].sel = treg;
11218 alu.src[0].chan = 1;
11219 alu.src[1].sel = treg;
11220 alu.src[1].chan = 2;
11221 alu.last = 1;
11222 r = r600_bytecode_add_alu(ctx->bc, &alu);
11223 if (r)
11224 return r;
11225
11226 /* dst.x = temp.x */
11227 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11228 alu.op = ALU_OP1_MOV;
11229 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11230 alu.src[0].sel = treg;
11231 alu.src[0].chan = 0;
11232 r = r600_bytecode_add_alu(ctx->bc, &alu);
11233 if (r)
11234 return r;
11235
11236 /* dst.y = temp.y */
11237 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11238 alu.op = ALU_OP1_MOV;
11239 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11240 alu.src[0].sel = treg;
11241 alu.src[0].chan = 1;
11242 alu.last = 1;
11243 r = r600_bytecode_add_alu(ctx->bc, &alu);
11244 if (r)
11245 return r;
11246
11247 return 0;
11248 }
11249
11250 static int emit_u64sge(struct r600_shader_ctx *ctx,
11251 int treg,
11252 int src0_sel, int src0_base_chan,
11253 int src1_sel, int src1_base_chan)
11254 {
11255 int r;
11256 /* for 64-bit sge */
11257 /* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
11258 r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
11259 treg, 1,
11260 src0_sel, src0_base_chan + 1,
11261 src1_sel, src1_base_chan + 1);
11262 if (r)
11263 return r;
11264
11265 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11266 treg, 0,
11267 src0_sel, src0_base_chan,
11268 src1_sel, src1_base_chan);
11269 if (r)
11270 return r;
11271
11272 r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
11273 treg, 2,
11274 src0_sel, src0_base_chan + 1,
11275 src1_sel, src1_base_chan + 1);
11276 if (r)
11277 return r;
11278
11279 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11280 treg, 0,
11281 treg, 0,
11282 treg, 2);
11283 if (r)
11284 return r;
11285
11286 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11287 treg, 0,
11288 treg, 0,
11289 treg, 1);
11290 if (r)
11291 return r;
11292 return 0;
11293 }
11294
11295 /* this isn't a complete div it's just enough for qbo shader to work */
11296 static int egcm_u64div(struct r600_shader_ctx *ctx)
11297 {
11298 struct r600_bytecode_alu alu;
11299 struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
11300 int r, i;
11301 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11302
11303 /* make sure we are dividing my a const with 0 in the high bits */
11304 if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
11305 return -1;
11306 if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
11307 return -1;
11308 /* make sure we are doing one division */
11309 if (inst->Dst[0].Register.WriteMask != 0x3)
11310 return -1;
11311
11312 /* emit_if uses ctx->temp_reg so we can't */
11313 int treg = r600_get_temp(ctx);
11314 int tmp_num = r600_get_temp(ctx);
11315 int sub_tmp = r600_get_temp(ctx);
11316
11317 /* tmp quot are tmp_num.zw */
11318 r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
11319 r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
11320 r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
11321 r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);
11322
11323 /* MOV tmp_num.xy, numerator */
11324 r = single_alu_op2(ctx, ALU_OP1_MOV,
11325 tmp_num, 0,
11326 alu_num_lo.sel, alu_num_lo.chan,
11327 0, 0);
11328 if (r)
11329 return r;
11330 r = single_alu_op2(ctx, ALU_OP1_MOV,
11331 tmp_num, 1,
11332 alu_num_hi.sel, alu_num_hi.chan,
11333 0, 0);
11334 if (r)
11335 return r;
11336
11337 r = single_alu_op2(ctx, ALU_OP1_MOV,
11338 tmp_num, 2,
11339 V_SQ_ALU_SRC_LITERAL, 0,
11340 0, 0);
11341 if (r)
11342 return r;
11343
11344 r = single_alu_op2(ctx, ALU_OP1_MOV,
11345 tmp_num, 3,
11346 V_SQ_ALU_SRC_LITERAL, 0,
11347 0, 0);
11348 if (r)
11349 return r;
11350
11351 /* treg 0 is log2_denom */
11352 /* normally this gets the MSB for the denom high value
11353 - however we know this will always be 0 here. */
11354 r = single_alu_op2(ctx,
11355 ALU_OP1_MOV,
11356 treg, 0,
11357 V_SQ_ALU_SRC_LITERAL, 32,
11358 0, 0);
11359 if (r)
11360 return r;
11361
11362 /* normally check demon hi for 0, but we know it is already */
11363 /* t0.z = num_hi >= denom_lo */
11364 r = single_alu_op2(ctx,
11365 ALU_OP2_SETGE_UINT,
11366 treg, 1,
11367 alu_num_hi.sel, alu_num_hi.chan,
11368 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11369 if (r)
11370 return r;
11371
11372 memset(&alu_src, 0, sizeof(alu_src));
11373 alu_src.sel = treg;
11374 alu_src.chan = 1;
11375 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11376 if (r)
11377 return r;
11378
11379 /* for loops in here */
11380 /* get msb t0.x = msb(src[1].x) first */
11381 int msb_lo = util_last_bit(alu_denom_lo.value);
11382 r = single_alu_op2(ctx, ALU_OP1_MOV,
11383 treg, 0,
11384 V_SQ_ALU_SRC_LITERAL, msb_lo,
11385 0, 0);
11386 if (r)
11387 return r;
11388
11389 /* unroll the asm here */
11390 for (i = 0; i < 31; i++) {
11391 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11392 treg, 2,
11393 V_SQ_ALU_SRC_LITERAL, i,
11394 treg, 0);
11395 if (r)
11396 return r;
11397
11398 /* we can do this on the CPU */
11399 uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
11400 /* t0.z = tmp_num.y >= t0.z */
11401 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11402 treg, 1,
11403 tmp_num, 1,
11404 V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11405 if (r)
11406 return r;
11407
11408 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11409 treg, 1,
11410 treg, 1,
11411 treg, 2);
11412 if (r)
11413 return r;
11414
11415 memset(&alu_src, 0, sizeof(alu_src));
11416 alu_src.sel = treg;
11417 alu_src.chan = 1;
11418 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11419 if (r)
11420 return r;
11421
11422 r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11423 tmp_num, 1,
11424 tmp_num, 1,
11425 V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11426 if (r)
11427 return r;
11428
11429 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11430 tmp_num, 3,
11431 tmp_num, 3,
11432 V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11433 if (r)
11434 return r;
11435
11436 r = tgsi_endif(ctx);
11437 if (r)
11438 return r;
11439 }
11440
11441 /* log2_denom is always <= 31, so manually peel the last loop
11442 * iteration.
11443 */
11444 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11445 treg, 1,
11446 tmp_num, 1,
11447 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11448 if (r)
11449 return r;
11450
11451 memset(&alu_src, 0, sizeof(alu_src));
11452 alu_src.sel = treg;
11453 alu_src.chan = 1;
11454 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11455 if (r)
11456 return r;
11457
11458 r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11459 tmp_num, 1,
11460 tmp_num, 1,
11461 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11462 if (r)
11463 return r;
11464
11465 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11466 tmp_num, 3,
11467 tmp_num, 3,
11468 V_SQ_ALU_SRC_LITERAL, 1U);
11469 if (r)
11470 return r;
11471 r = tgsi_endif(ctx);
11472 if (r)
11473 return r;
11474
11475 r = tgsi_endif(ctx);
11476 if (r)
11477 return r;
11478
11479 /* onto the second loop to unroll */
11480 for (i = 0; i < 31; i++) {
11481 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11482 treg, 1,
11483 V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
11484 treg, 0);
11485 if (r)
11486 return r;
11487
11488 uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
11489 r = single_alu_op2(ctx, ALU_OP1_MOV,
11490 treg, 2,
11491 V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11492 0, 0);
11493 if (r)
11494 return r;
11495
11496 r = single_alu_op2(ctx, ALU_OP1_MOV,
11497 treg, 3,
11498 V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11499 0, 0);
11500 if (r)
11501 return r;
11502
11503 r = emit_u64sge(ctx, sub_tmp,
11504 tmp_num, 0,
11505 treg, 2);
11506 if (r)
11507 return r;
11508
11509 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11510 treg, 1,
11511 treg, 1,
11512 sub_tmp, 0);
11513 if (r)
11514 return r;
11515
11516 memset(&alu_src, 0, sizeof(alu_src));
11517 alu_src.sel = treg;
11518 alu_src.chan = 1;
11519 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11520 if (r)
11521 return r;
11522
11523
11524 r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11525 sub_tmp,
11526 tmp_num, 0,
11527 treg, 2);
11528 if (r)
11529 return r;
11530
11531 r = single_alu_op2(ctx, ALU_OP1_MOV,
11532 tmp_num, 0,
11533 sub_tmp, 0,
11534 0, 0);
11535 if (r)
11536 return r;
11537
11538 r = single_alu_op2(ctx, ALU_OP1_MOV,
11539 tmp_num, 1,
11540 sub_tmp, 1,
11541 0, 0);
11542 if (r)
11543 return r;
11544
11545 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11546 tmp_num, 2,
11547 tmp_num, 2,
11548 V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11549 if (r)
11550 return r;
11551
11552 r = tgsi_endif(ctx);
11553 if (r)
11554 return r;
11555 }
11556
11557 /* log2_denom is always <= 63, so manually peel the last loop
11558 * iteration.
11559 */
11560 uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
11561 r = single_alu_op2(ctx, ALU_OP1_MOV,
11562 treg, 2,
11563 V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11564 0, 0);
11565 if (r)
11566 return r;
11567
11568 r = single_alu_op2(ctx, ALU_OP1_MOV,
11569 treg, 3,
11570 V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11571 0, 0);
11572 if (r)
11573 return r;
11574
11575 r = emit_u64sge(ctx, sub_tmp,
11576 tmp_num, 0,
11577 treg, 2);
11578 if (r)
11579 return r;
11580
11581 memset(&alu_src, 0, sizeof(alu_src));
11582 alu_src.sel = sub_tmp;
11583 alu_src.chan = 0;
11584 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11585 if (r)
11586 return r;
11587
11588 r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11589 sub_tmp,
11590 tmp_num, 0,
11591 treg, 2);
11592 if (r)
11593 return r;
11594
11595 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11596 tmp_num, 2,
11597 tmp_num, 2,
11598 V_SQ_ALU_SRC_LITERAL, 1U);
11599 if (r)
11600 return r;
11601 r = tgsi_endif(ctx);
11602 if (r)
11603 return r;
11604
11605 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11606 alu.op = ALU_OP1_MOV;
11607 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11608 alu.src[0].sel = tmp_num;
11609 alu.src[0].chan = 2;
11610 r = r600_bytecode_add_alu(ctx->bc, &alu);
11611 if (r)
11612 return r;
11613
11614 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11615 alu.op = ALU_OP1_MOV;
11616 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11617 alu.src[0].sel = tmp_num;
11618 alu.src[0].chan = 3;
11619 alu.last = 1;
11620 r = r600_bytecode_add_alu(ctx->bc, &alu);
11621 if (r)
11622 return r;
11623 return 0;
11624 }
11625
11626 static int egcm_u64sne(struct r600_shader_ctx *ctx)
11627 {
11628 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11629 struct r600_bytecode_alu alu;
11630 int r;
11631 int treg = ctx->temp_reg;
11632
11633 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11634 alu.op = ALU_OP2_SETNE_INT;
11635 alu.dst.sel = treg;
11636 alu.dst.chan = 0;
11637 alu.dst.write = 1;
11638 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11639 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11640 r = r600_bytecode_add_alu(ctx->bc, &alu);
11641 if (r)
11642 return r;
11643
11644 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11645 alu.op = ALU_OP2_SETNE_INT;
11646 alu.dst.sel = treg;
11647 alu.dst.chan = 1;
11648 alu.dst.write = 1;
11649 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11650 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11651 alu.last = 1;
11652 r = r600_bytecode_add_alu(ctx->bc, &alu);
11653 if (r)
11654 return r;
11655
11656 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11657 alu.op = ALU_OP2_OR_INT;
11658 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11659 alu.src[0].sel = treg;
11660 alu.src[0].chan = 0;
11661 alu.src[1].sel = treg;
11662 alu.src[1].chan = 1;
11663 alu.last = 1;
11664 r = r600_bytecode_add_alu(ctx->bc, &alu);
11665 if (r)
11666 return r;
11667 return 0;
11668 }
11669
11670 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
11671 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
11672 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
11673 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
11674
11675 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
11676
11677 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
11678 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
11679 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
11680 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
11681 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
11682 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11683 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11684 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
11685 /* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
11686 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
11687 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
11688 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
11689 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
11690 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
11691 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
11692 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
11693 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
11694 [21] = { ALU_OP0_NOP, tgsi_unsupported},
11695 [22] = { ALU_OP0_NOP, tgsi_unsupported},
11696 [23] = { ALU_OP0_NOP, tgsi_unsupported},
11697 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
11698 [25] = { ALU_OP0_NOP, tgsi_unsupported},
11699 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
11700 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
11701 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
11702 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
11703 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
11704 [31] = { ALU_OP0_NOP, tgsi_unsupported},
11705 [32] = { ALU_OP0_NOP, tgsi_unsupported},
11706 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
11707 [34] = { ALU_OP0_NOP, tgsi_unsupported},
11708 [35] = { ALU_OP0_NOP, tgsi_unsupported},
11709 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
11710 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11711 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11712 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
11713 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
11714 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
11715 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
11716 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11717 [44] = { ALU_OP0_NOP, tgsi_unsupported},
11718 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
11719 [46] = { ALU_OP0_NOP, tgsi_unsupported},
11720 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
11721 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
11722 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
11723 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
11724 [51] = { ALU_OP0_NOP, tgsi_unsupported},
11725 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
11726 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
11727 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
11728 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
11729 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
11730 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
11731 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11732 [59] = { ALU_OP0_NOP, tgsi_unsupported},
11733 [60] = { ALU_OP0_NOP, tgsi_unsupported},
11734 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
11735 [62] = { ALU_OP0_NOP, tgsi_unsupported},
11736 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
11737 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
11738 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
11739 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
11740 [67] = { ALU_OP0_NOP, tgsi_unsupported},
11741 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11742 [69] = { ALU_OP0_NOP, tgsi_unsupported},
11743 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
11744 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11745 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11746 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
11747 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
11748 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
11749 [76] = { ALU_OP0_NOP, tgsi_unsupported},
11750 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
11751 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
11752 [TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
11753 [TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
11754 [81] = { ALU_OP0_NOP, tgsi_unsupported},
11755 [82] = { ALU_OP0_NOP, tgsi_unsupported},
11756 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
11757 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
11758 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
11759 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
11760 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
11761 [88] = { ALU_OP0_NOP, tgsi_unsupported},
11762 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
11763 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
11764 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
11765 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
11766 [93] = { ALU_OP0_NOP, tgsi_unsupported},
11767 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
11768 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11769 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
11770 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
11771 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
11772 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
11773 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11774 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
11775 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11776 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11777 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
11778 [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
11779 [106] = { ALU_OP0_NOP, tgsi_unsupported},
11780 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
11781 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
11782 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
11783 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
11784 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
11785 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
11786 [113] = { ALU_OP0_NOP, tgsi_unsupported},
11787 [114] = { ALU_OP0_NOP, tgsi_unsupported},
11788 [115] = { ALU_OP0_NOP, tgsi_unsupported},
11789 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
11790 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
11791 [TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
11792 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
11793 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
11794 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
11795 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
11796 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
11797 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
11798 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
11799 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
11800 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
11801 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
11802 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
11803 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
11804 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
11805 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
11806 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
11807 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
11808 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
11809 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
11810 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
11811 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
11812 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
11813 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
11814 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
11815 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
11816 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
11817 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
11818 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
11819 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
11820 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
11821 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
11822 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
11823 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
11824 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
11825 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
11826 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
11827 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
11828 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
11829 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
11830 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
11831 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
11832 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
11833 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
11834 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
11835 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
11836 [163] = { ALU_OP0_NOP, tgsi_unsupported},
11837 [164] = { ALU_OP0_NOP, tgsi_unsupported},
11838 [165] = { ALU_OP0_NOP, tgsi_unsupported},
11839 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
11840 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
11841 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
11842 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
11843 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
11844 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
11845 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
11846 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
11847 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
11848 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
11849 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
11850 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
11851 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11852 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11853 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
11854 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
11855 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
11856 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
11857 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
11858 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
11859 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
11860 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
11861 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
11862 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
11863 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
11864 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
11865 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
11866 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
11867 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
11868 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
11869 };
11870
11871 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
11872 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
11873 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
11874 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
11875 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
11876 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
11877 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
11878 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
11879 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
11880 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
11881 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11882 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11883 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
11884 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
11885 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
11886 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
11887 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
11888 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
11889 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
11890 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
11891 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
11892 [21] = { ALU_OP0_NOP, tgsi_unsupported},
11893 [22] = { ALU_OP0_NOP, tgsi_unsupported},
11894 [23] = { ALU_OP0_NOP, tgsi_unsupported},
11895 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
11896 [25] = { ALU_OP0_NOP, tgsi_unsupported},
11897 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
11898 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
11899 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
11900 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
11901 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
11902 [31] = { ALU_OP0_NOP, tgsi_unsupported},
11903 [32] = { ALU_OP0_NOP, tgsi_unsupported},
11904 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
11905 [34] = { ALU_OP0_NOP, tgsi_unsupported},
11906 [35] = { ALU_OP0_NOP, tgsi_unsupported},
11907 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
11908 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11909 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11910 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
11911 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
11912 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
11913 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
11914 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11915 [44] = { ALU_OP0_NOP, tgsi_unsupported},
11916 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
11917 [46] = { ALU_OP0_NOP, tgsi_unsupported},
11918 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
11919 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
11920 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
11921 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
11922 [51] = { ALU_OP0_NOP, tgsi_unsupported},
11923 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
11924 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
11925 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
11926 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
11927 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
11928 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
11929 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11930 [59] = { ALU_OP0_NOP, tgsi_unsupported},
11931 [60] = { ALU_OP0_NOP, tgsi_unsupported},
11932 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
11933 [62] = { ALU_OP0_NOP, tgsi_unsupported},
11934 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
11935 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
11936 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
11937 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
11938 [67] = { ALU_OP0_NOP, tgsi_unsupported},
11939 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11940 [69] = { ALU_OP0_NOP, tgsi_unsupported},
11941 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
11942 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11943 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11944 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
11945 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
11946 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
11947 [76] = { ALU_OP0_NOP, tgsi_unsupported},
11948 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
11949 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
11950 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11951 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11952 [82] = { ALU_OP0_NOP, tgsi_unsupported},
11953 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
11954 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
11955 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
11956 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
11957 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
11958 [88] = { ALU_OP0_NOP, tgsi_unsupported},
11959 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
11960 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
11961 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
11962 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
11963 [93] = { ALU_OP0_NOP, tgsi_unsupported},
11964 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
11965 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11966 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
11967 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
11968 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
11969 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
11970 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11971 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
11972 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11973 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11974 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
11975 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
11976 [106] = { ALU_OP0_NOP, tgsi_unsupported},
11977 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
11978 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
11979 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
11980 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
11981 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
11982 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
11983 [113] = { ALU_OP0_NOP, tgsi_unsupported},
11984 [114] = { ALU_OP0_NOP, tgsi_unsupported},
11985 [115] = { ALU_OP0_NOP, tgsi_unsupported},
11986 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
11987 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
11988 /* Refer below for TGSI_OPCODE_DFMA */
11989 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
11990 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
11991 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
11992 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
11993 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
11994 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
11995 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
11996 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
11997 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
11998 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
11999 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
12000 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
12001 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
12002 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
12003 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
12004 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
12005 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
12006 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
12007 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
12008 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
12009 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12010 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
12011 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12012 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
12013 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
12014 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12015 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
12016 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
12017 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
12018 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
12019 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
12020 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
12021 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
12022 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
12023 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
12024 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
12025 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
12026 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
12027 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12028 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
12029 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
12030 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
12031 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
12032 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
12033 [163] = { ALU_OP0_NOP, tgsi_unsupported},
12034 [164] = { ALU_OP0_NOP, tgsi_unsupported},
12035 [165] = { ALU_OP0_NOP, tgsi_unsupported},
12036 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12037 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12038 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12039 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12040 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12041 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12042 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12043 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12044 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12045 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12046 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12047 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
12048 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
12049 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
12050 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
12051 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
12052 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
12053 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
12054 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
12055 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
12056 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
12057 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
12058 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
12059 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
12060 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
12061 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
12062 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
12063 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
12064 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
12065 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12066 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
12067 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
12068 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
12069 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
12070 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
12071 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
12072 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
12073 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
12074 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12075 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12076 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12077 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12078 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12079 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12080 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
12081 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
12082 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
12083 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
12084 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
12085 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12086 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12087 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12088 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12089 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
12090 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
12091 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
12092 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
12093 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
12094 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
12095 };
12096
12097 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
12098 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
12099 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
12100 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
12101 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
12102 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
12103 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
12104 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
12105 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
12106 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
12107 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12108 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12109 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
12110 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
12111 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
12112 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
12113 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
12114 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
12115 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
12116 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
12117 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
12118 [21] = { ALU_OP0_NOP, tgsi_unsupported},
12119 [22] = { ALU_OP0_NOP, tgsi_unsupported},
12120 [23] = { ALU_OP0_NOP, tgsi_unsupported},
12121 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
12122 [25] = { ALU_OP0_NOP, tgsi_unsupported},
12123 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
12124 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
12125 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
12126 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
12127 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
12128 [31] = { ALU_OP0_NOP, tgsi_unsupported},
12129 [32] = { ALU_OP0_NOP, tgsi_unsupported},
12130 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
12131 [34] = { ALU_OP0_NOP, tgsi_unsupported},
12132 [35] = { ALU_OP0_NOP, tgsi_unsupported},
12133 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
12134 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12135 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12136 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
12137 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
12138 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
12139 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
12140 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
12141 [44] = { ALU_OP0_NOP, tgsi_unsupported},
12142 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
12143 [46] = { ALU_OP0_NOP, tgsi_unsupported},
12144 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
12145 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
12146 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
12147 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
12148 [51] = { ALU_OP0_NOP, tgsi_unsupported},
12149 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
12150 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
12151 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
12152 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
12153 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
12154 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
12155 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
12156 [59] = { ALU_OP0_NOP, tgsi_unsupported},
12157 [60] = { ALU_OP0_NOP, tgsi_unsupported},
12158 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
12159 [62] = { ALU_OP0_NOP, tgsi_unsupported},
12160 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
12161 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
12162 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
12163 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
12164 [67] = { ALU_OP0_NOP, tgsi_unsupported},
12165 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
12166 [69] = { ALU_OP0_NOP, tgsi_unsupported},
12167 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
12168 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
12169 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
12170 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
12171 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
12172 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
12173 [76] = { ALU_OP0_NOP, tgsi_unsupported},
12174 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
12175 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
12176 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
12177 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
12178 [82] = { ALU_OP0_NOP, tgsi_unsupported},
12179 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
12180 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
12181 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
12182 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
12183 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
12184 [88] = { ALU_OP0_NOP, tgsi_unsupported},
12185 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
12186 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
12187 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
12188 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
12189 [93] = { ALU_OP0_NOP, tgsi_unsupported},
12190 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
12191 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12192 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
12193 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
12194 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
12195 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
12196 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
12197 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
12198 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
12199 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
12200 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
12201 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
12202 [106] = { ALU_OP0_NOP, tgsi_unsupported},
12203 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
12204 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
12205 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
12206 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
12207 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
12208 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12209 [113] = { ALU_OP0_NOP, tgsi_unsupported},
12210 [114] = { ALU_OP0_NOP, tgsi_unsupported},
12211 [115] = { ALU_OP0_NOP, tgsi_unsupported},
12212 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
12213 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
12214 /* Refer below for TGSI_OPCODE_DFMA */
12215 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
12216 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
12217 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
12218 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
12219 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
12220 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
12221 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
12222 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
12223 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
12224 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
12225 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
12226 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
12227 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
12228 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
12229 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
12230 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
12231 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
12232 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
12233 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
12234 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
12235 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
12236 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
12237 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12238 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
12239 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
12240 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
12241 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
12242 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
12243 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
12244 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
12245 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
12246 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
12247 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
12248 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
12249 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
12250 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
12251 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
12252 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
12253 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
12254 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
12255 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
12256 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
12257 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
12258 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
12259 [163] = { ALU_OP0_NOP, tgsi_unsupported},
12260 [164] = { ALU_OP0_NOP, tgsi_unsupported},
12261 [165] = { ALU_OP0_NOP, tgsi_unsupported},
12262 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
12263 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
12264 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
12265 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
12266 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
12267 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
12268 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
12269 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
12270 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
12271 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
12272 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
12273 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
12274 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
12275 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
12276 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
12277 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
12278 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
12279 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
12280 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
12281 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
12282 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
12283 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
12284 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
12285 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
12286 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
12287 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
12288 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
12289 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
12290 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
12291 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
12292 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
12293 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
12294 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
12295 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
12296 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
12297 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
12298 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
12299 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
12300 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
12301 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
12302 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
12303 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
12304 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
12305 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
12306 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
12307 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
12308 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
12309 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
12310 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
12311 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
12312 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
12313 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
12314 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
12315 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
12316 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
12317 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
12318 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
12319 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
12320 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
12321 };