r600: implement tg4 integer workaround. (v2)
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28
29 #include "sb/sb_public.h"
30
31 #include "pipe/p_shader_tokens.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_scan.h"
35 #include "tgsi/tgsi_dump.h"
36 #include "util/u_bitcast.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 /* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66 .y = RelVertexID (??)
67 .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70 r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73 .y = RelPatchID (??)
74 .z = InvocationID
75 .w = tess factor base.
76
77 TES - .x = TessCoord.x
78 - .y = TessCoord.y
79 - .z = RelPatchID (??)
80 - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83 face_gpr.w = SampleID
84 */
85 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86 static int r600_shader_from_tgsi(struct r600_context *rctx,
87 struct r600_pipe_shader *pipeshader,
88 union r600_shader_key key);
89
90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91 int size, unsigned comp_mask) {
92
93 if (!size)
94 return;
95
96 if (ps->num_arrays == ps->max_arrays) {
97 ps->max_arrays += 64;
98 ps->arrays = realloc(ps->arrays, ps->max_arrays *
99 sizeof(struct r600_shader_array));
100 }
101
102 int n = ps->num_arrays;
103 ++ps->num_arrays;
104
105 ps->arrays[n].comp_mask = comp_mask;
106 ps->arrays[n].gpr_start = start_gpr;
107 ps->arrays[n].gpr_count = size;
108 }
109
110 static void r600_dump_streamout(struct pipe_stream_output_info *so)
111 {
112 unsigned i;
113
114 fprintf(stderr, "STREAMOUT\n");
115 for (i = 0; i < so->num_outputs; i++) {
116 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117 so->output[i].start_component;
118 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119 i,
120 so->output[i].stream,
121 so->output[i].output_buffer,
122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123 so->output[i].register_index,
124 mask & 1 ? "x" : "",
125 mask & 2 ? "y" : "",
126 mask & 4 ? "z" : "",
127 mask & 8 ? "w" : "",
128 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129 }
130 }
131
/*
 * Upload the built bytecode into a GPU buffer (shader->bo).
 *
 * Creates the buffer lazily; if shader->bo already exists this is a no-op.
 * Returns 0 on success or -ENOMEM if the buffer could not be created.
 */
static int store_shader(struct pipe_context *ctx,
			struct r600_pipe_shader *shader)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	uint32_t *ptr, i;

	if (shader->bo == NULL) {
		/* bc.ndw counts 32-bit words, hence the * 4 for the byte size. */
		shader->bo = (struct r600_resource*)
			pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
		if (shader->bo == NULL) {
			return -ENOMEM;
		}
		/* NOTE(review): the map return value is not NULL-checked here —
		 * presumably it cannot fail for a freshly created buffer; confirm. */
		ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
		if (R600_BIG_ENDIAN) {
			/* The GPU consumes little-endian words; swap on BE hosts. */
			for (i = 0; i < shader->shader.bc.ndw; ++i) {
				ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
			}
		} else {
			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
		}
		rctx->b.ws->buffer_unmap(shader->bo->buf);
	}

	return 0;
}
157
/*
 * Compile a TGSI shader into r600 bytecode and build the per-stage
 * hardware state.
 *
 * Steps: optionally dump the TGSI, translate TGSI -> bytecode
 * (r600_shader_from_tgsi), optionally run the SB optimizing backend,
 * upload the bytecode to a GPU buffer, then dispatch to the per-stage
 * state-update helper.
 *
 * Returns 0 on success; on any failure the shader is destroyed and a
 * negative error code is returned.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	/* SB is the optimizing scheduler/backend; both flags below are debug knobs. */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	/* SB does not handle these stages/features; fall back to the plain bytecode. */
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;
	use_sb &= !shader->shader.uses_helper_invocation;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* SB also serves as the disassembler when requested with dump. */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			/* dump the GS copy shader (disassemble only, use_sb = 0) */
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		/* GS always comes with a copy shader that performs the VS exports. */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_COMPUTE:
		evergreen_update_ls_state(ctx, shader);
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
297
/* Release all resources owned by a compiled shader: the GPU buffer,
 * the bytecode storage, and the prebuilt command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
304
305 /*
306 * tgsi -> r600 shader
307 */
308 struct r600_shader_tgsi_instruction;
309
/* A TGSI source operand decoded into r600 terms. */
struct r600_shader_src {
	unsigned sel;         /* register selector / special-value selector */
	unsigned swizzle[4];  /* per-channel swizzle */
	unsigned neg;         /* negate modifier */
	unsigned abs;         /* absolute-value modifier */
	unsigned rel;         /* relative (indirect) addressing */
	unsigned kc_bank;     /* constant-cache bank for constant operands */
	boolean kc_rel;       /* true if cache bank is indexed */
	uint32_t value[4];    /* literal values when sel is a literal */
};
320
/* Per-interpolator state for Evergreen interpolation setup. */
struct eg_interp {
	boolean enabled;    /* interpolator is used by some input */
	unsigned ij_index;  /* which i/j barycentric pair it reads */
};
325
/* All state carried through the TGSI -> r600 bytecode translation. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;       /* PIPE_SHADER_* stage */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per TGSI file */
	unsigned				temp_reg;   /* first driver-temp GPR */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];     /* decoded sources of the current instruction */
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;      /* output index of CLIPVERTEX */
	unsigned		edgeflag_output; /* output index of EDGEFLAG */
	int					helper_invoc_reg;
	int                                     cs_block_size_reg;
	int                                     cs_grid_size_reg;
	bool cs_block_size_loaded, cs_grid_size_loaded;
	int					fragcoord_input; /* input index of POSITION in FS */
	int					next_ring_offset; /* running GS input ring offset */
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;     /* GS info when building its VS copy shader */
	int					gs_export_gpr_tregs[4];
	int                                     gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned                                tess_input_info; /* temp with tess input offsets */
	unsigned                                tess_output_info; /* temp with tess input offsets */
	unsigned                                thread_id_gpr; /* temp with thread id calculated for images */
	bool thread_id_gpr_loaded;
};
368
/* Maps one TGSI opcode to its r600 ALU op and emit callback. */
struct r600_shader_tgsi_instruction {
	unsigned	op;      /* r600 ALU/TEX/VTX opcode */
	int (*process)(struct r600_shader_ctx *ctx); /* emits bytecode for the instruction */
};
373
374 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
375 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
376 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
377 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
378 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
379 static int tgsi_else(struct r600_shader_ctx *ctx);
380 static int tgsi_endif(struct r600_shader_ctx *ctx);
381 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
382 static int tgsi_endloop(struct r600_shader_ctx *ctx);
383 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
384 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
385 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
386 unsigned int dst_reg);
387 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
388 const struct r600_shader_src *shader_src,
389 unsigned chan);
390 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
391 unsigned dst_reg, unsigned mask);
392
/* Return the index of the highest component (0..3) set in the 4-bit
 * writemask; 0 when the mask is empty. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
404
/*
 * Reject TGSI instructions this backend cannot translate.
 *
 * Checks the current parsed instruction for: multiple destinations
 * (only DFRACEXP is allowed two), and 2D (dimensioned) operands on
 * files/stages where indirection is not supported.
 * Returns 0 if supported, -EINVAL otherwise.
 */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				/* fallthrough - dimensioned INPUT unsupported for other stages */
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				/* fallthrough - report the unsupported operand */
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			/* TCS may write dimensioned (per-vertex) outputs. */
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
452
453 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
454 {
455 if (interpolate == TGSI_INTERPOLATE_COLOR ||
456 interpolate == TGSI_INTERPOLATE_LINEAR ||
457 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
458 {
459 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
460 int loc;
461
462 switch(location) {
463 case TGSI_INTERPOLATE_LOC_CENTER:
464 loc = 1;
465 break;
466 case TGSI_INTERPOLATE_LOC_CENTROID:
467 loc = 2;
468 break;
469 case TGSI_INTERPOLATE_LOC_SAMPLE:
470 default:
471 loc = 0; break;
472 }
473
474 return is_linear * 3 + loc;
475 }
476
477 return -1;
478 }
479
/* Assign the input its i/j barycentric pair index, looked up from the
 * interpolator table that was filled in earlier for this shader. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
489
/*
 * Emit the Evergreen INTERP_ZW/INTERP_XY ALU sequence that interpolates
 * one FS input from its i/j barycentrics.
 *
 * Two i/j pairs are packed per GPR, so gpr = ij_index / 2 and the pair's
 * channels start at base_chan (1 or 3).  Eight ops are emitted: the
 * first four are INTERP_ZW, the last four INTERP_XY; only the middle
 * iterations (i == 2..5) actually write the destination — the others
 * exist to feed the interpolator pipeline.  Bank swizzle is forced to
 * VEC_210 as required by these ops (NOTE(review): per HW constraint,
 * confirm against the ISA doc).
 */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the j and i channels of the pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
530
531 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
532 {
533 int i, r;
534 struct r600_bytecode_alu alu;
535
536 for (i = 0; i < 4; i++) {
537 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
538
539 alu.op = ALU_OP1_INTERP_LOAD_P0;
540
541 alu.dst.sel = ctx->shader->input[input].gpr;
542 alu.dst.write = 1;
543
544 alu.dst.chan = i;
545
546 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
547 alu.src[0].chan = i;
548
549 if (i == 3)
550 alu.last = 1;
551 r = r600_bytecode_add_alu(ctx->bc, &alu);
552 if (r)
553 return r;
554 }
555 return 0;
556 }
557
558 /*
559 * Special export handling in shaders
560 *
561 * shader export ARRAY_BASE for EXPORT_POS:
562 * 60 is position
563 * 61 is misc vector
564 * 62, 63 are clip distance vectors
565 *
566 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
567 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
568 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
569 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
570 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
571 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
572 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
573 * exclusive from render target index)
574 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
575 *
576 *
577 * shader export ARRAY_BASE for EXPORT_PIXEL:
578 * 0-7 CB targets
579 * 61 computed Z vector
580 *
581 * The use of the values exported in the computed Z vector are controlled
582 * by DB_SHADER_CONTROL:
583 * Z_EXPORT_ENABLE - Z as a float in RED
584 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
585 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
586 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
587 * DB_SOURCE_FORMAT - export control restrictions
588 *
589 */
590
591
592 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
593 static int r600_spi_sid(struct r600_shader_io * io)
594 {
595 int index, name = io->name;
596
597 /* These params are handled differently, they don't need
598 * semantic indices, so we'll use 0 for them.
599 */
600 if (name == TGSI_SEMANTIC_POSITION ||
601 name == TGSI_SEMANTIC_PSIZE ||
602 name == TGSI_SEMANTIC_EDGEFLAG ||
603 name == TGSI_SEMANTIC_FACE ||
604 name == TGSI_SEMANTIC_SAMPLEMASK)
605 index = 0;
606 else {
607 if (name == TGSI_SEMANTIC_GENERIC) {
608 /* For generic params simply use sid from tgsi */
609 index = io->sid;
610 } else {
611 /* For non-generic params - pack name and sid into 8 bits */
612 index = 0x80 | (name<<3) | (io->sid);
613 }
614
615 /* Make sure that all really used indices have nonzero value, so
616 * we can just compare it to 0 later instead of comparing the name
617 * with different values to detect special cases. */
618 index++;
619 }
620
621 return index;
622 };
623
/* we need this to get a common lds index for vs/tcs/tes input/outputs */
int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		/* spi_sid is 0 for clipdistance outputs that were generated
		 * for clipvertex - we don't need to pass them to tcs/tes */
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		/* NOTE(review): assumes generic sids start at 9 in this
		 * scheme, hence the "- 9" rebias — confirm against the
		 * semantic-index assignment in the state tracker. */
		if (index <= 63-4)
			return 4 + index - 9;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}
661
/* turn input into interpolate on EG */
/* Allocates the input an LDS parameter slot, then emits either the
 * barycentric interpolation sequence (interpolate > 0) or a flat
 * P0 load.  Inputs without a spi_sid are skipped.  Returns 0 or a
 * negative error from bytecode emission. */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			r = evergreen_interp_alu(ctx, index);
		} else {
			r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}
678
679 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
680 {
681 struct r600_bytecode_alu alu;
682 int i, r;
683 int gpr_front = ctx->shader->input[front].gpr;
684 int gpr_back = ctx->shader->input[back].gpr;
685
686 for (i = 0; i < 4; i++) {
687 memset(&alu, 0, sizeof(alu));
688 alu.op = ALU_OP3_CNDGT;
689 alu.is_op3 = 1;
690 alu.dst.write = 1;
691 alu.dst.sel = gpr_front;
692 alu.src[0].sel = ctx->face_gpr;
693 alu.src[1].sel = gpr_front;
694 alu.src[2].sel = gpr_back;
695
696 alu.dst.chan = i;
697 alu.src[1].chan = i;
698 alu.src[2].chan = i;
699 alu.last = (i==3);
700
701 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
702 return r;
703 }
704
705 return 0;
706 }
707
/* execute a single slot ALU calculation */
/*
 * Emit one two-source ALU op writing dst_sel.dst_chan.
 *
 * A source whose sel is V_SQ_ALU_SRC_LITERAL takes its *_chan_val
 * argument as the literal value; otherwise it is the source channel.
 *
 * Cayman has no trans (t) slot, so MULLO_INT is issued across all four
 * vector slots with only the requested channel's write enabled.
 */
static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val)
{
	struct r600_bytecode_alu alu;
	int r, i;

	if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = op;
			alu.src[0].sel = src0_sel;
			if (src0_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[0].value = src0_chan_val;
			else
				alu.src[0].chan = src0_chan_val;
			alu.src[1].sel = src1_sel;
			if (src1_sel == V_SQ_ALU_SRC_LITERAL)
				alu.src[1].value = src1_chan_val;
			else
				alu.src[1].chan = src1_chan_val;
			alu.dst.sel = dst_sel;
			alu.dst.chan = i;
			/* only the requested channel commits its result */
			alu.dst.write = i == dst_chan;
			alu.last = (i == 3);
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
763
/* execute a single slot ALU calculation */
/*
 * Emit one three-source (op3) ALU op writing dst_sel.dst_chan.
 * Literal sources are handled like in single_alu_op2.  Only the ops in
 * the assert below have been validated with this helper; extend the
 * whitelist when using it for others.
 * NOTE(review): dst.write is not set here — presumably op3 instructions
 * always write their destination; confirm against the ISA encoding.
 */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
802
/* put it in temp_reg.x */
/*
 * Compute the LDS byte offset of patch 0 data for the current patch:
 * temp_reg.x = patch_stride * rel_patch_id + patch0_offset, via a
 * single MULADD_UINT24.  is_patch_var selects the per-patch data base
 * (input_vals.w) instead of the per-vertex base (input_vals.z).
 */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}
824
825 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
826 {
827 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
828 }
829
830 static int r600_get_temp(struct r600_shader_ctx *ctx)
831 {
832 return ctx->temp_reg + ctx->max_driver_temp_used++;
833 }
834
/* Append a synthetic PRIMID output (GPR 0, z component only) to the
 * vertex shader's output table, tagged with the given SPI semantic id.
 * Always returns 0. */
static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
{
	int i;
	i = ctx->shader->noutput++;
	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
	ctx->shader->output[i].sid = 0;
	ctx->shader->output[i].gpr = 0;
	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
	ctx->shader->output[i].write_mask = 0x4; /* z only */
	ctx->shader->output[i].spi_sid = prim_id_sid;

	return 0;
}
848
849 static int tgsi_barrier(struct r600_shader_ctx *ctx)
850 {
851 struct r600_bytecode_alu alu;
852 int r;
853
854 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
855 alu.op = ctx->inst_info->op;
856 alu.last = 1;
857
858 r = r600_bytecode_add_alu(ctx->bc, &alu);
859 if (r)
860 return r;
861 return 0;
862 }
863
/*
 * Process one TGSI declaration token, filling in the shader's
 * input/output tables, GPR arrays, atomic ranges, and system values.
 *
 * Returns 0 on success or a negative error code for unsupported files
 * or failed bytecode emission.
 */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		/* One table entry per declared register in the range. */
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					/* On EG+ FS inputs are explicitly interpolated here. */
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* Track the outputs that feed the misc vector (export 61)
				 * and related PA_CL_VS_OUT_CNTL state. */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* Only indirectly addressed temp arrays need bookkeeping. */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_BUFFER:
	case TGSI_FILE_IMAGE:
	case TGSI_FILE_MEMORY:
		/* Nothing to record at declaration time for these files. */
		break;

	case TGSI_FILE_HW_ATOMIC:
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* Fetch the tess factors from LDS into a fixed GPR
			 * (3 for inner, 2 for outer). */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			/* NOTE(review): return value of the LDS fetch is ignored here. */
			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			/* reuses the previous alu struct; only src fields change */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
1080
1081 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
1082 {
1083 struct tgsi_parse_context parse;
1084 struct {
1085 boolean enabled;
1086 int *reg;
1087 unsigned name, alternate_name;
1088 } inputs[2] = {
1089 { false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
1090
1091 { false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
1092 };
1093 int num_regs = 0;
1094 unsigned k, i;
1095
1096 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1097 return 0;
1098 }
1099
1100 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1101 while (!tgsi_parse_end_of_tokens(&parse)) {
1102 tgsi_parse_token(&parse);
1103
1104 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1105 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1106 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1107 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1108 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1109 {
1110 int interpolate, location, k;
1111
1112 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1113 location = TGSI_INTERPOLATE_LOC_CENTER;
1114 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1115 location = TGSI_INTERPOLATE_LOC_CENTER;
1116 /* Needs sample positions, currently those are always available */
1117 } else {
1118 location = TGSI_INTERPOLATE_LOC_CENTROID;
1119 }
1120
1121 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1122 k = eg_get_interpolator_index(interpolate, location);
1123 if (k >= 0)
1124 ctx->eg_interpolators[k].enabled = true;
1125 }
1126 } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
1127 struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
1128 if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1129 for (k = 0; k < ARRAY_SIZE(inputs); k++) {
1130 if (d->Semantic.Name == inputs[k].name ||
1131 d->Semantic.Name == inputs[k].alternate_name) {
1132 inputs[k].enabled = true;
1133 }
1134 }
1135 }
1136 }
1137 }
1138
1139 tgsi_parse_free(&parse);
1140
1141 if (ctx->info.reads_samplemask &&
1142 (ctx->info.uses_linear_sample || ctx->info.uses_linear_sample)) {
1143 inputs[1].enabled = true;
1144 }
1145
1146 if (ctx->bc->chip_class >= EVERGREEN) {
1147 int num_baryc = 0;
1148 /* assign gpr to each interpolator according to priority */
1149 for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
1150 if (ctx->eg_interpolators[i].enabled) {
1151 ctx->eg_interpolators[i].ij_index = num_baryc;
1152 num_baryc++;
1153 }
1154 }
1155 num_baryc = (num_baryc + 1) >> 1;
1156 gpr_offset += num_baryc;
1157 }
1158
1159 for (i = 0; i < ARRAY_SIZE(inputs); i++) {
1160 boolean enabled = inputs[i].enabled;
1161 int *reg = inputs[i].reg;
1162 unsigned name = inputs[i].name;
1163
1164 if (enabled) {
1165 int gpr = gpr_offset + num_regs++;
1166 ctx->shader->nsys_inputs++;
1167
1168 // add to inputs, allocate a gpr
1169 k = ctx->shader->ninput++;
1170 ctx->shader->input[k].name = name;
1171 ctx->shader->input[k].sid = 0;
1172 ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
1173 ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
1174 *reg = ctx->shader->input[k].gpr = gpr;
1175 }
1176 }
1177
1178 return gpr_offset + num_regs;
1179 }
1180
1181 /*
1182 * for evergreen we need to scan the shader to find the number of GPRs we need to
1183 * reserve for interpolation and system values
1184 *
1185 * we need to know if we are going to emit any sample or centroid inputs
1186 * if perspective and linear are required
1187 */
1188 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1189 {
1190 unsigned i;
1191
1192 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1193
1194 /*
1195 * Could get this information from the shader info. But right now
1196 * we interpolate all declared inputs, whereas the shader info will
1197 * only contain the bits if the inputs are actually used, so it might
1198 * not be safe...
1199 */
1200 for (i = 0; i < ctx->info.num_inputs; i++) {
1201 int k;
1202 /* skip position/face/mask/sampleid */
1203 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1204 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1205 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1206 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1207 continue;
1208
1209 k = eg_get_interpolator_index(
1210 ctx->info.input_interpolate[i],
1211 ctx->info.input_interpolate_loc[i]);
1212 if (k >= 0)
1213 ctx->eg_interpolators[k].enabled = TRUE;
1214 }
1215
1216 /* XXX PULL MODEL and LINE STIPPLE */
1217
1218 return allocate_system_value_inputs(ctx, 0);
1219 }
1220
/* Fetch a sample position from the buffer-info constant buffer into a
 * fresh temp GPR and return that GPR index (or a negative error code).
 * sample_id_sel == NULL means fetch for current sample; otherwise the
 * sample index is taken from sample_id's channel chan_sel. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	/* temp serves both as fetch index source and fetch destination */
	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		/* index with the current sample id held in the fixed point
		 * position GPR */
		assert(ctx->fixed_pt_position_gpr != -1);

		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		/* copy the caller-supplied sample id into t1.x so the vtx
		 * fetch can index off it */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 0;
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	/* GPR now holding the fetched position */
	return t1;
}
1275
/* Materialize HELPER_INVOCATION on Evergreen: first set the register to
 * ~0 in all lanes, then do a vtx fetch in VPM (valid pixel mode) that
 * writes 0 (dst_sel_x = SEL_0) — since the VPM fetch presumably only
 * executes for valid (non-helper) pixels, helper lanes keep ~0.
 * NOTE(review): the VPM-only-valid-pixels behavior is inferred from the
 * vpm flag and comments here — confirm against the Evergreen ISA docs. */
static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* preload ~0 into every lane of helper_invoc_reg.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* do a vtx fetch in VPM mode */
	struct r600_bytecode_vtx vtx;
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = 0;
	vtx.mega_fetch_count = 16; /* no idea here really... */
	vtx.dst_gpr = ctx->helper_invoc_reg;
	vtx.dst_sel_x = 4; /* SEL_0 - overwrite .x with constant 0 */
	vtx.dst_sel_y = 7; /* SEL_Y */
	vtx.dst_sel_z = 7; /* SEL_Z */
	vtx.dst_sel_w = 7; /* SEL_W */
	vtx.data_format = FMT_32;
	if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
		return r;
	/* mark the fetch clause as valid-pixel-mode */
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
1313
/* Materialize HELPER_INVOCATION on Cayman: set helper_invoc_reg.x to ~0
 * in all lanes, then overwrite it with 0 using an ALU clause issued with
 * CF_OP_ALU_VALID_PIXEL_MODE, which presumably executes only for valid
 * (non-helper) pixels — so helper lanes keep ~0.
 * Returns the helper invocation register on success, negative on error. */
static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* all lanes: helper_invoc_reg.x = ~0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* valid pixels only: helper_invoc_reg.x = 0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.sel = ctx->helper_invoc_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_0;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
	if (r)
		return r;

	return ctx->helper_invoc_reg;
}
1344
1345 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1346 {
1347 struct r600_bytecode_vtx vtx;
1348 int r, t1;
1349
1350 if (ctx->cs_block_size_loaded)
1351 return ctx->cs_block_size_reg;
1352 if (ctx->cs_grid_size_loaded)
1353 return ctx->cs_grid_size_reg;
1354
1355 t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1356 struct r600_bytecode_alu alu;
1357 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1358 alu.op = ALU_OP1_MOV;
1359 alu.src[0].sel = V_SQ_ALU_SRC_0;
1360 alu.dst.sel = t1;
1361 alu.dst.write = 1;
1362 alu.last = 1;
1363 r = r600_bytecode_add_alu(ctx->bc, &alu);
1364 if (r)
1365 return r;
1366
1367 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1368 vtx.op = FETCH_OP_VFETCH;
1369 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1370 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1371 vtx.src_gpr = t1;
1372 vtx.src_sel_x = 0;
1373
1374 vtx.mega_fetch_count = 16;
1375 vtx.dst_gpr = t1;
1376 vtx.dst_sel_x = 0;
1377 vtx.dst_sel_y = 1;
1378 vtx.dst_sel_z = 2;
1379 vtx.dst_sel_w = 7;
1380 vtx.data_format = FMT_32_32_32_32;
1381 vtx.num_format_all = 1;
1382 vtx.format_comp_all = 0;
1383 vtx.use_const_fields = 0;
1384 vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1385 vtx.endian = r600_endian_swap(32);
1386 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1387
1388 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1389 if (r)
1390 return r;
1391
1392 if (load_block)
1393 ctx->cs_block_size_loaded = true;
1394 else
1395 ctx->cs_grid_size_loaded = true;
1396 return t1;
1397 }
1398
/* Translate a TGSI source register into the driver's r600_shader_src
 * representation: copy swizzles/negate/abs, then resolve the register
 * file to an ALU source selector.  Immediates become literals (or the
 * special inline constants), system values are mapped to the dedicated
 * GPRs/channels they live in, and everything else gets the file's GPR
 * offset applied. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* if all four channels select the same immediate, try to use
		 * one of the hardware inline constants (0, 1, 0.5, ...) */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* otherwise emit the full 4-dword literal */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* NOTE: branch order matters below — INVOCATIONID appears
		 * twice, first for non-TESS_CTRL stages, then for TESS_CTRL. */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* fetched on demand; .xy are the position, .zw read as 0 */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
			r600_src->sel = 1;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* non-TESS_CTRL stages: invocation id in R1.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TESS_CTRL: invocation id in R0.z */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			/* vertex count lives in a different channel of the
			 * tess_input_info register per stage */
			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 2;
				r600_src->swizzle[1] = 2;
				r600_src->swizzle[2] = 2;
				r600_src->swizzle[3] = 2;
			} else {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 3;
				r600_src->swizzle[1] = 3;
				r600_src->swizzle[2] = 3;
				r600_src->swizzle[3] = 3;
			}
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, false);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
			r600_src->sel = load_block_grid_size(ctx, true);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
			r600_src->sel = ctx->helper_invoc_reg;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		}
	} else {
		/* regular file: apply the file's GPR base offset */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1530
1531 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1532 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1533 unsigned int dst_reg)
1534 {
1535 struct r600_bytecode_vtx vtx;
1536 unsigned int ar_reg;
1537 int r;
1538
1539 if (offset) {
1540 struct r600_bytecode_alu alu;
1541
1542 memset(&alu, 0, sizeof(alu));
1543
1544 alu.op = ALU_OP2_ADD_INT;
1545 alu.src[0].sel = ctx->bc->ar_reg;
1546 alu.src[0].chan = ar_chan;
1547
1548 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1549 alu.src[1].value = offset;
1550
1551 alu.dst.sel = dst_reg;
1552 alu.dst.chan = ar_chan;
1553 alu.dst.write = 1;
1554 alu.last = 1;
1555
1556 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1557 return r;
1558
1559 ar_reg = dst_reg;
1560 } else {
1561 ar_reg = ctx->bc->ar_reg;
1562 }
1563
1564 memset(&vtx, 0, sizeof(vtx));
1565 vtx.buffer_id = cb_idx;
1566 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1567 vtx.src_gpr = ar_reg;
1568 vtx.src_sel_x = ar_chan;
1569 vtx.mega_fetch_count = 16;
1570 vtx.dst_gpr = dst_reg;
1571 vtx.dst_sel_x = 0; /* SEL_X */
1572 vtx.dst_sel_y = 1; /* SEL_Y */
1573 vtx.dst_sel_z = 2; /* SEL_Z */
1574 vtx.dst_sel_w = 3; /* SEL_W */
1575 vtx.data_format = FMT_32_32_32_32_FLOAT;
1576 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1577 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1578 vtx.endian = r600_endian_swap(32);
1579 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1580
1581 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1582 return r;
1583
1584 return 0;
1585 }
1586
/* Fetch a (possibly indirectly addressed) GS input attribute from the
 * ESGS ring into dst_reg.xyzw.  The per-vertex ring offsets arrive in
 * R0/R1 (see comment below); indirect vertex or attribute indices are
 * resolved with extra ALU ops before the ring fetch. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	/* R0.z is skipped, so vertex 2's offset lives in R0.w */
	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		/* indirect vertex index: gather the three ring offsets into
		 * an addressable GPR array and pick one with AR */
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		if (src->DimIndirect.Index > 0) {
			/* copy the index into the AR register */
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[AR].x - the selected vertex's ring offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		/* indirect attribute index: offset += (first + AR) * 4 dwords */
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0; /* SEL_X */
	vtx.dst_sel_y = 1; /* SEL_Y */
	vtx.dst_sel_z = 2; /* SEL_Z */
	vtx.dst_sel_w = 3; /* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1703
1704 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1705 {
1706 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1707 unsigned i;
1708
1709 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1710 struct tgsi_full_src_register *src = &inst->Src[i];
1711
1712 if (src->Register.File == TGSI_FILE_INPUT) {
1713 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1714 /* primitive id is in R0.z */
1715 ctx->src[i].sel = 0;
1716 ctx->src[i].swizzle[0] = 2;
1717 }
1718 }
1719 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1720 int treg = r600_get_temp(ctx);
1721
1722 fetch_gs_input(ctx, src, treg);
1723 ctx->src[i].sel = treg;
1724 ctx->src[i].rel = 0;
1725 }
1726 }
1727 return 0;
1728 }
1729
1730
1731 /* Tessellation shaders pass outputs to the next shader using LDS.
1732 *
1733 * LS outputs = TCS(HS) inputs
1734 * TCS(HS) outputs = TES(DS) inputs
1735 *
1736 * The LDS layout is:
1737 * - TCS inputs for patch 0
1738 * - TCS inputs for patch 1
1739 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
1740 * - ...
1741 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
1742 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
1743 * - TCS outputs for patch 1
1744 * - Per-patch TCS outputs for patch 1
1745 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
1746 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1747 * - ...
1748 *
1749 * All three shaders VS(LS), TCS, TES share the same LDS space.
1750 */
/* Compute the LDS byte address of a TGSI input/output register for the
 * tessellation stages; this will return with the dw address in temp_reg.x.
 * temp_reg.x must already hold the base offset; the per-vertex stride
 * term (if Dimension) and the per-attribute 16-byte slot offset are
 * accumulated into it. */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += stride * vertex_index */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* pick the semantic tables matching the register file */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg: temp.x += 16 * rel_index */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		/* each attribute slot occupies 16 bytes */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
1853
/* Read up to four dwords from LDS into dst_reg, one per set bit in mask.
 * temp_reg.x holds the base dword address; channel i is read from
 * base + 4*i bytes.  The work is deliberately done in three separate
 * passes (compute addresses, issue all LDS_READ_RETs, then pop the
 * results from the LDS output queue) — do not interleave them. */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* start a fresh CF clause if the current one is nearly full */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	lasti = tgsi_last_instruction(mask);
	/* pass 1: per-channel addresses (channel 0 uses temp_reg.x as-is) */
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	/* pass 2: issue the LDS reads */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* pass 3: pop the returned values into dst_reg */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
1913
1914 static int fetch_mask(struct tgsi_src_register *reg)
1915 {
1916 int mask = 0;
1917 mask |= 1 << reg->SwizzleX;
1918 mask |= 1 << reg->SwizzleY;
1919 mask |= 1 << reg->SwizzleZ;
1920 mask |= 1 << reg->SwizzleW;
1921 return mask;
1922 }
1923
1924 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1925 {
1926 int r;
1927 unsigned temp_reg = r600_get_temp(ctx);
1928
1929 r = get_lds_offset0(ctx, 2, temp_reg,
1930 src->Register.Dimension ? false : true);
1931 if (r)
1932 return r;
1933
1934 /* the base address is now in temp.x */
1935 r = r600_get_byte_address(ctx, temp_reg,
1936 NULL, src, ctx->tess_output_info, 1);
1937 if (r)
1938 return r;
1939
1940 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1941 if (r)
1942 return r;
1943 return 0;
1944 }
1945
1946 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1947 {
1948 int r;
1949 unsigned temp_reg = r600_get_temp(ctx);
1950
1951 /* t.x = ips * r0.y */
1952 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1953 temp_reg, 0,
1954 ctx->tess_input_info, 0,
1955 0, 1);
1956
1957 if (r)
1958 return r;
1959
1960 /* the base address is now in temp.x */
1961 r = r600_get_byte_address(ctx, temp_reg,
1962 NULL, src, ctx->tess_input_info, 1);
1963 if (r)
1964 return r;
1965
1966 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1967 if (r)
1968 return r;
1969 return 0;
1970 }
1971
1972 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1973 {
1974 int r;
1975 unsigned temp_reg = r600_get_temp(ctx);
1976
1977 r = get_lds_offset0(ctx, 1, temp_reg,
1978 src->Register.Dimension ? false : true);
1979 if (r)
1980 return r;
1981 /* the base address is now in temp.x */
1982 r = r600_get_byte_address(ctx, temp_reg,
1983 NULL, src,
1984 ctx->tess_output_info, 1);
1985 if (r)
1986 return r;
1987
1988 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1989 if (r)
1990 return r;
1991 return 0;
1992 }
1993
1994 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1995 {
1996 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1997 unsigned i;
1998
1999 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2000 struct tgsi_full_src_register *src = &inst->Src[i];
2001
2002 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2003 int treg = r600_get_temp(ctx);
2004 fetch_tes_input(ctx, src, treg);
2005 ctx->src[i].sel = treg;
2006 ctx->src[i].rel = 0;
2007 }
2008 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2009 int treg = r600_get_temp(ctx);
2010 fetch_tcs_input(ctx, src, treg);
2011 ctx->src[i].sel = treg;
2012 ctx->src[i].rel = 0;
2013 }
2014 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2015 int treg = r600_get_temp(ctx);
2016 fetch_tcs_output(ctx, src, treg);
2017 ctx->src[i].sel = treg;
2018 ctx->src[i].rel = 0;
2019 }
2020 }
2021 return 0;
2022 }
2023
/* Translate all source operands of the current instruction and lower
 * constant-file operands: relatively-addressed constants are fetched to
 * temps via vertex fetch, and when several constants appear, all but
 * one are copied to temps so that at most a single direct kcache read
 * remains.  NOTE(review): presumably done because the hardware limits
 * how many distinct constant reads an ALU instruction may do — confirm
 * against the ISA docs. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* first pass: translate every source and count constant operands */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts down the remaining constant operands; the one processed
	 * when j == 0 (the last) is left reading the kcache directly */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* AR-relative constant: fetch it into a temp
			 * (sel - 512 converts the kcache sel back to a
			 * constant-buffer dword index) */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy all four channels of the constant to a temp */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
2078
/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
/* All but the last literal source of the instruction are MOVed into a
 * temp GPR: lowering of some ops adds literals of its own (e.g. PI for
 * trig), and the encoder can only carry a limited set of literal
 * dwords per ALU group. */
static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nliteral, r;

	/* count literal sources */
	for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			nliteral++;
		}
	}
	/* j counts down so the last literal source stays in place */
	for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
			int treg = r600_get_temp(ctx);
			/* MOV all four literal channels into the temp */
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].value = ctx->src[i].value[k];
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			j--;
		}
	}
	return 0;
}
2115
2116 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2117 {
2118 int i, r, count = ctx->shader->ninput;
2119
2120 for (i = 0; i < count; i++) {
2121 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2122 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2123 if (r)
2124 return r;
2125 }
2126 }
2127 return 0;
2128 }
2129
/* Emit stream-output (transform feedback) buffer writes for 'so'.
 *
 * stream: only outputs of this vertex stream are written; -1 writes
 *         outputs of every stream (used when only stream 0 is active).
 * Side effect: sets bits in ctx->enabled_stream_buffers_mask for each
 * buffer (and, on EG+, each stream) that gets written.
 */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
			  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		/* skip outputs that belong to a different stream */
		if (stream != -1 && stream != so->output[i].stream)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		/* choose the CF opcode: EG+ encodes stream and buffer in
		 * the opcode (STREAMx_BUFy, groups of 4 per stream); R6xx/
		 * R7xx opcodes only encode the buffer */
		if (ctx->bc->chip_class >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
2251
2252 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2253 {
2254 struct r600_bytecode_alu alu;
2255 unsigned reg;
2256
2257 if (!ctx->shader->vs_out_edgeflag)
2258 return;
2259
2260 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2261
2262 /* clamp(x, 0, 1) */
2263 memset(&alu, 0, sizeof(alu));
2264 alu.op = ALU_OP1_MOV;
2265 alu.src[0].sel = reg;
2266 alu.dst.sel = reg;
2267 alu.dst.write = 1;
2268 alu.dst.clamp = 1;
2269 alu.last = 1;
2270 r600_bytecode_add_alu(ctx->bc, &alu);
2271
2272 memset(&alu, 0, sizeof(alu));
2273 alu.op = ALU_OP1_FLT_TO_INT;
2274 alu.src[0].sel = reg;
2275 alu.dst.sel = reg;
2276 alu.dst.write = 1;
2277 alu.last = 1;
2278 r600_bytecode_add_alu(ctx->bc, &alu);
2279 }
2280
/* Build the "GS copy shader": a vertex shader that runs after the GS,
 * fetches the GS output vertices back from the GSVS ring buffer and
 * exports them as regular VS outputs (position, params, clip/cull
 * distances), emitting stream-output writes per vertex stream.
 *
 * The built shader is stored in gs->gs_copy_shader; returns the
 * result of r600_bytecode_build.
 * NOTE(review): returns 0 (success) even when the calloc fails, so the
 * caller cannot distinguish OOM — verify callers tolerate a NULL
 * gs_copy_shader.
 */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* the copy shader exports the same outputs the GS produced */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x carries the ring index in its low bits and the stream id
	 * in bits 30-31; split them apart: */
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* per-stream streamout, highest ring first: each iteration is
	 * guarded by PRED_SETE on the stream id in R0.y */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only export outputs belonging to stream 0 */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		/* position-like semantics go to the POS export slots
		 * (60-63); everything else becomes a PARAM export.
		 * swizzle 7 masks a component off. */
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* the HW requires at least one POS export - emit a dummy one */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* likewise a dummy PARAM export if none was emitted */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* the final export of each kind must be EXPORT_DONE */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* close the last predicate block */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2591
2592 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2593 {
2594 if (ind) {
2595 struct r600_bytecode_alu alu;
2596 int r;
2597
2598 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2599 alu.op = ALU_OP2_ADD_INT;
2600 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2601 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2602 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2603 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2604 alu.dst.write = 1;
2605 alu.last = 1;
2606 r = r600_bytecode_add_alu(ctx->bc, &alu);
2607 if (r)
2608 return r;
2609 }
2610 return 0;
2611 }
2612
/* Emit MEM_RING writes storing this shader's outputs to the GS ring.
 *
 * stream: which vertex stream / ring to write; -1 behaves like 0 (used
 *         when only stream 0 is active).
 * ind: true = indirect ring addressing via the per-stream export
 *      offset register; false = static offset from gs_next_vertex.
 * Side effect: increments ctx->gs_next_vertex.
 */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS - skip it */
			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		/* position is only written for stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2683
2684
2685 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2686 {
2687 int r;
2688 struct r600_bytecode_vtx vtx;
2689 int temp_val = ctx->temp_reg;
2690 /* need to store the TCS output somewhere */
2691 r = single_alu_op2(ctx, ALU_OP1_MOV,
2692 temp_val, 0,
2693 V_SQ_ALU_SRC_LITERAL, 0,
2694 0, 0);
2695 if (r)
2696 return r;
2697
2698 /* used by VS/TCS */
2699 if (ctx->tess_input_info) {
2700 /* fetch tcs input values into resv space */
2701 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2702 vtx.op = FETCH_OP_VFETCH;
2703 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2704 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2705 vtx.mega_fetch_count = 16;
2706 vtx.data_format = FMT_32_32_32_32;
2707 vtx.num_format_all = 2;
2708 vtx.format_comp_all = 1;
2709 vtx.use_const_fields = 0;
2710 vtx.endian = r600_endian_swap(32);
2711 vtx.srf_mode_all = 1;
2712 vtx.offset = 0;
2713 vtx.dst_gpr = ctx->tess_input_info;
2714 vtx.dst_sel_x = 0;
2715 vtx.dst_sel_y = 1;
2716 vtx.dst_sel_z = 2;
2717 vtx.dst_sel_w = 3;
2718 vtx.src_gpr = temp_val;
2719 vtx.src_sel_x = 0;
2720
2721 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2722 if (r)
2723 return r;
2724 }
2725
2726 /* used by TCS/TES */
2727 if (ctx->tess_output_info) {
2728 /* fetch tcs output values into resv space */
2729 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2730 vtx.op = FETCH_OP_VFETCH;
2731 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2732 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2733 vtx.mega_fetch_count = 16;
2734 vtx.data_format = FMT_32_32_32_32;
2735 vtx.num_format_all = 2;
2736 vtx.format_comp_all = 1;
2737 vtx.use_const_fields = 0;
2738 vtx.endian = r600_endian_swap(32);
2739 vtx.srf_mode_all = 1;
2740 vtx.offset = 16;
2741 vtx.dst_gpr = ctx->tess_output_info;
2742 vtx.dst_sel_x = 0;
2743 vtx.dst_sel_y = 1;
2744 vtx.dst_sel_z = 2;
2745 vtx.dst_sel_w = 3;
2746 vtx.src_gpr = temp_val;
2747 vtx.src_sel_x = 0;
2748
2749 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2750 if (r)
2751 return r;
2752 }
2753 return 0;
2754 }
2755
/* LS epilogue (VS feeding tessellation): write every vertex output to
 * LDS where the TCS stage reads it back (see fetch_tcs_input).
 *
 * Address = rel_vertex_id * vertex_dw_stride + param_slot * 16; each
 * vec4 output is stored with two 64-bit LDS_WRITE_REL ops (xy, zw).
 */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = base + param * 16: byte address of this output
		 * (skipped for param 0, which writes at the base itself) */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second (zw) half of the vec4 */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* two 8-byte writes: j==0 stores .xy, j==1 stores .zw */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2821
/* Store the current instruction's TCS output destination to LDS.
 *
 * No-op unless Dst[0] is TGSI_FILE_OUTPUT.  Computes the LDS byte
 * address of the destination, then writes the channels enabled in the
 * write mask - adjacent xy / zw pairs with one 64-bit LDS_WRITE_REL,
 * remaining single channels with 32-bit LDS_WRITE.
 */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	/* base LDS offset; per-patch outputs (no Dimension index) use the
	 * patch-constant area */
	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	lasti = tgsi_last_instruction(write_mask);
	/* precompute temp.i = temp.x + 4*i (byte address of channel i);
	 * channel 0 reuses temp.x directly */
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* a fully-written xy or zw pair: one 64-bit write */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;	/* second channel of the pair is done too */
			continue;
		}
		/* single-channel 32-bit write */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2905
2906 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2907 int output_idx, int nc)
2908 {
2909 int param;
2910 unsigned temp_reg = r600_get_temp(ctx);
2911 unsigned name = ctx->shader->output[output_idx].name;
2912 int dreg = ctx->shader->output[output_idx].gpr;
2913 int r;
2914
2915 param = r600_get_lds_unique_index(name, 0);
2916 r = get_lds_offset0(ctx, 1, temp_reg, true);
2917 if (r)
2918 return r;
2919
2920 if (param) {
2921 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2922 temp_reg, 0,
2923 temp_reg, 0,
2924 V_SQ_ALU_SRC_LITERAL, param * 16);
2925 if (r)
2926 return r;
2927 }
2928
2929 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
2930 return 0;
2931 }
2932
/* TCS epilogue: read TESSINNER/TESSOUTER back from LDS and write them
 * to the tess-factor buffer with GDS TF_WRITE ops.
 *
 * The whole emission is wrapped in a predicate/JUMP/POP block so only
 * one invocation per patch performs the writes.
 */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z, 0 (chan 2; the literal 0 comes from the
	 * memset - NOTE(review): older comment claimed R0.x, but the code
	 * tests chan 2) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	/* one temp per vec2 of (address, value) pairs to be written */
	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	/* locate the factor outputs in the output array */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* build (address, value) pairs: treg[i/2].x/.z = TF buffer byte
	 * address, treg[i/2].y/.w = the factor value */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* for isolines, components 0 and 1 are swapped */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* one GDS TF_WRITE per factor component */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);
		gds.src_sel_y = 1 + (2 * (i % 2));
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
3076
/*
 * We have to work out the thread ID for load and atomic
 * operations, which store the returned value to an index
 * in an intermediate buffer.
 * The index is calculated by taking the thread id,
 * calculated from the MBCNT instructions.
 * Then the shader engine ID is multiplied by 256,
 * and the wave id is added.
 * Then the result is multiplied by 64 and thread id is
 * added.
 */
static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* already computed for this shader - nothing to do */
	if (ctx->thread_id_gpr_loaded)
		return 0;

	/* temp.x = thread id within the wave (MBCNT over a full mask) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = high half of the MBCNT computation */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32HI_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = SE_ID * 256 + HW_WAVE_ID */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD_UINT24;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 2;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 256;
	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
	alu.dst.write = 1;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* thread_id_gpr.y = temp.z * 64 + temp.x (global thread index) */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   ctx->thread_id_gpr, 1,
			   ctx->temp_reg, 2,
			   V_SQ_ALU_SRC_LITERAL, 0x40,
			   ctx->temp_reg, 0);
	if (r)
		return r;
	ctx->thread_id_gpr_loaded = true;
	return 0;
}
3143
3144 static int r600_shader_from_tgsi(struct r600_context *rctx,
3145 struct r600_pipe_shader *pipeshader,
3146 union r600_shader_key key)
3147 {
3148 struct r600_screen *rscreen = rctx->screen;
3149 struct r600_shader *shader = &pipeshader->shader;
3150 struct tgsi_token *tokens = pipeshader->selector->tokens;
3151 struct pipe_stream_output_info so = pipeshader->selector->so;
3152 struct tgsi_full_immediate *immediate;
3153 struct r600_shader_ctx ctx;
3154 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3155 unsigned output_done, noutput;
3156 unsigned opcode;
3157 int j, k, r = 0;
3158 unsigned i;
3159 int next_param_base = 0, next_clip_base;
3160 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3161 bool indirect_gprs;
3162 bool ring_outputs = false;
3163 bool lds_outputs = false;
3164 bool lds_inputs = false;
3165 bool pos_emitted = false;
3166
3167 ctx.bc = &shader->bc;
3168 ctx.shader = shader;
3169
3170 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3171 rscreen->has_compressed_msaa_texturing);
3172 ctx.tokens = tokens;
3173 tgsi_scan_shader(tokens, &ctx.info);
3174 shader->indirect_files = ctx.info.indirect_files;
3175
3176 shader->uses_helper_invocation = false;
3177 shader->uses_doubles = ctx.info.uses_doubles;
3178 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3179 shader->nsys_inputs = 0;
3180
3181 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3182 ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3183 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3184 tgsi_parse_init(&ctx.parse, tokens);
3185 ctx.type = ctx.info.processor;
3186 shader->processor_type = ctx.type;
3187 ctx.bc->type = shader->processor_type;
3188
3189 switch (ctx.type) {
3190 case PIPE_SHADER_VERTEX:
3191 shader->vs_as_gs_a = key.vs.as_gs_a;
3192 shader->vs_as_es = key.vs.as_es;
3193 shader->vs_as_ls = key.vs.as_ls;
3194 shader->atomic_base = key.vs.first_atomic_counter;
3195 if (shader->vs_as_es)
3196 ring_outputs = true;
3197 if (shader->vs_as_ls)
3198 lds_outputs = true;
3199 break;
3200 case PIPE_SHADER_GEOMETRY:
3201 ring_outputs = true;
3202 shader->atomic_base = key.gs.first_atomic_counter;
3203 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3204 break;
3205 case PIPE_SHADER_TESS_CTRL:
3206 shader->tcs_prim_mode = key.tcs.prim_mode;
3207 shader->atomic_base = key.tcs.first_atomic_counter;
3208 lds_outputs = true;
3209 lds_inputs = true;
3210 break;
3211 case PIPE_SHADER_TESS_EVAL:
3212 shader->tes_as_es = key.tes.as_es;
3213 shader->atomic_base = key.tes.first_atomic_counter;
3214 lds_inputs = true;
3215 if (shader->tes_as_es)
3216 ring_outputs = true;
3217 break;
3218 case PIPE_SHADER_FRAGMENT:
3219 shader->two_side = key.ps.color_two_side;
3220 shader->atomic_base = key.ps.first_atomic_counter;
3221 shader->rat_base = key.ps.nr_cbufs;
3222 shader->image_size_const_offset = key.ps.image_size_const_offset;
3223 break;
3224 case PIPE_SHADER_COMPUTE:
3225 shader->rat_base = 0;
3226 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3227 break;
3228 default:
3229 break;
3230 }
3231
3232 if (shader->vs_as_es || shader->tes_as_es) {
3233 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3234 } else {
3235 ctx.gs_for_vs = NULL;
3236 }
3237
3238 ctx.next_ring_offset = 0;
3239 ctx.gs_out_ring_offset = 0;
3240 ctx.gs_next_vertex = 0;
3241 ctx.gs_stream_output_info = &so;
3242
3243 ctx.face_gpr = -1;
3244 ctx.fixed_pt_position_gpr = -1;
3245 ctx.fragcoord_input = -1;
3246 ctx.colors_used = 0;
3247 ctx.clip_vertex_write = 0;
3248 ctx.thread_id_gpr_loaded = false;
3249
3250 ctx.helper_invoc_reg = -1;
3251 ctx.cs_block_size_reg = -1;
3252 ctx.cs_grid_size_reg = -1;
3253 ctx.cs_block_size_loaded = false;
3254 ctx.cs_grid_size_loaded = false;
3255
3256 shader->nr_ps_color_exports = 0;
3257 shader->nr_ps_max_color_exports = 0;
3258
3259
3260 /* register allocations */
3261 /* Values [0,127] correspond to GPR[0..127].
3262 * Values [128,159] correspond to constant buffer bank 0
3263 * Values [160,191] correspond to constant buffer bank 1
3264 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3265 * Values [256,287] correspond to constant buffer bank 2 (EG)
3266 * Values [288,319] correspond to constant buffer bank 3 (EG)
3267 * Other special values are shown in the list below.
3268 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3269 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3270 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3271 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3272 * 248 SQ_ALU_SRC_0: special constant 0.0.
3273 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3274 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3275 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3276 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3277 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3278 * 254 SQ_ALU_SRC_PV: previous vector result.
3279 * 255 SQ_ALU_SRC_PS: previous scalar result.
3280 */
3281 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3282 ctx.file_offset[i] = 0;
3283 }
3284
3285 if (ctx.type == PIPE_SHADER_VERTEX) {
3286
3287 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3288 if (ctx.info.num_inputs)
3289 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3290 }
3291 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3292 if (ctx.bc->chip_class >= EVERGREEN)
3293 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3294 else
3295 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3296
3297 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3298 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3299 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3300 shader->uses_helper_invocation = true;
3301 }
3302 }
3303 }
3304 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3305 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
3306 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3307 }
3308 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3309 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3310 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3311 bool add_tesscoord = false, add_tess_inout = false;
3312 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3313 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3314 /* if we have tesscoord save one reg */
3315 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3316 add_tesscoord = true;
3317 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3318 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3319 add_tess_inout = true;
3320 }
3321 if (add_tesscoord || add_tess_inout)
3322 ctx.file_offset[TGSI_FILE_INPUT]++;
3323 if (add_tess_inout)
3324 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3325 }
3326 if (ctx.type == PIPE_SHADER_COMPUTE) {
3327 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3328 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3329 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3330 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3331 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3332 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3333 }
3334 }
3335
3336 ctx.file_offset[TGSI_FILE_OUTPUT] =
3337 ctx.file_offset[TGSI_FILE_INPUT] +
3338 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3339 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3340 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3341
3342 /* Outside the GPR range. This will be translated to one of the
3343 * kcache banks later. */
3344 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3345
3346 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3347
3348 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3349 ctx.info.file_max[TGSI_FILE_TEMPORARY];
3350 ctx.bc->ar_reg = ++regno;
3351 ctx.bc->index_reg[0] = ++regno;
3352 ctx.bc->index_reg[1] = ++regno;
3353
3354 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3355 ctx.tess_input_info = ++regno;
3356 ctx.tess_output_info = ++regno;
3357 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3358 ctx.tess_input_info = 0;
3359 ctx.tess_output_info = ++regno;
3360 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3361 ctx.gs_export_gpr_tregs[0] = ++regno;
3362 ctx.gs_export_gpr_tregs[1] = ++regno;
3363 ctx.gs_export_gpr_tregs[2] = ++regno;
3364 ctx.gs_export_gpr_tregs[3] = ++regno;
3365 if (ctx.shader->gs_tri_strip_adj_fix) {
3366 ctx.gs_rotated_input[0] = ++regno;
3367 ctx.gs_rotated_input[1] = ++regno;
3368 } else {
3369 ctx.gs_rotated_input[0] = 0;
3370 ctx.gs_rotated_input[1] = 1;
3371 }
3372 }
3373
3374 if (shader->uses_images) {
3375 ctx.thread_id_gpr = ++regno;
3376 ctx.thread_id_gpr_loaded = false;
3377 }
3378 ctx.temp_reg = ++regno;
3379
3380 shader->max_arrays = 0;
3381 shader->num_arrays = 0;
3382 if (indirect_gprs) {
3383
3384 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3385 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3386 ctx.file_offset[TGSI_FILE_OUTPUT] -
3387 ctx.file_offset[TGSI_FILE_INPUT],
3388 0x0F);
3389 }
3390 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3391 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3392 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3393 ctx.file_offset[TGSI_FILE_OUTPUT],
3394 0x0F);
3395 }
3396 }
3397
3398 ctx.nliterals = 0;
3399 ctx.literals = NULL;
3400 ctx.max_driver_temp_used = 0;
3401
3402 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3403 ctx.info.colors_written == 1;
3404 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3405 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3406
3407 if (ctx.type == PIPE_SHADER_VERTEX ||
3408 ctx.type == PIPE_SHADER_GEOMETRY ||
3409 ctx.type == PIPE_SHADER_TESS_EVAL) {
3410 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3411 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3412 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3413 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3414 }
3415
3416 if (shader->vs_as_gs_a)
3417 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3418
3419 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3420 r600_fetch_tess_io_info(&ctx);
3421
3422 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3423 tgsi_parse_token(&ctx.parse);
3424 switch (ctx.parse.FullToken.Token.Type) {
3425 case TGSI_TOKEN_TYPE_IMMEDIATE:
3426 immediate = &ctx.parse.FullToken.FullImmediate;
3427 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3428 if(ctx.literals == NULL) {
3429 r = -ENOMEM;
3430 goto out_err;
3431 }
3432 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3433 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3434 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3435 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3436 ctx.nliterals++;
3437 break;
3438 case TGSI_TOKEN_TYPE_DECLARATION:
3439 r = tgsi_declaration(&ctx);
3440 if (r)
3441 goto out_err;
3442 break;
3443 case TGSI_TOKEN_TYPE_INSTRUCTION:
3444 case TGSI_TOKEN_TYPE_PROPERTY:
3445 break;
3446 default:
3447 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3448 r = -EINVAL;
3449 goto out_err;
3450 }
3451 }
3452
3453 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3454 shader->ring_item_sizes[1] = 0;
3455 shader->ring_item_sizes[2] = 0;
3456 shader->ring_item_sizes[3] = 0;
3457
3458 /* Process two side if needed */
3459 if (shader->two_side && ctx.colors_used) {
3460 int i, count = ctx.shader->ninput;
3461 unsigned next_lds_loc = ctx.shader->nlds;
3462
3463 /* additional inputs will be allocated right after the existing inputs,
3464 * we won't need them after the color selection, so we don't need to
3465 * reserve these gprs for the rest of the shader code and to adjust
3466 * output offsets etc. */
3467 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3468 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3469
3470 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3471 if (ctx.face_gpr == -1) {
3472 i = ctx.shader->ninput++;
3473 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3474 ctx.shader->input[i].spi_sid = 0;
3475 ctx.shader->input[i].gpr = gpr++;
3476 ctx.face_gpr = ctx.shader->input[i].gpr;
3477 }
3478
3479 for (i = 0; i < count; i++) {
3480 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3481 int ni = ctx.shader->ninput++;
3482 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3483 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3484 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3485 ctx.shader->input[ni].gpr = gpr++;
3486 // TGSI to LLVM needs to know the lds position of inputs.
3487 // Non LLVM path computes it later (in process_twoside_color)
3488 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3489 ctx.shader->input[i].back_color_input = ni;
3490 if (ctx.bc->chip_class >= EVERGREEN) {
3491 if ((r = evergreen_interp_input(&ctx, ni)))
3492 return r;
3493 }
3494 }
3495 }
3496 }
3497
3498 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3499 shader->nr_ps_max_color_exports = 8;
3500
3501 if (ctx.shader->uses_helper_invocation) {
3502 if (ctx.bc->chip_class == CAYMAN)
3503 r = cm_load_helper_invocation(&ctx);
3504 else
3505 r = eg_load_helper_invocation(&ctx);
3506 if (r)
3507 return r;
3508 }
3509
3510 /*
3511 * XXX this relies on fixed_pt_position_gpr only being present when
3512 * this shader should be executed per sample. Should be the case for now...
3513 */
3514 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3515 /*
3516 * Fix up sample mask. The hw always gives us coverage mask for
3517 * the pixel. However, for per-sample shading, we need the
3518 * coverage for the shader invocation only.
3519 * Also, with disabled msaa, only the first bit should be set
3520 * (luckily the same fixup works for both problems).
3521 * For now, we can only do it if we know this shader is always
3522 * executed per sample (due to usage of bits in the shader
3523 * forcing per-sample execution).
3524 * If the fb is not multisampled, we'd do unnecessary work but
3525 * it should still be correct.
3526 * It will however do nothing for sample shading according
3527 * to MinSampleShading.
3528 */
3529 struct r600_bytecode_alu alu;
3530 int tmp = r600_get_temp(&ctx);
3531 assert(ctx.face_gpr != -1);
3532 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3533
3534 alu.op = ALU_OP2_LSHL_INT;
3535 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3536 alu.src[0].value = 0x1;
3537 alu.src[1].sel = ctx.fixed_pt_position_gpr;
3538 alu.src[1].chan = 3;
3539 alu.dst.sel = tmp;
3540 alu.dst.chan = 0;
3541 alu.dst.write = 1;
3542 alu.last = 1;
3543 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3544 return r;
3545
3546 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3547 alu.op = ALU_OP2_AND_INT;
3548 alu.src[0].sel = tmp;
3549 alu.src[1].sel = ctx.face_gpr;
3550 alu.src[1].chan = 2;
3551 alu.dst.sel = ctx.face_gpr;
3552 alu.dst.chan = 2;
3553 alu.dst.write = 1;
3554 alu.last = 1;
3555 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3556 return r;
3557 }
3558
3559 if (ctx.fragcoord_input >= 0) {
3560 if (ctx.bc->chip_class == CAYMAN) {
3561 for (j = 0 ; j < 4; j++) {
3562 struct r600_bytecode_alu alu;
3563 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3564 alu.op = ALU_OP1_RECIP_IEEE;
3565 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3566 alu.src[0].chan = 3;
3567
3568 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3569 alu.dst.chan = j;
3570 alu.dst.write = (j == 3);
3571 alu.last = 1;
3572 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3573 return r;
3574 }
3575 } else {
3576 struct r600_bytecode_alu alu;
3577 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3578 alu.op = ALU_OP1_RECIP_IEEE;
3579 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3580 alu.src[0].chan = 3;
3581
3582 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3583 alu.dst.chan = 3;
3584 alu.dst.write = 1;
3585 alu.last = 1;
3586 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3587 return r;
3588 }
3589 }
3590
3591 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3592 struct r600_bytecode_alu alu;
3593 int r;
3594
3595 /* GS thread with no output workaround - emit a cut at start of GS */
3596 if (ctx.bc->chip_class == R600)
3597 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3598
3599 for (j = 0; j < 4; j++) {
3600 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3601 alu.op = ALU_OP1_MOV;
3602 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3603 alu.src[0].value = 0;
3604 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3605 alu.dst.write = 1;
3606 alu.last = 1;
3607 r = r600_bytecode_add_alu(ctx.bc, &alu);
3608 if (r)
3609 return r;
3610 }
3611
3612 if (ctx.shader->gs_tri_strip_adj_fix) {
3613 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3614 ctx.gs_rotated_input[0], 2,
3615 0, 2,
3616 V_SQ_ALU_SRC_LITERAL, 1);
3617 if (r)
3618 return r;
3619
3620 for (i = 0; i < 6; i++) {
3621 int rotated = (i + 4) % 6;
3622 int offset_reg = i / 3;
3623 int offset_chan = i % 3;
3624 int rotated_offset_reg = rotated / 3;
3625 int rotated_offset_chan = rotated % 3;
3626
3627 if (offset_reg == 0 && offset_chan == 2)
3628 offset_chan = 3;
3629 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3630 rotated_offset_chan = 3;
3631
3632 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3633 ctx.gs_rotated_input[offset_reg], offset_chan,
3634 ctx.gs_rotated_input[0], 2,
3635 offset_reg, offset_chan,
3636 rotated_offset_reg, rotated_offset_chan);
3637 if (r)
3638 return r;
3639 }
3640 }
3641 }
3642
3643 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3644 r600_fetch_tess_io_info(&ctx);
3645
3646 if (shader->two_side && ctx.colors_used) {
3647 if ((r = process_twoside_color_inputs(&ctx)))
3648 return r;
3649 }
3650
3651 tgsi_parse_init(&ctx.parse, tokens);
3652 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3653 tgsi_parse_token(&ctx.parse);
3654 switch (ctx.parse.FullToken.Token.Type) {
3655 case TGSI_TOKEN_TYPE_INSTRUCTION:
3656 r = tgsi_is_supported(&ctx);
3657 if (r)
3658 goto out_err;
3659 ctx.max_driver_temp_used = 0;
3660 /* reserve first tmp for everyone */
3661 r600_get_temp(&ctx);
3662
3663 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3664 if ((r = tgsi_split_constant(&ctx)))
3665 goto out_err;
3666 if ((r = tgsi_split_literal_constant(&ctx)))
3667 goto out_err;
3668 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3669 if ((r = tgsi_split_gs_inputs(&ctx)))
3670 goto out_err;
3671 } else if (lds_inputs) {
3672 if ((r = tgsi_split_lds_inputs(&ctx)))
3673 goto out_err;
3674 }
3675 if (ctx.bc->chip_class == CAYMAN)
3676 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3677 else if (ctx.bc->chip_class >= EVERGREEN)
3678 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3679 else
3680 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3681 r = ctx.inst_info->process(&ctx);
3682 if (r)
3683 goto out_err;
3684
3685 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3686 r = r600_store_tcs_output(&ctx);
3687 if (r)
3688 goto out_err;
3689 }
3690 break;
3691 default:
3692 break;
3693 }
3694 }
3695
3696 /* Reset the temporary register counter. */
3697 ctx.max_driver_temp_used = 0;
3698
3699 noutput = shader->noutput;
3700
3701 if (!ring_outputs && ctx.clip_vertex_write) {
3702 unsigned clipdist_temp[2];
3703
3704 clipdist_temp[0] = r600_get_temp(&ctx);
3705 clipdist_temp[1] = r600_get_temp(&ctx);
3706
3707 /* need to convert a clipvertex write into clipdistance writes and not export
3708 the clip vertex anymore */
3709
3710 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3711 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3712 shader->output[noutput].gpr = clipdist_temp[0];
3713 noutput++;
3714 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3715 shader->output[noutput].gpr = clipdist_temp[1];
3716 noutput++;
3717
3718 /* reset spi_sid for clipvertex output to avoid confusing spi */
3719 shader->output[ctx.cv_output].spi_sid = 0;
3720
3721 shader->clip_dist_write = 0xFF;
3722 shader->cc_dist_mask = 0xFF;
3723
3724 for (i = 0; i < 8; i++) {
3725 int oreg = i >> 2;
3726 int ochan = i & 3;
3727
3728 for (j = 0; j < 4; j++) {
3729 struct r600_bytecode_alu alu;
3730 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3731 alu.op = ALU_OP2_DOT4;
3732 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3733 alu.src[0].chan = j;
3734
3735 alu.src[1].sel = 512 + i;
3736 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3737 alu.src[1].chan = j;
3738
3739 alu.dst.sel = clipdist_temp[oreg];
3740 alu.dst.chan = j;
3741 alu.dst.write = (j == ochan);
3742 if (j == 3)
3743 alu.last = 1;
3744 r = r600_bytecode_add_alu(ctx.bc, &alu);
3745 if (r)
3746 return r;
3747 }
3748 }
3749 }
3750
3751 /* Add stream outputs. */
3752 if (so.num_outputs) {
3753 bool emit = false;
3754 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3755 emit = true;
3756 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3757 emit = true;
3758 if (emit)
3759 emit_streamout(&ctx, &so, -1, NULL);
3760 }
3761 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3762 convert_edgeflag_to_int(&ctx);
3763
3764 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3765 r600_emit_tess_factor(&ctx);
3766
3767 if (lds_outputs) {
3768 if (ctx.type == PIPE_SHADER_VERTEX) {
3769 if (ctx.shader->noutput)
3770 emit_lds_vs_writes(&ctx);
3771 }
3772 } else if (ring_outputs) {
3773 if (shader->vs_as_es || shader->tes_as_es) {
3774 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3775 ctx.gs_export_gpr_tregs[1] = -1;
3776 ctx.gs_export_gpr_tregs[2] = -1;
3777 ctx.gs_export_gpr_tregs[3] = -1;
3778
3779 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3780 }
3781 } else {
3782 /* Export output */
3783 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3784
3785 for (i = 0, j = 0; i < noutput; i++, j++) {
3786 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3787 output[j].gpr = shader->output[i].gpr;
3788 output[j].elem_size = 3;
3789 output[j].swizzle_x = 0;
3790 output[j].swizzle_y = 1;
3791 output[j].swizzle_z = 2;
3792 output[j].swizzle_w = 3;
3793 output[j].burst_count = 1;
3794 output[j].type = 0xffffffff;
3795 output[j].op = CF_OP_EXPORT;
3796 switch (ctx.type) {
3797 case PIPE_SHADER_VERTEX:
3798 case PIPE_SHADER_TESS_EVAL:
3799 switch (shader->output[i].name) {
3800 case TGSI_SEMANTIC_POSITION:
3801 output[j].array_base = 60;
3802 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3803 pos_emitted = true;
3804 break;
3805
3806 case TGSI_SEMANTIC_PSIZE:
3807 output[j].array_base = 61;
3808 output[j].swizzle_y = 7;
3809 output[j].swizzle_z = 7;
3810 output[j].swizzle_w = 7;
3811 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3812 pos_emitted = true;
3813 break;
3814 case TGSI_SEMANTIC_EDGEFLAG:
3815 output[j].array_base = 61;
3816 output[j].swizzle_x = 7;
3817 output[j].swizzle_y = 0;
3818 output[j].swizzle_z = 7;
3819 output[j].swizzle_w = 7;
3820 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3821 pos_emitted = true;
3822 break;
3823 case TGSI_SEMANTIC_LAYER:
3824 /* spi_sid is 0 for outputs that are
3825 * not consumed by PS */
3826 if (shader->output[i].spi_sid) {
3827 output[j].array_base = next_param_base++;
3828 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3829 j++;
3830 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3831 }
3832 output[j].array_base = 61;
3833 output[j].swizzle_x = 7;
3834 output[j].swizzle_y = 7;
3835 output[j].swizzle_z = 0;
3836 output[j].swizzle_w = 7;
3837 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3838 pos_emitted = true;
3839 break;
3840 case TGSI_SEMANTIC_VIEWPORT_INDEX:
3841 /* spi_sid is 0 for outputs that are
3842 * not consumed by PS */
3843 if (shader->output[i].spi_sid) {
3844 output[j].array_base = next_param_base++;
3845 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3846 j++;
3847 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3848 }
3849 output[j].array_base = 61;
3850 output[j].swizzle_x = 7;
3851 output[j].swizzle_y = 7;
3852 output[j].swizzle_z = 7;
3853 output[j].swizzle_w = 0;
3854 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3855 pos_emitted = true;
3856 break;
3857 case TGSI_SEMANTIC_CLIPVERTEX:
3858 j--;
3859 break;
3860 case TGSI_SEMANTIC_CLIPDIST:
3861 output[j].array_base = next_clip_base++;
3862 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3863 pos_emitted = true;
3864 /* spi_sid is 0 for clipdistance outputs that were generated
3865 * for clipvertex - we don't need to pass them to PS */
3866 if (shader->output[i].spi_sid) {
3867 j++;
3868 /* duplicate it as PARAM to pass to the pixel shader */
3869 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3870 output[j].array_base = next_param_base++;
3871 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3872 }
3873 break;
3874 case TGSI_SEMANTIC_FOG:
3875 output[j].swizzle_y = 4; /* 0 */
3876 output[j].swizzle_z = 4; /* 0 */
3877 output[j].swizzle_w = 5; /* 1 */
3878 break;
3879 case TGSI_SEMANTIC_PRIMID:
3880 output[j].swizzle_x = 2;
3881 output[j].swizzle_y = 4; /* 0 */
3882 output[j].swizzle_z = 4; /* 0 */
3883 output[j].swizzle_w = 4; /* 0 */
3884 break;
3885 }
3886
3887 break;
3888 case PIPE_SHADER_FRAGMENT:
3889 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3890 /* never export more colors than the number of CBs */
3891 if (shader->output[i].sid >= max_color_exports) {
3892 /* skip export */
3893 j--;
3894 continue;
3895 }
3896 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3897 output[j].array_base = shader->output[i].sid;
3898 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3899 shader->nr_ps_color_exports++;
3900 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
3901
3902 /* If the i-th target format is set, all previous target formats must
3903 * be non-zero to avoid hangs. - from radeonsi, seems to apply to eg as well.
3904 */
3905 if (shader->output[i].sid > 0)
3906 for (unsigned x = 0; x < shader->output[i].sid; x++)
3907 shader->ps_color_export_mask |= (1 << (x*4));
3908
3909 if (shader->output[i].sid > shader->ps_export_highest)
3910 shader->ps_export_highest = shader->output[i].sid;
3911 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3912 for (k = 1; k < max_color_exports; k++) {
3913 j++;
3914 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3915 output[j].gpr = shader->output[i].gpr;
3916 output[j].elem_size = 3;
3917 output[j].swizzle_x = 0;
3918 output[j].swizzle_y = 1;
3919 output[j].swizzle_z = 2;
3920 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3921 output[j].burst_count = 1;
3922 output[j].array_base = k;
3923 output[j].op = CF_OP_EXPORT;
3924 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3925 shader->nr_ps_color_exports++;
3926 shader->ps_color_export_mask |= (0xf << (j * 4));
3927 }
3928 }
3929 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3930 output[j].array_base = 61;
3931 output[j].swizzle_x = 2;
3932 output[j].swizzle_y = 7;
3933 output[j].swizzle_z = output[j].swizzle_w = 7;
3934 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3935 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3936 output[j].array_base = 61;
3937 output[j].swizzle_x = 7;
3938 output[j].swizzle_y = 1;
3939 output[j].swizzle_z = output[j].swizzle_w = 7;
3940 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3941 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3942 output[j].array_base = 61;
3943 output[j].swizzle_x = 7;
3944 output[j].swizzle_y = 7;
3945 output[j].swizzle_z = 0;
3946 output[j].swizzle_w = 7;
3947 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3948 } else {
3949 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3950 r = -EINVAL;
3951 goto out_err;
3952 }
3953 break;
3954 case PIPE_SHADER_TESS_CTRL:
3955 break;
3956 default:
3957 R600_ERR("unsupported processor type %d\n", ctx.type);
3958 r = -EINVAL;
3959 goto out_err;
3960 }
3961
3962 if (output[j].type == 0xffffffff) {
3963 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3964 output[j].array_base = next_param_base++;
3965 }
3966 }
3967
3968 /* add fake position export */
3969 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
3970 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3971 output[j].gpr = 0;
3972 output[j].elem_size = 3;
3973 output[j].swizzle_x = 7;
3974 output[j].swizzle_y = 7;
3975 output[j].swizzle_z = 7;
3976 output[j].swizzle_w = 7;
3977 output[j].burst_count = 1;
3978 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3979 output[j].array_base = 60;
3980 output[j].op = CF_OP_EXPORT;
3981 j++;
3982 }
3983
3984 /* add fake param output for vertex shader if no param is exported */
3985 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
3986 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3987 output[j].gpr = 0;
3988 output[j].elem_size = 3;
3989 output[j].swizzle_x = 7;
3990 output[j].swizzle_y = 7;
3991 output[j].swizzle_z = 7;
3992 output[j].swizzle_w = 7;
3993 output[j].burst_count = 1;
3994 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3995 output[j].array_base = 0;
3996 output[j].op = CF_OP_EXPORT;
3997 j++;
3998 }
3999
4000 /* add fake pixel export */
4001 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4002 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4003 output[j].gpr = 0;
4004 output[j].elem_size = 3;
4005 output[j].swizzle_x = 7;
4006 output[j].swizzle_y = 7;
4007 output[j].swizzle_z = 7;
4008 output[j].swizzle_w = 7;
4009 output[j].burst_count = 1;
4010 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4011 output[j].array_base = 0;
4012 output[j].op = CF_OP_EXPORT;
4013 j++;
4014 shader->nr_ps_color_exports++;
4015 shader->ps_color_export_mask = 0xf;
4016 }
4017
4018 noutput = j;
4019
4020 /* set export done on last export of each type */
4021 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4022 if (!(output_done & (1 << output[k].type))) {
4023 output_done |= (1 << output[k].type);
4024 output[k].op = CF_OP_EXPORT_DONE;
4025 }
4026 }
4027 /* add output to bytecode */
4028 for (i = 0; i < noutput; i++) {
4029 r = r600_bytecode_add_output(ctx.bc, &output[i]);
4030 if (r)
4031 goto out_err;
4032 }
4033 }
4034
4035 /* add program end */
4036 if (ctx.bc->chip_class == CAYMAN)
4037 cm_bytecode_add_cf_end(ctx.bc);
4038 else {
4039 const struct cf_op_info *last = NULL;
4040
4041 if (ctx.bc->cf_last)
4042 last = r600_isa_cf(ctx.bc->cf_last->op);
4043
4044 /* alu clause instructions don't have EOP bit, so add NOP */
4045 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4046 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4047
4048 ctx.bc->cf_last->end_of_program = 1;
4049 }
4050
4051 /* check GPR limit - we have 124 = 128 - 4
4052 * (4 are reserved as alu clause temporary registers) */
4053 if (ctx.bc->ngpr > 124) {
4054 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4055 r = -ENOMEM;
4056 goto out_err;
4057 }
4058
4059 if (ctx.type == PIPE_SHADER_GEOMETRY) {
4060 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4061 return r;
4062 }
4063
4064 free(ctx.literals);
4065 tgsi_parse_free(&ctx.parse);
4066 return 0;
4067 out_err:
4068 free(ctx.literals);
4069 tgsi_parse_free(&ctx.parse);
4070 return r;
4071 }
4072
4073 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4074 {
4075 const unsigned tgsi_opcode =
4076 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4077 R600_ERR("%s tgsi opcode unsupported\n",
4078 tgsi_get_opcode_name(tgsi_opcode));
4079 return -EINVAL;
4080 }
4081
/* TGSI END needs no bytecode here; the end-of-program marker is emitted
 * by the translation driver after the token stream is consumed. */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}
4086
4087 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4088 const struct r600_shader_src *shader_src,
4089 unsigned chan)
4090 {
4091 bc_src->sel = shader_src->sel;
4092 bc_src->chan = shader_src->swizzle[chan];
4093 bc_src->neg = shader_src->neg;
4094 bc_src->abs = shader_src->abs;
4095 bc_src->rel = shader_src->rel;
4096 bc_src->value = shader_src->value[bc_src->chan];
4097 bc_src->kc_bank = shader_src->kc_bank;
4098 bc_src->kc_rel = shader_src->kc_rel;
4099 }
4100
/* Force the absolute-value modifier on a source operand.  neg is cleared
 * as well so the operand reads exactly as |src|. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}
4106
4107 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
4108 {
4109 bc_src->neg = !bc_src->neg;
4110 }
4111
/* Translate a TGSI destination register into an r600 ALU destination:
 * apply the per-register-file base offset, the channel, the instruction's
 * saturate flag, and relative addressing. */
static void tgsi_dst(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_dst_register *tgsi_dst,
		     unsigned swizzle,
		     struct r600_bytecode_alu_dst *r600_dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	r600_dst->sel = tgsi_dst->Register.Index;
	r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
	r600_dst->chan = swizzle;
	r600_dst->write = 1;
	if (inst->Instruction.Saturate) {
		r600_dst->clamp = 1;
	}
	if (ctx->type == PIPE_SHADER_TESS_CTRL) {
		if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
			/* NOTE(review): TCS outputs are handled elsewhere —
			 * the relative-addressing setup below is deliberately
			 * skipped for them.  Confirm against the TCS store path. */
			return;
		}
	}
	if (tgsi_dst->Register.Indirect)
		r600_dst->rel = V_SQ_REL_RELATIVE;

}
4135
/* Emit a two-source 64-bit (double) ALU operation.
 *
 * A double occupies two consecutive 32-bit channels, so for single-result
 * ops the TGSI writemask is first widened to a full channel pair
 * (X/Y -> 0x3, Z/W -> 0xc), steered by the X swizzle of src0.
 *
 * singledest: the op produces one double rather than one per pair;
 * swap:       exchange src0/src1 (for reversed operand order);
 * dest_temp:  when non-zero, write the result to this temp register
 *             instead of the TGSI destination;
 * op_override: when non-zero, use this ALU opcode instead of the one
 *             mapped from the TGSI opcode.
 *
 * use_tmp (1 or 3) means the result lands in a different channel pair
 * than the destination expects; it is then staged in ctx->temp_reg and
 * copied out with MOVs, with (use_tmp - 1) as the source channel.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;
	int swizzle_x = inst->Src[0].Register.SwizzleX;

	if (singledest) {
		/* widen the single-channel mask to the channel pair holding
		 * the double, picking XY vs ZW from the source swizzle */
		switch (write_mask) {
		case 0x1:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else
				write_mask = 0x3;
			break;
		case 0x2:
			if (swizzle_x == 2) {
				write_mask = 0xc;
				use_tmp = 3;
			} else {
				write_mask = 0x3;
				use_tmp = 1;
			}
			break;
		case 0x4:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else
				write_mask = 0xc;
			break;
		case 0x8:
			if (swizzle_x == 0) {
				write_mask = 0x3;
				use_tmp = 1;
			} else {
				write_mask = 0xc;
				use_tmp = 3;
			}
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			if (use_tmp || dest_temp) {
				alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else {
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			}
			/* only the low channel of each pair carries the
			 * 32-bit result for single-dest ops */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op_override ? op_override : ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64 operands swap the channel order within a pair */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				/* DABS: clear the sign on the high dword only */
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		lasti = tgsi_last_instruction(write_mask);
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;

			if (dest_temp) {
				alu.dst.sel = dest_temp;
				alu.dst.chan = i;
				alu.dst.write = 1;
			} else
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4262
4263 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4264 {
4265 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4266 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4267 /* confirm writemasking */
4268 if ((write_mask & 0x3) != 0x3 &&
4269 (write_mask & 0xc) != 0xc) {
4270 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4271 return -1;
4272 }
4273 return tgsi_op2_64_params(ctx, false, false, 0, 0);
4274 }
4275
/* 64-bit op producing a single double result (e.g. DADD-style ops). */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false, 0, 0);
}
4280
/* Single-double-result 64-bit op with src0/src1 swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true, 0, 0);
}
4285
/* Emit a three-source 64-bit op (e.g. DFMA).  All four ALU slots are
 * filled; slots 0-2 read source channel 1 and slot 3 reads channel 0
 * (the per-slot operand pairing the 64-bit ops expect).  Channels not in
 * the writemask are redirected to a scratch temp so the real destination
 * is untouched. */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp; /* dummy destination, write stays 0 */

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4318
/* Common emitter for two-source 32-bit ALU ops.
 *
 * swap:       exchange src0/src1 (for TGSI opcodes with reversed operand
 *             order relative to the hardware op);
 * trans_only: the op runs only in the transcendental slot, so each
 *             instruction must end its ALU group (last=1 on every slot)
 *             and, when more than one channel is written, the result is
 *             staged in ctx->temp_reg and copied out with MOVs.
 *
 * MUL_IEEE is demoted to MUL when the shader declares
 * TGSI_PROPERTY_MUL_ZERO_WINS (0 * anything == 0 semantics).
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4382
/* Plain two-source op: natural operand order, vector slots allowed. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
4387
/* Two-source op with src0/src1 exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4392
/* Two-source op restricted to the transcendental slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4397
4398 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4399 {
4400 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4401 struct r600_bytecode_alu alu;
4402 int i, r;
4403 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4404
4405 for (i = 0; i < lasti + 1; i++) {
4406
4407 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4408 continue;
4409 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4410 alu.op = ctx->inst_info->op;
4411
4412 alu.src[0].sel = V_SQ_ALU_SRC_0;
4413
4414 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4415
4416 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4417
4418 if (i == lasti) {
4419 alu.last = 1;
4420 }
4421 r = r600_bytecode_add_alu(ctx->bc, &alu);
4422 if (r)
4423 return r;
4424 }
4425 return 0;
4426
4427 }
4428
4429 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4430 {
4431 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4432 struct r600_bytecode_alu alu;
4433 int i, r;
4434 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4435
4436 for (i = 0; i < lasti + 1; i++) {
4437
4438 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4439 continue;
4440 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4441 alu.op = ALU_OP1_MOV;
4442
4443 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4444
4445 if (i == 1 || i == 3)
4446 r600_bytecode_src_toggle_neg(&alu.src[0]);
4447 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4448
4449 if (i == lasti) {
4450 alu.last = 1;
4451 }
4452 r = r600_bytecode_add_alu(ctx->bc, &alu);
4453 if (r)
4454 return r;
4455 }
4456 return 0;
4457
4458 }
4459
/* DFRACEXP: split a double into significand (Dst[0]) and exponent (Dst[1]).
 * The raw op is run in all four slots into ctx->temp_reg; the significand
 * double is then replicated from temp channels 2/3 into the written Dst[0]
 * channels, and the integer exponent is copied from temp channel 1 into
 * the first written Dst[1] channel.
 * NOTE(review): the temp channel layout (significand in zw, exponent in y)
 * is inferred from the copy-out code below — confirm against the ISA doc. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* 64-bit operands swap channel order within a pair */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
4522
4523
/* I2D/U2D on evergreen/cayman: convert a 32-bit integer to a double.
 *
 * A single 32-bit float cannot hold every 32-bit integer exactly, so the
 * value is split into a high 24-bit part (mask 0xffffff00) and a low
 * 8-bit part (mask 0xff) — both exactly representable as floats — each
 * converted to float, widened to double with FLT32_TO_FLT64, and finally
 * recombined with ADD_64.  Each source channel c expands into the
 * destination channel pair (2c, 2c+1). */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, c, r;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			/* split into 24-bit int and 8-bit int */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xffffff00;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_AND_INT;
			alu.dst.sel = temp_reg;
			alu.dst.chan = dchan + 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0xff;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* convert both halves to float; the high part uses the signed or
	 * unsigned op from inst_info, the low 8 bits are always unsigned */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = dchan; i <= dchan + 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;

				alu.src[0].sel = temp_reg;
				alu.src[0].chan = i;
				alu.dst.sel = temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
				if (ctx->bc->chip_class == CAYMAN)
					alu.last = i == dchan + 1;
				else
					alu.last = 1; /* trans only ops on evergreen */

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	/* widen each float to a double (low dword is a literal 0) and sum
	 * the two halves with ADD_64 into the real destination */
	for (c = 0; c < 2; c++) {
		int dchan = c * 2;
		if (write_mask & (0x3 << dchan)) {
			for (i = 0; i < 4; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_FLT32_TO_FLT64;

				alu.src[0].chan = dchan + (i / 2);
				if (i == 0 || i == 2)
					alu.src[0].sel = temp_reg;
				else {
					alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
					alu.src[0].value = 0x0;
				}
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.last = i == 3;
				alu.dst.write = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

			for (i = 0; i <= 1; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_64;

				alu.src[0].chan = fp64_switch(i);
				alu.src[0].sel = ctx->temp_reg;

				alu.src[1].chan = fp64_switch(i + 2);
				alu.src[1].sel = ctx->temp_reg;
				tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
				alu.last = i == 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		}
	}

	return 0;
}
4635
/* D2I/D2U on evergreen/cayman: narrow the double to a 32-bit float in a
 * temp (FLT64_TO_FLT32 via tgsi_op2_64_params), then run the per-channel
 * float-to-int op from inst_info into the real destination. */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int treg = r600_get_temp(ctx);
	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
	       inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	/* do a 64->32 into a temp register */
	r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
	if (r)
		return r;

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].chan = i;
		alu.src[0].sel = treg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = (i == lasti);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4669
/* Emit a cayman unary 64-bit op (RECIP_64, RSQ_64, ...) across three
 * vector slots, writing the result double into dst_reg.xy only.
 * Sources are fed as the (high, low) channel pair of *src; when abs is
 * set (DRSQ/DSQRT take |x|) the modifier is applied to the second slot.
 * On non-cayman chips every instruction ends its ALU group. */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4704
/* Lower a unary TGSI double op (DRSQ/DSQRT/DRCP-style) on cayman: run the
 * raw op into temp_reg.xy, then MOV the pair into whichever channel pair
 * (XY or ZW) the writemask selects.  Only one double per instruction is
 * supported. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		/* DRSQ and DSQRT operate on |x| */
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* result lives in t1.xy regardless of destination pair */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4745
/* Cayman replicating emitter: cayman has no separate transcendental unit,
 * so a scalar trans op is issued in 3 (or 4, if W is written) vector
 * slots, each reading source channel 0; the writemask picks which slot
 * results are actually stored. */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4775
/* Cayman integer multiply (MULLO/MULHI-style ops): for each written
 * channel k the op must be issued in all four slots, with only slot k's
 * result kept in the temp; the temp is then copied to the destination.
 * Results are staged in temp_reg so sources aren't clobbered mid-loop. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy the staged results to the real destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4823
4824
/* Cayman double multiply: issue the 64-bit op across all four slots with
 * the operand channel pattern (high, high, high, low) for the selected
 * XY/ZW pair, stage the result in temp_reg, then copy it out. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects which source channel pair feeds the op */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4874
4875 /*
4876 * Emit RECIP_64 + MUL_64 to implement division.
4877 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time. This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects the XY (0) or ZW (1) source channel pair */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* t1.xy = 1.0 / src1 */
	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	/* t1 = src0 * (1.0 / src1), operand channels (high,high,high,low) */
	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the quotient pair from t1.xy into the destination pair */
	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4931
4932 /*
4933 * r600 - trunc to -PI..PI range
4934 * r700 - normalize by dividing by 2PI
4935 * see fdo bug 27901
4936 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* temp.x = src * (1 / 2PI) + 0.5  — map one period to [0, 1) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = fract(temp.x) — reduce to a single period */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* rescale into the range the hardware trig op expects */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		/* r600 wants radians in -PI..PI: temp * 2PI - PI */
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		/* r700+ wants the normalized period: temp * 1 - 0.5 */
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
5007
/* Cayman SIN/COS: set up the normalized argument in temp.x, then issue
 * the trig op in 3 (or 4, if W is written) vector slots, each reading
 * temp.x; the writemask selects which results are stored. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5038
/* SIN/COS on r600-evergreen: normalize the argument (tgsi_setup_trig),
 * run the trig op once into temp.x, then replicate the scalar result to
 * every written destination channel with MOVs. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5081
/* KILL / KILL_IF: emit the kill compare op in all four slots.
 * Unconditional KILL compares 0 against -1 (always true); KILL_IF
 * compares 0 against the per-channel source.  A kill instruction must be
 * the last in its ALU clause, so force a new CF afterwards. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
			/* -1.0 makes the comparison unconditionally true */
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}
5115
/* Translate the TGSI LIT instruction (lighting coefficients):
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = EXP(MUL_LIT(LOG_CLAMPED(max(src.y, 0.0)), src.w, src.x))
 *   dst.w = 1.0
 * dst.z is only computed when its write-mask bit is set.  MUL_LIT is the
 * hardware helper op for LIT — presumably it clamps the exponent and zeroes
 * the result for non-positive src.x; confirm against the R600 ISA docs.
 */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			/* LOG_CLAMPED is a t-slot-only op: on Cayman it is
			 * replicated across vector slots (see CAYMAN notes at
			 * the top of this file); only slot 2 writes. */
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed for the MUL_LIT below */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			/* EXP is likewise replicated over vector slots on Cayman. */
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
5265
5266 static int tgsi_rsq(struct r600_shader_ctx *ctx)
5267 {
5268 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5269 struct r600_bytecode_alu alu;
5270 int i, r;
5271
5272 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5273
5274 alu.op = ALU_OP1_RECIPSQRT_IEEE;
5275
5276 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5277 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5278 r600_bytecode_src_set_abs(&alu.src[i]);
5279 }
5280 alu.dst.sel = ctx->temp_reg;
5281 alu.dst.write = 1;
5282 alu.last = 1;
5283 r = r600_bytecode_add_alu(ctx->bc, &alu);
5284 if (r)
5285 return r;
5286 /* replicate result */
5287 return tgsi_helper_tempx_replicate(ctx);
5288 }
5289
5290 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5291 {
5292 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5293 struct r600_bytecode_alu alu;
5294 int i, r;
5295
5296 for (i = 0; i < 4; i++) {
5297 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5298 alu.src[0].sel = ctx->temp_reg;
5299 alu.op = ALU_OP1_MOV;
5300 alu.dst.chan = i;
5301 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5302 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5303 if (i == 3)
5304 alu.last = 1;
5305 r = r600_bytecode_add_alu(ctx->bc, &alu);
5306 if (r)
5307 return r;
5308 }
5309 return 0;
5310 }
5311
5312 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5313 {
5314 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5315 struct r600_bytecode_alu alu;
5316 int i, r;
5317
5318 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5319 alu.op = ctx->inst_info->op;
5320 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5321 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5322 }
5323 alu.dst.sel = ctx->temp_reg;
5324 alu.dst.write = 1;
5325 alu.last = 1;
5326 r = r600_bytecode_add_alu(ctx->bc, &alu);
5327 if (r)
5328 return r;
5329 /* replicate result */
5330 return tgsi_helper_tempx_replicate(ctx);
5331 }
5332
5333 static int cayman_pow(struct r600_shader_ctx *ctx)
5334 {
5335 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5336 int i, r;
5337 struct r600_bytecode_alu alu;
5338 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5339
5340 for (i = 0; i < 3; i++) {
5341 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5342 alu.op = ALU_OP1_LOG_IEEE;
5343 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5344 alu.dst.sel = ctx->temp_reg;
5345 alu.dst.chan = i;
5346 alu.dst.write = 1;
5347 if (i == 2)
5348 alu.last = 1;
5349 r = r600_bytecode_add_alu(ctx->bc, &alu);
5350 if (r)
5351 return r;
5352 }
5353
5354 /* b * LOG2(a) */
5355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5356 alu.op = ALU_OP2_MUL;
5357 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5358 alu.src[1].sel = ctx->temp_reg;
5359 alu.dst.sel = ctx->temp_reg;
5360 alu.dst.write = 1;
5361 alu.last = 1;
5362 r = r600_bytecode_add_alu(ctx->bc, &alu);
5363 if (r)
5364 return r;
5365
5366 for (i = 0; i < last_slot; i++) {
5367 /* POW(a,b) = EXP2(b * LOG2(a))*/
5368 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5369 alu.op = ALU_OP1_EXP_IEEE;
5370 alu.src[0].sel = ctx->temp_reg;
5371
5372 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5373 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5374 if (i == last_slot - 1)
5375 alu.last = 1;
5376 r = r600_bytecode_add_alu(ctx->bc, &alu);
5377 if (r)
5378 return r;
5379 }
5380 return 0;
5381 }
5382
5383 static int tgsi_pow(struct r600_shader_ctx *ctx)
5384 {
5385 struct r600_bytecode_alu alu;
5386 int r;
5387
5388 /* LOG2(a) */
5389 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5390 alu.op = ALU_OP1_LOG_IEEE;
5391 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5392 alu.dst.sel = ctx->temp_reg;
5393 alu.dst.write = 1;
5394 alu.last = 1;
5395 r = r600_bytecode_add_alu(ctx->bc, &alu);
5396 if (r)
5397 return r;
5398 /* b * LOG2(a) */
5399 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5400 alu.op = ALU_OP2_MUL;
5401 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5402 alu.src[1].sel = ctx->temp_reg;
5403 alu.dst.sel = ctx->temp_reg;
5404 alu.dst.write = 1;
5405 alu.last = 1;
5406 r = r600_bytecode_add_alu(ctx->bc, &alu);
5407 if (r)
5408 return r;
5409 /* POW(a,b) = EXP2(b * LOG2(a))*/
5410 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5411 alu.op = ALU_OP1_EXP_IEEE;
5412 alu.src[0].sel = ctx->temp_reg;
5413 alu.dst.sel = ctx->temp_reg;
5414 alu.dst.write = 1;
5415 alu.last = 1;
5416 r = r600_bytecode_add_alu(ctx->bc, &alu);
5417 if (r)
5418 return r;
5419 return tgsi_helper_tempx_replicate(ctx);
5420 }
5421
5422 static int emit_mul_int_op(struct r600_bytecode *bc,
5423 struct r600_bytecode_alu *alu_src)
5424 {
5425 struct r600_bytecode_alu alu;
5426 int i, r;
5427 alu = *alu_src;
5428 if (bc->chip_class == CAYMAN) {
5429 for (i = 0; i < 4; i++) {
5430 alu.dst.chan = i;
5431 alu.dst.write = (i == alu_src->dst.chan);
5432 alu.last = (i == 3);
5433
5434 r = r600_bytecode_add_alu(bc, &alu);
5435 if (r)
5436 return r;
5437 }
5438 } else {
5439 alu.last = 1;
5440 r = r600_bytecode_add_alu(bc, &alu);
5441 if (r)
5442 return r;
5443 }
5444 return 0;
5445 }
5446
/* Translate integer division/modulo (UDIV/UMOD/IDIV/IMOD), one ALU sequence
 * per write-masked channel.
 *   mod:       0 = emit the quotient (DIV), 1 = emit the remainder (MOD)
 *   signed_op: 0 = unsigned; 1 = signed — run the unsigned algorithm on
 *              absolute values and fix the result's sign at the end.
 * The numbered steps in the body refer to the algorithm outline below.
 */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y       = r
	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 * 15. tmp1.z = tmp0.z + 1			= q + 1
	 * 16. tmp1.w = tmp0.z - 1			= q - 1
	 *
	 * else MOD
	 *
	 * 15. tmp1.z = tmp0.w - src2			= r - src2
	 * 16. tmp1.w = tmp0.w + src2			= r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src2 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman path emulates the unsigned reciprocal through
			 * float math — presumably RECIP_UINT is unavailable
			 * there; confirm against the Cayman ISA docs. */
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* scale the reciprocal back up: 0x4f800000 is 2^32 as a float */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000;

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULLO_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;

		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 2;

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 0;

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULHI_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;

		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MULLO_UINT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		if ((r = emit_mul_int_op(ctx->bc, &alu)))
			return r;

		/* 12. tmp0.w = src1 - tmp0.y       = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */

			/* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
		 * NOTE(review): step 20 from the outline above
		 * (dst = src2==0 ? MAX_UINT : tmp0.z) is not emitted anywhere
		 * in this function — a divide-by-zero result is whatever this
		 * step produces.  Confirm whether that is intentional. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}
	return 0;
}
6166
/* UDIV: unsigned integer division — tgsi_divmod with mod=0, signed_op=0. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
6171
/* UMOD: unsigned integer modulo — tgsi_divmod with mod=1, signed_op=0. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
6176
/* IDIV: signed integer division — tgsi_divmod with mod=0, signed_op=1. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
6181
/* IMOD: signed integer modulo — tgsi_divmod with mod=1, signed_op=1. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
6186
6187
/* Float-to-integer conversion (F2I/F2U): TRUNC each written channel into a
 * temp first, then apply the per-instruction conversion op from
 * ctx->inst_info. */
static int tgsi_f2i(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* Pass 1: tmp = trunc(src) for every write-masked channel. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_TRUNC;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Pass 2: dst = convert(tmp) with the instruction-specific opcode. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		/* FLT_TO_UINT always closes its ALU group here — presumably
		 * because it is a trans-unit-only op on these chips; confirm
		 * against the R600 ISA docs. */
		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6236
6237 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6238 {
6239 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6240 struct r600_bytecode_alu alu;
6241 int i, r;
6242 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6243 int last_inst = tgsi_last_instruction(write_mask);
6244
6245 /* tmp = -src */
6246 for (i = 0; i < 4; i++) {
6247 if (!(write_mask & (1<<i)))
6248 continue;
6249
6250 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6251 alu.op = ALU_OP2_SUB_INT;
6252
6253 alu.dst.sel = ctx->temp_reg;
6254 alu.dst.chan = i;
6255 alu.dst.write = 1;
6256
6257 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6258 alu.src[0].sel = V_SQ_ALU_SRC_0;
6259
6260 if (i == last_inst)
6261 alu.last = 1;
6262 r = r600_bytecode_add_alu(ctx->bc, &alu);
6263 if (r)
6264 return r;
6265 }
6266
6267 /* dst = (src >= 0 ? src : tmp) */
6268 for (i = 0; i < 4; i++) {
6269 if (!(write_mask & (1<<i)))
6270 continue;
6271
6272 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6273 alu.op = ALU_OP3_CNDGE_INT;
6274 alu.is_op3 = 1;
6275 alu.dst.write = 1;
6276
6277 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6278
6279 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6280 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6281 alu.src[2].sel = ctx->temp_reg;
6282 alu.src[2].chan = i;
6283
6284 if (i == last_inst)
6285 alu.last = 1;
6286 r = r600_bytecode_add_alu(ctx->bc, &alu);
6287 if (r)
6288 return r;
6289 }
6290 return 0;
6291 }
6292
6293 static int tgsi_issg(struct r600_shader_ctx *ctx)
6294 {
6295 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6296 struct r600_bytecode_alu alu;
6297 int i, r;
6298 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6299 int last_inst = tgsi_last_instruction(write_mask);
6300
6301 /* tmp = (src >= 0 ? src : -1) */
6302 for (i = 0; i < 4; i++) {
6303 if (!(write_mask & (1<<i)))
6304 continue;
6305
6306 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6307 alu.op = ALU_OP3_CNDGE_INT;
6308 alu.is_op3 = 1;
6309
6310 alu.dst.sel = ctx->temp_reg;
6311 alu.dst.chan = i;
6312 alu.dst.write = 1;
6313
6314 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6315 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6316 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6317
6318 if (i == last_inst)
6319 alu.last = 1;
6320 r = r600_bytecode_add_alu(ctx->bc, &alu);
6321 if (r)
6322 return r;
6323 }
6324
6325 /* dst = (tmp > 0 ? 1 : tmp) */
6326 for (i = 0; i < 4; i++) {
6327 if (!(write_mask & (1<<i)))
6328 continue;
6329
6330 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6331 alu.op = ALU_OP3_CNDGT_INT;
6332 alu.is_op3 = 1;
6333 alu.dst.write = 1;
6334
6335 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6336
6337 alu.src[0].sel = ctx->temp_reg;
6338 alu.src[0].chan = i;
6339
6340 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6341
6342 alu.src[2].sel = ctx->temp_reg;
6343 alu.src[2].chan = i;
6344
6345 if (i == last_inst)
6346 alu.last = 1;
6347 r = r600_bytecode_add_alu(ctx->bc, &alu);
6348 if (r)
6349 return r;
6350 }
6351 return 0;
6352 }
6353
6354
6355
6356 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6357 {
6358 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6359 struct r600_bytecode_alu alu;
6360 int i, r;
6361
6362 /* tmp = (src > 0 ? 1 : src) */
6363 for (i = 0; i < 4; i++) {
6364 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6365 alu.op = ALU_OP3_CNDGT;
6366 alu.is_op3 = 1;
6367
6368 alu.dst.sel = ctx->temp_reg;
6369 alu.dst.chan = i;
6370
6371 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6372 alu.src[1].sel = V_SQ_ALU_SRC_1;
6373 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6374
6375 if (i == 3)
6376 alu.last = 1;
6377 r = r600_bytecode_add_alu(ctx->bc, &alu);
6378 if (r)
6379 return r;
6380 }
6381
6382 /* dst = (-tmp > 0 ? -1 : tmp) */
6383 for (i = 0; i < 4; i++) {
6384 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6385 alu.op = ALU_OP3_CNDGT;
6386 alu.is_op3 = 1;
6387 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6388
6389 alu.src[0].sel = ctx->temp_reg;
6390 alu.src[0].chan = i;
6391 alu.src[0].neg = 1;
6392
6393 alu.src[1].sel = V_SQ_ALU_SRC_1;
6394 alu.src[1].neg = 1;
6395
6396 alu.src[2].sel = ctx->temp_reg;
6397 alu.src[2].chan = i;
6398
6399 if (i == 3)
6400 alu.last = 1;
6401 r = r600_bytecode_add_alu(ctx->bc, &alu);
6402 if (r)
6403 return r;
6404 }
6405 return 0;
6406 }
6407
/* Emit TGSI BFI (bitfield insert):
 *   dst = base, with @bits bits of @insert placed at bit @offset.
 * Operand order: src0 = base, src1 = insert, src2 = offset, src3 = bits.
 *
 * The hardware BFM mask only covers widths 0..31, so a bits >= 32 flag is
 * computed up front and used by the final CNDE to return the plain insert
 * value in that case (the bits == 32 behavior GLSL requires).
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg = (bits >= 32), consumed by the final select below */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp: t1 = BFM(bits, offset) = ((1 << bits) - 1) << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left: t2 = insert << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert: dst = BFI(mask, shifted insert, base) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (bits >= 32) ? insert : dst -- src[1] reads back the BFI
	 * result just written to the destination */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6525
/* Emit TGSI [U]MSB: the bit index of the most significant set bit (UMSB)
 * or of the first bit differing from the sign bit (IMSB).
 *
 * The hardware FFBH ops count from the MSB end while TGSI wants an
 * LSB-relative index, so the result is mirrored with t2 = 31 - t1.
 * FFBH returns -1 when no such bit exists; that sentinel must survive,
 * hence the final CNDGE select.
 */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	/* raw FFBH result goes into the instruction's scratch register */
	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 (convert msb-relative index to lsb-relative) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 (keep the -1 "not found" value) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6611
/* Emit the Evergreen/Cayman TGSI interpolation opcodes
 * (INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE).
 *
 * Barycentric i/j values produced by the fixed-function interpolators are
 * packed two pairs per GPR (ij_index picks the GPR and channel pair).
 * For OFFSET/SAMPLE the center i/j is first adjusted by
 * offset * d(i,j)/d(x,y), with the gradients obtained via the
 * GET_GRADIENTS_H/V fetch ops; the attribute is then interpolated with
 * INTERP_ZW/INTERP_XY ALU groups against the input's parameter constant.
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	/* pick the interpolator slot matching this input's mode; fall back
	 * to slot 0 if no exact match was allocated */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* two i/j pairs per GPR: /2 selects the GPR, %2 the xy/zw pair */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* d(i,j)/dx -> gradientsH.xy, d(i,j)/dy -> gradientsV.xy */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;	/* 7 = mask off the channel */
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = i/j + offset_x * d(i,j)/dx
		 * (for SAMPLE, chans 2/3 of the loaded sample position hold
		 * the x/y offsets -- see load_sample_position) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy += offset_y * d(i,j)/dy */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Two 4-slot INTERP groups: the first produces tmp.zw, the second
	 * tmp.xy; only slots 2..5 carry a written result.  Each slot pair is
	 * fed j then i (src0 chan alternates 1, 0). */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		/* INTERP ops need this fixed operand/bank layout */
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst; copy the requested channels out of tmp
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
6773
6774
6775 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6776 {
6777 struct r600_bytecode_alu alu;
6778 int i, r;
6779
6780 for (i = 0; i < 4; i++) {
6781 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6782 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6783 alu.op = ALU_OP0_NOP;
6784 alu.dst.chan = i;
6785 } else {
6786 alu.op = ALU_OP1_MOV;
6787 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6788 alu.src[0].sel = ctx->temp_reg;
6789 alu.src[0].chan = i;
6790 }
6791 if (i == 3) {
6792 alu.last = 1;
6793 }
6794 r = r600_bytecode_add_alu(ctx->bc, &alu);
6795 if (r)
6796 return r;
6797 }
6798 return 0;
6799 }
6800
6801 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6802 unsigned temp, int chan,
6803 struct r600_bytecode_alu_src *bc_src,
6804 const struct r600_shader_src *shader_src)
6805 {
6806 struct r600_bytecode_alu alu;
6807 int r;
6808
6809 r600_bytecode_src(bc_src, shader_src, chan);
6810
6811 /* op3 operands don't support abs modifier */
6812 if (bc_src->abs) {
6813 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
6814 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6815 alu.op = ALU_OP1_MOV;
6816 alu.dst.sel = temp;
6817 alu.dst.chan = chan;
6818 alu.dst.write = 1;
6819
6820 alu.src[0] = *bc_src;
6821 alu.last = true; // sufficient?
6822 r = r600_bytecode_add_alu(ctx->bc, &alu);
6823 if (r)
6824 return r;
6825
6826 memset(bc_src, 0, sizeof(*bc_src));
6827 bc_src->sel = temp;
6828 bc_src->chan = chan;
6829 }
6830 return 0;
6831 }
6832
/* Emit a 3-operand ALU instruction for each enabled write-mask channel.
 * @dst: override GPR for the result, or -1 to use the TGSI destination.
 *
 * MULADD_IEEE is demoted to MULADD when the shader declares
 * MUL_ZERO_WINS (0 * x == 0 regardless of x).  Sources carrying the abs
 * modifier are staged through scratch registers, since op3 encodings
 * cannot express abs (see tgsi_make_src_for_op3).
 */
static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int temp_regs[4];
	unsigned op = ctx->inst_info->op;

	if (op == ALU_OP3_MULADD_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP3_MULADD;

	/* reserve a scratch reg per abs-modified source (0 = none needed) */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		temp_regs[j] = 0;
		if (ctx->src[j].abs)
			temp_regs[j] = r600_get_temp(ctx);
	}
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
			if (r)
				return r;
		}

		if (dst == -1) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		} else {
			alu.dst.sel = dst;
		}
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6880
/* Plain 3-operand ALU instruction writing to the TGSI destination. */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	return tgsi_op3_dst(ctx, -1);
}
6885
/* Emit TGSI dot products (DP2/DP3/DP4).  DOT4 reduces across the four
 * slots of a single ALU group and replicates the scalar result to every
 * channel; DP2/DP3 are derived from it by feeding the constant 0 into the
 * unused trailing channels.  DOT4_IEEE is demoted to DOT4 when the shader
 * declares MUL_ZERO_WINS.
 */
static int tgsi_dp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	unsigned op = ctx->inst_info->op;
	if (op == ALU_OP2_DOT4_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_DOT4;

	/* all four slots must be emitted even for a partial write mask */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		/* handle some special cases: zero the channels beyond the
		 * dot width so they don't contribute to the sum */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_DP2:
			if (i > 1) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		case TGSI_OPCODE_DP3:
			if (i > 2) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		default:
			break;
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6932
6933 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6934 unsigned index)
6935 {
6936 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6937 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6938 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6939 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6940 ctx->src[index].neg || ctx->src[index].abs ||
6941 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
6942 }
6943
6944 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6945 unsigned index)
6946 {
6947 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6948 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6949 }
6950
/* Emit a TXF from a buffer resource as a vertex-cache fetch (VFETCH).
 * The texel index may first need copying to a GPR (fetches cannot read
 * non-GPR files or apply source modifiers).
 *
 * On pre-Evergreen parts the raw fetch result is additionally patched up
 * from the driver-maintained per-buffer info constants
 * (R600_BUFFER_INFO_CONST_BUFFER): each written channel is ANDed with a
 * mask word and the w channel is ORed with a second word -- presumably to
 * emulate format/swizzle handling the older vertex cache lacks (TODO
 * confirm against the constant buffer layout set up by the driver).
 */
static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int src_gpr, r, i;
	int id = tgsi_tex_get_src_gpr(ctx, 1);
	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	if (src_requires_loading) {
		/* stage the texel index into the scratch register */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_gpr = ctx->temp_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = src_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* dest select 7 masks out channels the write mask excludes */
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
	vtx.use_const_fields = 1;
	vtx.buffer_index_mode = sampler_index_mode;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	/* Evergreen+ needs no post-fetch fixup */
	if (ctx->bc->chip_class >= EVERGREEN)
		return 0;

	/* pre-EG: AND each written channel with its per-buffer mask word */
	for (i = 0; i < 4; i++) {
		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.chan = i;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = i;

		/* two info constants per buffer, hence id * 2 */
		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
		alu.src[1].sel += (id * 2);
		alu.src[1].chan = i % 4;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pre-EG: OR the alpha channel with the second info word */
	if (inst->Dst[0].Register.WriteMask & 3) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_OR_INT;

		alu.dst.chan = 3;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7047
/* Emit TXQ for a buffer resource (query its element count).
 * @reg_idx: TGSI source operand holding the resource.
 * @offset: bias added to the resolved resource id.
 * @eg_buffer_base: base slot of buffer resources on Evergreen+.
 *
 * Pre-Evergreen has no RESINFO support for buffers, so the size is read
 * from the driver-maintained per-buffer info constants; Evergreen+ can
 * query the resource directly with GET_BUFFER_RESINFO.
 */
static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->bc->chip_class < EVERGREEN) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* on r600 the size sits in the second dword of this buffer's
		 * info constant pair; chan 1 is what the driver writes there
		 * (NOTE(review): an earlier comment said channel 2) */
		alu.src[0].sel += (id * 2) + 1;
		alu.src[0].chan = 1;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		return 0;
	} else {
		struct r600_bytecode_vtx vtx;
		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
		vtx.buffer_id = id + eg_buffer_base;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.src_gpr = 0;
		vtx.mega_fetch_count = 16; /* no idea here really... */
		vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
		vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
		vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;		/* SEL_Y */
		vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;		/* SEL_Z */
		vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7;		/* SEL_W */
		vtx.data_format = FMT_32_32_32_32;
		vtx.buffer_index_mode = sampler_index_mode;

		if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
			return r;
		return 0;
	}
}
7091
7092
7093 static int tgsi_tex(struct r600_shader_ctx *ctx)
7094 {
7095 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7096 struct r600_bytecode_tex tex;
7097 struct r600_bytecode_alu alu;
7098 unsigned src_gpr;
7099 int r, i, j;
7100 int opcode;
7101 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7102 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7103 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7104 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7105
7106 bool txf_add_offsets = inst->Texture.NumOffsets &&
7107 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7108 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7109
7110 /* Texture fetch instructions can only use gprs as source.
7111 * Also they cannot negate the source or take the absolute value */
7112 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7113 tgsi_tex_src_requires_loading(ctx, 0)) ||
7114 read_compressed_msaa || txf_add_offsets;
7115
7116 boolean src_loaded = FALSE;
7117 unsigned sampler_src_reg = 1;
7118 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7119 boolean has_txq_cube_array_z = false;
7120 unsigned sampler_index_mode;
7121
7122 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7123 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7124 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7125 if (inst->Dst[0].Register.WriteMask & 4) {
7126 ctx->shader->has_txq_cube_array_z_comp = true;
7127 has_txq_cube_array_z = true;
7128 }
7129
7130 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7131 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7132 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7133 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7134 sampler_src_reg = 2;
7135
7136 /* TGSI moves the sampler to src reg 3 for TXD */
7137 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7138 sampler_src_reg = 3;
7139
7140 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7141
7142 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7143
7144 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7145 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7146 if (ctx->bc->chip_class < EVERGREEN)
7147 ctx->shader->uses_tex_buffers = true;
7148 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7149 }
7150 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7151 if (ctx->bc->chip_class < EVERGREEN)
7152 ctx->shader->uses_tex_buffers = true;
7153 return do_vtx_fetch_inst(ctx, src_requires_loading);
7154 }
7155 }
7156
7157 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7158 int out_chan;
7159 /* Add perspective divide */
7160 if (ctx->bc->chip_class == CAYMAN) {
7161 out_chan = 2;
7162 for (i = 0; i < 3; i++) {
7163 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7164 alu.op = ALU_OP1_RECIP_IEEE;
7165 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7166
7167 alu.dst.sel = ctx->temp_reg;
7168 alu.dst.chan = i;
7169 if (i == 2)
7170 alu.last = 1;
7171 if (out_chan == i)
7172 alu.dst.write = 1;
7173 r = r600_bytecode_add_alu(ctx->bc, &alu);
7174 if (r)
7175 return r;
7176 }
7177
7178 } else {
7179 out_chan = 3;
7180 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7181 alu.op = ALU_OP1_RECIP_IEEE;
7182 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7183
7184 alu.dst.sel = ctx->temp_reg;
7185 alu.dst.chan = out_chan;
7186 alu.last = 1;
7187 alu.dst.write = 1;
7188 r = r600_bytecode_add_alu(ctx->bc, &alu);
7189 if (r)
7190 return r;
7191 }
7192
7193 for (i = 0; i < 3; i++) {
7194 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7195 alu.op = ALU_OP2_MUL;
7196 alu.src[0].sel = ctx->temp_reg;
7197 alu.src[0].chan = out_chan;
7198 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7199 alu.dst.sel = ctx->temp_reg;
7200 alu.dst.chan = i;
7201 alu.dst.write = 1;
7202 r = r600_bytecode_add_alu(ctx->bc, &alu);
7203 if (r)
7204 return r;
7205 }
7206 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7207 alu.op = ALU_OP1_MOV;
7208 alu.src[0].sel = V_SQ_ALU_SRC_1;
7209 alu.src[0].chan = 0;
7210 alu.dst.sel = ctx->temp_reg;
7211 alu.dst.chan = 3;
7212 alu.last = 1;
7213 alu.dst.write = 1;
7214 r = r600_bytecode_add_alu(ctx->bc, &alu);
7215 if (r)
7216 return r;
7217 src_loaded = TRUE;
7218 src_gpr = ctx->temp_reg;
7219 }
7220
7221
7222 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7223 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7224 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7225 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7226 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7227
7228 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7229 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7230
7231 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7232 for (i = 0; i < 4; i++) {
7233 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7234 alu.op = ALU_OP2_CUBE;
7235 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7236 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7237 alu.dst.sel = ctx->temp_reg;
7238 alu.dst.chan = i;
7239 if (i == 3)
7240 alu.last = 1;
7241 alu.dst.write = 1;
7242 r = r600_bytecode_add_alu(ctx->bc, &alu);
7243 if (r)
7244 return r;
7245 }
7246
7247 /* tmp1.z = RCP_e(|tmp1.z|) */
7248 if (ctx->bc->chip_class == CAYMAN) {
7249 for (i = 0; i < 3; i++) {
7250 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7251 alu.op = ALU_OP1_RECIP_IEEE;
7252 alu.src[0].sel = ctx->temp_reg;
7253 alu.src[0].chan = 2;
7254 alu.src[0].abs = 1;
7255 alu.dst.sel = ctx->temp_reg;
7256 alu.dst.chan = i;
7257 if (i == 2)
7258 alu.dst.write = 1;
7259 if (i == 2)
7260 alu.last = 1;
7261 r = r600_bytecode_add_alu(ctx->bc, &alu);
7262 if (r)
7263 return r;
7264 }
7265 } else {
7266 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7267 alu.op = ALU_OP1_RECIP_IEEE;
7268 alu.src[0].sel = ctx->temp_reg;
7269 alu.src[0].chan = 2;
7270 alu.src[0].abs = 1;
7271 alu.dst.sel = ctx->temp_reg;
7272 alu.dst.chan = 2;
7273 alu.dst.write = 1;
7274 alu.last = 1;
7275 r = r600_bytecode_add_alu(ctx->bc, &alu);
7276 if (r)
7277 return r;
7278 }
7279
7280 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7281 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7282 * muladd has no writemask, have to use another temp
7283 */
7284 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7285 alu.op = ALU_OP3_MULADD;
7286 alu.is_op3 = 1;
7287
7288 alu.src[0].sel = ctx->temp_reg;
7289 alu.src[0].chan = 0;
7290 alu.src[1].sel = ctx->temp_reg;
7291 alu.src[1].chan = 2;
7292
7293 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7294 alu.src[2].chan = 0;
7295 alu.src[2].value = u_bitcast_f2u(1.5f);
7296
7297 alu.dst.sel = ctx->temp_reg;
7298 alu.dst.chan = 0;
7299 alu.dst.write = 1;
7300
7301 r = r600_bytecode_add_alu(ctx->bc, &alu);
7302 if (r)
7303 return r;
7304
7305 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7306 alu.op = ALU_OP3_MULADD;
7307 alu.is_op3 = 1;
7308
7309 alu.src[0].sel = ctx->temp_reg;
7310 alu.src[0].chan = 1;
7311 alu.src[1].sel = ctx->temp_reg;
7312 alu.src[1].chan = 2;
7313
7314 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7315 alu.src[2].chan = 0;
7316 alu.src[2].value = u_bitcast_f2u(1.5f);
7317
7318 alu.dst.sel = ctx->temp_reg;
7319 alu.dst.chan = 1;
7320 alu.dst.write = 1;
7321
7322 alu.last = 1;
7323 r = r600_bytecode_add_alu(ctx->bc, &alu);
7324 if (r)
7325 return r;
7326 /* write initial compare value into Z component
7327 - W src 0 for shadow cube
7328 - X src 1 for shadow cube array */
7329 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7330 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7331 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7332 alu.op = ALU_OP1_MOV;
7333 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7334 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7335 else
7336 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7337 alu.dst.sel = ctx->temp_reg;
7338 alu.dst.chan = 2;
7339 alu.dst.write = 1;
7340 alu.last = 1;
7341 r = r600_bytecode_add_alu(ctx->bc, &alu);
7342 if (r)
7343 return r;
7344 }
7345
7346 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7347 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7348 if (ctx->bc->chip_class >= EVERGREEN) {
7349 int mytmp = r600_get_temp(ctx);
7350 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7351 alu.op = ALU_OP1_MOV;
7352 alu.src[0].sel = ctx->temp_reg;
7353 alu.src[0].chan = 3;
7354 alu.dst.sel = mytmp;
7355 alu.dst.chan = 0;
7356 alu.dst.write = 1;
7357 alu.last = 1;
7358 r = r600_bytecode_add_alu(ctx->bc, &alu);
7359 if (r)
7360 return r;
7361
7362 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7363 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7364 alu.op = ALU_OP3_MULADD;
7365 alu.is_op3 = 1;
7366 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7367 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7368 alu.src[1].chan = 0;
7369 alu.src[1].value = u_bitcast_f2u(8.0f);
7370 alu.src[2].sel = mytmp;
7371 alu.src[2].chan = 0;
7372 alu.dst.sel = ctx->temp_reg;
7373 alu.dst.chan = 3;
7374 alu.dst.write = 1;
7375 alu.last = 1;
7376 r = r600_bytecode_add_alu(ctx->bc, &alu);
7377 if (r)
7378 return r;
7379 } else if (ctx->bc->chip_class < EVERGREEN) {
7380 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7381 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7382 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7383 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7384 tex.src_gpr = r600_get_temp(ctx);
7385 tex.src_sel_x = 0;
7386 tex.src_sel_y = 0;
7387 tex.src_sel_z = 0;
7388 tex.src_sel_w = 0;
7389 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7390 tex.coord_type_x = 1;
7391 tex.coord_type_y = 1;
7392 tex.coord_type_z = 1;
7393 tex.coord_type_w = 1;
7394 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7395 alu.op = ALU_OP1_MOV;
7396 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7397 alu.dst.sel = tex.src_gpr;
7398 alu.dst.chan = 0;
7399 alu.last = 1;
7400 alu.dst.write = 1;
7401 r = r600_bytecode_add_alu(ctx->bc, &alu);
7402 if (r)
7403 return r;
7404
7405 r = r600_bytecode_add_tex(ctx->bc, &tex);
7406 if (r)
7407 return r;
7408 }
7409
7410 }
7411
7412 /* for cube forms of lod and bias we need to route things */
7413 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7414 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7415 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7416 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7417 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7418 alu.op = ALU_OP1_MOV;
7419 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7420 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7421 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7422 else
7423 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7424 alu.dst.sel = ctx->temp_reg;
7425 alu.dst.chan = 2;
7426 alu.last = 1;
7427 alu.dst.write = 1;
7428 r = r600_bytecode_add_alu(ctx->bc, &alu);
7429 if (r)
7430 return r;
7431 }
7432
7433 src_loaded = TRUE;
7434 src_gpr = ctx->temp_reg;
7435 }
7436
7437 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7438 int temp_h = 0, temp_v = 0;
7439 int start_val = 0;
7440
7441 /* if we've already loaded the src (i.e. CUBE don't reload it). */
7442 if (src_loaded == TRUE)
7443 start_val = 1;
7444 else
7445 src_loaded = TRUE;
7446 for (i = start_val; i < 3; i++) {
7447 int treg = r600_get_temp(ctx);
7448
7449 if (i == 0)
7450 src_gpr = treg;
7451 else if (i == 1)
7452 temp_h = treg;
7453 else
7454 temp_v = treg;
7455
7456 for (j = 0; j < 4; j++) {
7457 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7458 alu.op = ALU_OP1_MOV;
7459 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7460 alu.dst.sel = treg;
7461 alu.dst.chan = j;
7462 if (j == 3)
7463 alu.last = 1;
7464 alu.dst.write = 1;
7465 r = r600_bytecode_add_alu(ctx->bc, &alu);
7466 if (r)
7467 return r;
7468 }
7469 }
7470 for (i = 1; i < 3; i++) {
7471 /* set gradients h/v */
7472 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7473 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7474 FETCH_OP_SET_GRADIENTS_V;
7475 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7476 tex.sampler_index_mode = sampler_index_mode;
7477 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7478 tex.resource_index_mode = sampler_index_mode;
7479
7480 tex.src_gpr = (i == 1) ? temp_h : temp_v;
7481 tex.src_sel_x = 0;
7482 tex.src_sel_y = 1;
7483 tex.src_sel_z = 2;
7484 tex.src_sel_w = 3;
7485
7486 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7487 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7488 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7489 tex.coord_type_x = 1;
7490 tex.coord_type_y = 1;
7491 tex.coord_type_z = 1;
7492 tex.coord_type_w = 1;
7493 }
7494 r = r600_bytecode_add_tex(ctx->bc, &tex);
7495 if (r)
7496 return r;
7497 }
7498 }
7499
7500 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7501 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
7502 * incorrectly forces nearest filtering if the texture format is integer.
7503 * The only effect it has on Gather4, which always returns 4 texels for
7504 * bilinear filtering, is that the final coordinates are off by 0.5 of
7505 * the texel size.
7506 *
7507 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7508 * or (0.5 / size) from the normalized coordinates.
7509 */
7510 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
7511 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
7512 int treg = r600_get_temp(ctx);
7513
7514 /* mov array and comparison oordinate to temp_reg if needed */
7515 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7516 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7517 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
7518 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
7519 for (i = 2; i <= end; i++) {
7520 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7521 alu.op = ALU_OP1_MOV;
7522 alu.dst.sel = ctx->temp_reg;
7523 alu.dst.chan = i;
7524 alu.dst.write = 1;
7525 alu.last = (i == end);
7526 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7527 r = r600_bytecode_add_alu(ctx->bc, &alu);
7528 if (r)
7529 return r;
7530 }
7531 }
7532
7533 if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
7534 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
7535 for (i = 0; i < 2; i++) {
7536 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7537 alu.op = ALU_OP2_ADD;
7538 alu.dst.sel = ctx->temp_reg;
7539 alu.dst.chan = i;
7540 alu.dst.write = 1;
7541 alu.last = i == 1;
7542 if (src_loaded) {
7543 alu.src[0].sel = ctx->temp_reg;
7544 alu.src[0].chan = i;
7545 } else
7546 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7547 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
7548 alu.src[1].neg = 1;
7549 r = r600_bytecode_add_alu(ctx->bc, &alu);
7550 if (r)
7551 return r;
7552 }
7553 } else {
7554 /* execute a TXQ */
7555 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7556 tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
7557 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7558 tex.sampler_index_mode = sampler_index_mode;
7559 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7560 tex.resource_index_mode = sampler_index_mode;
7561 tex.dst_gpr = treg;
7562 tex.src_sel_x = 4;
7563 tex.src_sel_y = 4;
7564 tex.src_sel_z = 4;
7565 tex.src_sel_w = 4;
7566 tex.dst_sel_x = 0;
7567 tex.dst_sel_y = 1;
7568 tex.dst_sel_z = 7;
7569 tex.dst_sel_w = 7;
7570 r = r600_bytecode_add_tex(ctx->bc, &tex);
7571 if (r)
7572 return r;
7573
7574 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
7575 if (ctx->bc->chip_class == CAYMAN) {
7576 /* */
7577 for (i = 0; i < 2; i++) {
7578 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7579 alu.op = ALU_OP1_INT_TO_FLT;
7580 alu.dst.sel = treg;
7581 alu.dst.chan = i;
7582 alu.dst.write = 1;
7583 alu.src[0].sel = treg;
7584 alu.src[0].chan = i;
7585 alu.last = (i == 1) ? 1 : 0;
7586 r = r600_bytecode_add_alu(ctx->bc, &alu);
7587 if (r)
7588 return r;
7589 }
7590 for (j = 0; j < 2; j++) {
7591 for (i = 0; i < 3; i++) {
7592 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7593 alu.op = ALU_OP1_RECIP_IEEE;
7594 alu.src[0].sel = treg;
7595 alu.src[0].chan = j;
7596 alu.dst.sel = treg;
7597 alu.dst.chan = i;
7598 if (i == 2)
7599 alu.last = 1;
7600 if (i == j)
7601 alu.dst.write = 1;
7602 r = r600_bytecode_add_alu(ctx->bc, &alu);
7603 if (r)
7604 return r;
7605 }
7606 }
7607 } else {
7608 for (i = 0; i < 2; i++) {
7609 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7610 alu.op = ALU_OP1_INT_TO_FLT;
7611 alu.dst.sel = treg;
7612 alu.dst.chan = i;
7613 alu.dst.write = 1;
7614 alu.src[0].sel = treg;
7615 alu.src[0].chan = i;
7616 alu.last = 1;
7617 r = r600_bytecode_add_alu(ctx->bc, &alu);
7618 if (r)
7619 return r;
7620 }
7621 for (i = 0; i < 2; i++) {
7622 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7623 alu.op = ALU_OP1_RECIP_IEEE;
7624 alu.src[0].sel = treg;
7625 alu.src[0].chan = i;
7626 alu.dst.sel = treg;
7627 alu.dst.chan = i;
7628 alu.last = 1;
7629 alu.dst.write = 1;
7630 r = r600_bytecode_add_alu(ctx->bc, &alu);
7631 if (r)
7632 return r;
7633 }
7634 }
7635 for (i = 0; i < 2; i++) {
7636 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7637 alu.op = ALU_OP3_MULADD;
7638 alu.is_op3 = 1;
7639 alu.dst.sel = ctx->temp_reg;
7640 alu.dst.chan = i;
7641 alu.dst.write = 1;
7642 alu.last = i == 1;
7643 alu.src[0].sel = treg;
7644 alu.src[0].chan = i;
7645 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
7646 alu.src[1].neg = 1;
7647 if (src_loaded) {
7648 alu.src[2].sel = ctx->temp_reg;
7649 alu.src[2].chan = i;
7650 } else
7651 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
7652 r = r600_bytecode_add_alu(ctx->bc, &alu);
7653 if (r)
7654 return r;
7655 }
7656 }
7657 src_loaded = TRUE;
7658 src_gpr = ctx->temp_reg;
7659 }
7660 }
7661
7662 if (src_requires_loading && !src_loaded) {
7663 for (i = 0; i < 4; i++) {
7664 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7665 alu.op = ALU_OP1_MOV;
7666 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7667 alu.dst.sel = ctx->temp_reg;
7668 alu.dst.chan = i;
7669 if (i == 3)
7670 alu.last = 1;
7671 alu.dst.write = 1;
7672 r = r600_bytecode_add_alu(ctx->bc, &alu);
7673 if (r)
7674 return r;
7675 }
7676 src_loaded = TRUE;
7677 src_gpr = ctx->temp_reg;
7678 }
7679
7680 /* get offset values */
7681 if (inst->Texture.NumOffsets) {
7682 assert(inst->Texture.NumOffsets == 1);
7683
7684 /* The texture offset feature doesn't work with the TXF instruction
7685 * and must be emulated by adding the offset to the texture coordinates. */
7686 if (txf_add_offsets) {
7687 const struct tgsi_texture_offset *off = inst->TexOffsets;
7688
7689 switch (inst->Texture.Texture) {
7690 case TGSI_TEXTURE_3D:
7691 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7692 alu.op = ALU_OP2_ADD_INT;
7693 alu.src[0].sel = src_gpr;
7694 alu.src[0].chan = 2;
7695 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7696 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7697 alu.dst.sel = src_gpr;
7698 alu.dst.chan = 2;
7699 alu.dst.write = 1;
7700 alu.last = 1;
7701 r = r600_bytecode_add_alu(ctx->bc, &alu);
7702 if (r)
7703 return r;
7704 /* fall through */
7705
7706 case TGSI_TEXTURE_2D:
7707 case TGSI_TEXTURE_SHADOW2D:
7708 case TGSI_TEXTURE_RECT:
7709 case TGSI_TEXTURE_SHADOWRECT:
7710 case TGSI_TEXTURE_2D_ARRAY:
7711 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7712 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7713 alu.op = ALU_OP2_ADD_INT;
7714 alu.src[0].sel = src_gpr;
7715 alu.src[0].chan = 1;
7716 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7717 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7718 alu.dst.sel = src_gpr;
7719 alu.dst.chan = 1;
7720 alu.dst.write = 1;
7721 alu.last = 1;
7722 r = r600_bytecode_add_alu(ctx->bc, &alu);
7723 if (r)
7724 return r;
7725 /* fall through */
7726
7727 case TGSI_TEXTURE_1D:
7728 case TGSI_TEXTURE_SHADOW1D:
7729 case TGSI_TEXTURE_1D_ARRAY:
7730 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7731 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7732 alu.op = ALU_OP2_ADD_INT;
7733 alu.src[0].sel = src_gpr;
7734 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7735 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7736 alu.dst.sel = src_gpr;
7737 alu.dst.write = 1;
7738 alu.last = 1;
7739 r = r600_bytecode_add_alu(ctx->bc, &alu);
7740 if (r)
7741 return r;
7742 break;
7743 /* texture offsets do not apply to other texture targets */
7744 }
7745 } else {
7746 switch (inst->Texture.Texture) {
7747 case TGSI_TEXTURE_3D:
7748 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7749 /* fallthrough */
7750 case TGSI_TEXTURE_2D:
7751 case TGSI_TEXTURE_SHADOW2D:
7752 case TGSI_TEXTURE_RECT:
7753 case TGSI_TEXTURE_SHADOWRECT:
7754 case TGSI_TEXTURE_2D_ARRAY:
7755 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7756 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7757 /* fallthrough */
7758 case TGSI_TEXTURE_1D:
7759 case TGSI_TEXTURE_SHADOW1D:
7760 case TGSI_TEXTURE_1D_ARRAY:
7761 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7762 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7763 }
7764 }
7765 }
7766
7767 /* Obtain the sample index for reading a compressed MSAA color texture.
7768 * To read the FMASK, we use the ldfptr instruction, which tells us
7769 * where the samples are stored.
7770 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7771 * which is the identity mapping. Each nibble says which physical sample
7772 * should be fetched to get that sample.
7773 *
7774 * Assume src.z contains the sample index. It should be modified like this:
7775 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7776 * Then fetch the texel with src.
7777 */
7778 if (read_compressed_msaa) {
7779 unsigned sample_chan = 3;
7780 unsigned temp = r600_get_temp(ctx);
7781 assert(src_loaded);
7782
7783 /* temp.w = ldfptr() */
7784 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7785 tex.op = FETCH_OP_LD;
7786 tex.inst_mod = 1; /* to indicate this is ldfptr */
7787 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7788 tex.sampler_index_mode = sampler_index_mode;
7789 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7790 tex.resource_index_mode = sampler_index_mode;
7791 tex.src_gpr = src_gpr;
7792 tex.dst_gpr = temp;
7793 tex.dst_sel_x = 7; /* mask out these components */
7794 tex.dst_sel_y = 7;
7795 tex.dst_sel_z = 7;
7796 tex.dst_sel_w = 0; /* store X */
7797 tex.src_sel_x = 0;
7798 tex.src_sel_y = 1;
7799 tex.src_sel_z = 2;
7800 tex.src_sel_w = 3;
7801 tex.offset_x = offset_x;
7802 tex.offset_y = offset_y;
7803 tex.offset_z = offset_z;
7804 r = r600_bytecode_add_tex(ctx->bc, &tex);
7805 if (r)
7806 return r;
7807
7808 /* temp.x = sample_index*4 */
7809 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7810 alu.op = ALU_OP2_MULLO_INT;
7811 alu.src[0].sel = src_gpr;
7812 alu.src[0].chan = sample_chan;
7813 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7814 alu.src[1].value = 4;
7815 alu.dst.sel = temp;
7816 alu.dst.chan = 0;
7817 alu.dst.write = 1;
7818 r = emit_mul_int_op(ctx->bc, &alu);
7819 if (r)
7820 return r;
7821
7822 /* sample_index = temp.w >> temp.x */
7823 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7824 alu.op = ALU_OP2_LSHR_INT;
7825 alu.src[0].sel = temp;
7826 alu.src[0].chan = 3;
7827 alu.src[1].sel = temp;
7828 alu.src[1].chan = 0;
7829 alu.dst.sel = src_gpr;
7830 alu.dst.chan = sample_chan;
7831 alu.dst.write = 1;
7832 alu.last = 1;
7833 r = r600_bytecode_add_alu(ctx->bc, &alu);
7834 if (r)
7835 return r;
7836
7837 /* sample_index & 0xF */
7838 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7839 alu.op = ALU_OP2_AND_INT;
7840 alu.src[0].sel = src_gpr;
7841 alu.src[0].chan = sample_chan;
7842 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7843 alu.src[1].value = 0xF;
7844 alu.dst.sel = src_gpr;
7845 alu.dst.chan = sample_chan;
7846 alu.dst.write = 1;
7847 alu.last = 1;
7848 r = r600_bytecode_add_alu(ctx->bc, &alu);
7849 if (r)
7850 return r;
7851 #if 0
7852 /* visualize the FMASK */
7853 for (i = 0; i < 4; i++) {
7854 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7855 alu.op = ALU_OP1_INT_TO_FLT;
7856 alu.src[0].sel = src_gpr;
7857 alu.src[0].chan = sample_chan;
7858 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7859 alu.dst.chan = i;
7860 alu.dst.write = 1;
7861 alu.last = 1;
7862 r = r600_bytecode_add_alu(ctx->bc, &alu);
7863 if (r)
7864 return r;
7865 }
7866 return 0;
7867 #endif
7868 }
7869
7870 /* does this shader want a num layers from TXQ for a cube array? */
7871 if (has_txq_cube_array_z) {
7872 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7873
7874 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7875 alu.op = ALU_OP1_MOV;
7876
7877 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7878 if (ctx->bc->chip_class >= EVERGREEN) {
7879 /* with eg each dword is number of cubes */
7880 alu.src[0].sel += id / 4;
7881 alu.src[0].chan = id % 4;
7882 } else {
7883 /* r600 we have them at channel 2 of the second dword */
7884 alu.src[0].sel += (id * 2) + 1;
7885 alu.src[0].chan = 2;
7886 }
7887 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7888 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7889 alu.last = 1;
7890 r = r600_bytecode_add_alu(ctx->bc, &alu);
7891 if (r)
7892 return r;
7893 /* disable writemask from texture instruction */
7894 inst->Dst[0].Register.WriteMask &= ~4;
7895 }
7896
7897 opcode = ctx->inst_info->op;
7898 if (opcode == FETCH_OP_GATHER4 &&
7899 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7900 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7901 opcode = FETCH_OP_GATHER4_O;
7902
7903 /* GATHER4_O/GATHER4_C_O use offset values loaded by
7904 SET_TEXTURE_OFFSETS instruction. The immediate offset values
7905 encoded in the instruction are ignored. */
7906 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7907 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7908 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7909 tex.sampler_index_mode = sampler_index_mode;
7910 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7911 tex.resource_index_mode = sampler_index_mode;
7912
7913 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7914 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7915 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7916 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7917 tex.src_sel_w = 4;
7918
7919 tex.dst_sel_x = 7;
7920 tex.dst_sel_y = 7;
7921 tex.dst_sel_z = 7;
7922 tex.dst_sel_w = 7;
7923
7924 r = r600_bytecode_add_tex(ctx->bc, &tex);
7925 if (r)
7926 return r;
7927 }
7928
7929 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7930 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7931 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7932 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7933 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7934 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7935 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7936 switch (opcode) {
7937 case FETCH_OP_SAMPLE:
7938 opcode = FETCH_OP_SAMPLE_C;
7939 break;
7940 case FETCH_OP_SAMPLE_L:
7941 opcode = FETCH_OP_SAMPLE_C_L;
7942 break;
7943 case FETCH_OP_SAMPLE_LB:
7944 opcode = FETCH_OP_SAMPLE_C_LB;
7945 break;
7946 case FETCH_OP_SAMPLE_G:
7947 opcode = FETCH_OP_SAMPLE_C_G;
7948 break;
7949 /* Texture gather variants */
7950 case FETCH_OP_GATHER4:
7951 opcode = FETCH_OP_GATHER4_C;
7952 break;
7953 case FETCH_OP_GATHER4_O:
7954 opcode = FETCH_OP_GATHER4_C_O;
7955 break;
7956 }
7957 }
7958
7959 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7960 tex.op = opcode;
7961
7962 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7963 tex.sampler_index_mode = sampler_index_mode;
7964 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7965 tex.resource_index_mode = sampler_index_mode;
7966 tex.src_gpr = src_gpr;
7967 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7968
7969 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7970 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7971 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7972 }
7973
7974 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7975 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7976 tex.inst_mod = texture_component_select;
7977
7978 if (ctx->bc->chip_class == CAYMAN) {
7979 /* GATHER4 result order is different from TGSI TG4 */
7980 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7981 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7982 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7983 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7984 } else {
7985 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7986 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7987 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7988 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7989 }
7990 }
7991 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7992 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7993 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7994 tex.dst_sel_z = 7;
7995 tex.dst_sel_w = 7;
7996 }
7997 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7998 tex.dst_sel_x = 3;
7999 tex.dst_sel_y = 7;
8000 tex.dst_sel_z = 7;
8001 tex.dst_sel_w = 7;
8002 }
8003 else {
8004 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8005 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8006 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8007 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8008 }
8009
8010
8011 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8012 tex.src_sel_x = 4;
8013 tex.src_sel_y = 4;
8014 tex.src_sel_z = 4;
8015 tex.src_sel_w = 4;
8016 } else if (src_loaded) {
8017 tex.src_sel_x = 0;
8018 tex.src_sel_y = 1;
8019 tex.src_sel_z = 2;
8020 tex.src_sel_w = 3;
8021 } else {
8022 tex.src_sel_x = ctx->src[0].swizzle[0];
8023 tex.src_sel_y = ctx->src[0].swizzle[1];
8024 tex.src_sel_z = ctx->src[0].swizzle[2];
8025 tex.src_sel_w = ctx->src[0].swizzle[3];
8026 tex.src_rel = ctx->src[0].rel;
8027 }
8028
8029 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8030 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8031 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8032 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8033 tex.src_sel_x = 1;
8034 tex.src_sel_y = 0;
8035 tex.src_sel_z = 3;
8036 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8037 }
8038
8039 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8040 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8041 tex.coord_type_x = 1;
8042 tex.coord_type_y = 1;
8043 }
8044 tex.coord_type_z = 1;
8045 tex.coord_type_w = 1;
8046
8047 tex.offset_x = offset_x;
8048 tex.offset_y = offset_y;
8049 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8050 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8051 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8052 tex.offset_z = 0;
8053 }
8054 else {
8055 tex.offset_z = offset_z;
8056 }
8057
8058 /* Put the depth for comparison in W.
8059 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8060 * Some instructions expect the depth in Z. */
8061 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8062 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8063 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8064 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8065 opcode != FETCH_OP_SAMPLE_C_L &&
8066 opcode != FETCH_OP_SAMPLE_C_LB) {
8067 tex.src_sel_w = tex.src_sel_z;
8068 }
8069
8070 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8071 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8072 if (opcode == FETCH_OP_SAMPLE_C_L ||
8073 opcode == FETCH_OP_SAMPLE_C_LB) {
8074 /* the array index is read from Y */
8075 tex.coord_type_y = 0;
8076 } else {
8077 /* the array index is read from Z */
8078 tex.coord_type_z = 0;
8079 tex.src_sel_z = tex.src_sel_y;
8080 }
8081 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8082 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8083 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8084 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8085 (ctx->bc->chip_class >= EVERGREEN)))
8086 /* the array index is read from Z */
8087 tex.coord_type_z = 0;
8088
8089 /* mask unused source components */
8090 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8091 switch (inst->Texture.Texture) {
8092 case TGSI_TEXTURE_2D:
8093 case TGSI_TEXTURE_RECT:
8094 tex.src_sel_z = 7;
8095 tex.src_sel_w = 7;
8096 break;
8097 case TGSI_TEXTURE_1D_ARRAY:
8098 tex.src_sel_y = 7;
8099 tex.src_sel_w = 7;
8100 break;
8101 case TGSI_TEXTURE_1D:
8102 tex.src_sel_y = 7;
8103 tex.src_sel_z = 7;
8104 tex.src_sel_w = 7;
8105 break;
8106 }
8107 }
8108
8109 r = r600_bytecode_add_tex(ctx->bc, &tex);
8110 if (r)
8111 return r;
8112
8113 /* add shadow ambient support - gallium doesn't do it yet */
8114 return 0;
8115 }
8116
8117 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8118 struct tgsi_full_src_register *src)
8119 {
8120 unsigned i;
8121
8122 if (src->Register.Indirect) {
8123 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8124 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
8125 return ctx->shader->atomics[i].hw_idx;
8126 }
8127 } else {
8128 uint32_t index = src->Register.Index;
8129 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8130 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8131 continue;
8132 if (index > ctx->shader->atomics[i].end)
8133 continue;
8134 if (index < ctx->shader->atomics[i].start)
8135 continue;
8136 uint32_t offset = (index - ctx->shader->atomics[i].start);
8137 return ctx->shader->atomics[i].hw_idx + offset;
8138 }
8139 }
8140 assert(0);
8141 return -1;
8142 }
8143
8144 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
8145 int *uav_id_p, int *uav_index_mode_p)
8146 {
8147 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8148 int uav_id, uav_index_mode = 0;
8149 int r;
8150 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8151
8152 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
8153
8154 if (inst->Src[0].Register.Indirect) {
8155 if (is_cm) {
8156 struct r600_bytecode_alu alu;
8157 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8158 alu.op = ALU_OP2_LSHL_INT;
8159 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
8160 alu.src[0].chan = 0;
8161 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8162 alu.src[1].value = 2;
8163 alu.dst.sel = ctx->temp_reg;
8164 alu.dst.chan = 0;
8165 alu.dst.write = 1;
8166 alu.last = 1;
8167 r = r600_bytecode_add_alu(ctx->bc, &alu);
8168 if (r)
8169 return r;
8170
8171 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8172 ctx->temp_reg, 0,
8173 ctx->temp_reg, 0,
8174 V_SQ_ALU_SRC_LITERAL, uav_id * 4);
8175 if (r)
8176 return r;
8177 } else
8178 uav_index_mode = 2;
8179 } else if (is_cm) {
8180 r = single_alu_op2(ctx, ALU_OP1_MOV,
8181 ctx->temp_reg, 0,
8182 V_SQ_ALU_SRC_LITERAL, uav_id * 4,
8183 0, 0);
8184 if (r)
8185 return r;
8186 }
8187 *uav_id_p = uav_id;
8188 *uav_index_mode_p = uav_index_mode;
8189 return 0;
8190 }
8191
8192 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
8193 {
8194 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8195 int r;
8196 struct r600_bytecode_gds gds;
8197 int uav_id = 0;
8198 int uav_index_mode = 0;
8199 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8200
8201 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8202 if (r)
8203 return r;
8204
8205 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8206 gds.op = FETCH_OP_GDS_READ_RET;
8207 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8208 gds.uav_id = is_cm ? 0 : uav_id;
8209 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8210 gds.src_gpr = ctx->temp_reg;
8211 gds.src_sel_x = (is_cm) ? 0 : 4;
8212 gds.src_sel_y = 4;
8213 gds.src_sel_z = 4;
8214 gds.dst_sel_x = 0;
8215 gds.dst_sel_y = 7;
8216 gds.dst_sel_z = 7;
8217 gds.dst_sel_w = 7;
8218 gds.src_gpr2 = 0;
8219 gds.alloc_consume = !is_cm;
8220 r = r600_bytecode_add_gds(ctx->bc, &gds);
8221 if (r)
8222 return r;
8223
8224 ctx->bc->cf_last->vpm = 1;
8225 return 0;
8226 }
8227
8228 /* this fixes up 1D arrays properly */
8229 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8230 {
8231 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8232 int r, i;
8233 struct r600_bytecode_alu alu;
8234 int temp_reg = r600_get_temp(ctx);
8235
8236 for (i = 0; i < 4; i++) {
8237 bool def_val = true, write_zero = false;
8238 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8239 alu.op = ALU_OP1_MOV;
8240 alu.dst.sel = temp_reg;
8241 alu.dst.chan = i;
8242
8243 switch (inst->Memory.Texture) {
8244 case TGSI_TEXTURE_BUFFER:
8245 case TGSI_TEXTURE_1D:
8246 if (i == 1 || i == 2 || i == 3) {
8247 write_zero = true;
8248 }
8249 break;
8250 case TGSI_TEXTURE_1D_ARRAY:
8251 if (i == 1 || i == 3)
8252 write_zero = true;
8253 else if (i == 2) {
8254 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8255 def_val = false;
8256 }
8257 break;
8258 case TGSI_TEXTURE_2D:
8259 if (i == 2 || i == 3)
8260 write_zero = true;
8261 break;
8262 default:
8263 if (i == 3)
8264 write_zero = true;
8265 break;
8266 }
8267
8268 if (write_zero) {
8269 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8270 alu.src[0].value = 0;
8271 } else if (def_val) {
8272 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8273 }
8274
8275 if (i == 3)
8276 alu.last = 1;
8277 alu.dst.write = 1;
8278 r = r600_bytecode_add_alu(ctx->bc, &alu);
8279 if (r)
8280 return r;
8281 }
8282 *idx_gpr = temp_reg;
8283 return 0;
8284 }
8285
8286 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8287 int temp_reg)
8288 {
8289 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8290 int r;
8291 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8292 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8293 r = single_alu_op2(ctx, ALU_OP1_MOV,
8294 temp_reg, 0,
8295 V_SQ_ALU_SRC_LITERAL, value >> 2,
8296 0, 0);
8297 if (r)
8298 return r;
8299 } else {
8300 struct r600_bytecode_alu alu;
8301 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8302 alu.op = ALU_OP2_LSHR_INT;
8303 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8304 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8305 alu.src[1].value = 2;
8306 alu.dst.sel = temp_reg;
8307 alu.dst.write = 1;
8308 alu.last = 1;
8309 r = r600_bytecode_add_alu(ctx->bc, &alu);
8310 if (r)
8311 return r;
8312 }
8313 return 0;
8314 }
8315
8316 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8317 {
8318 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8319 /* have to work out the offset into the RAT immediate return buffer */
8320 struct r600_bytecode_vtx vtx;
8321 struct r600_bytecode_cf *cf;
8322 int r;
8323 int temp_reg = r600_get_temp(ctx);
8324 unsigned rat_index_mode;
8325 unsigned base;
8326
8327 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8328 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8329
8330 r = load_buffer_coord(ctx, 1, temp_reg);
8331 if (r)
8332 return r;
8333 ctx->bc->cf_last->barrier = 1;
8334 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8335 vtx.op = FETCH_OP_VFETCH;
8336 vtx.buffer_id = inst->Src[0].Register.Index + base;
8337 vtx.buffer_index_mode = rat_index_mode;
8338 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8339 vtx.src_gpr = temp_reg;
8340 vtx.src_sel_x = 0;
8341 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8342 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
8343 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
8344 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
8345 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
8346 vtx.num_format_all = 1;
8347 vtx.format_comp_all = 1;
8348 vtx.srf_mode_all = 0;
8349
8350 if (inst->Dst[0].Register.WriteMask & 8) {
8351 vtx.data_format = FMT_32_32_32_32;
8352 vtx.use_const_fields = 0;
8353 } else if (inst->Dst[0].Register.WriteMask & 4) {
8354 vtx.data_format = FMT_32_32_32;
8355 vtx.use_const_fields = 0;
8356 } else if (inst->Dst[0].Register.WriteMask & 2) {
8357 vtx.data_format = FMT_32_32;
8358 vtx.use_const_fields = 0;
8359 } else {
8360 vtx.data_format = FMT_32;
8361 vtx.use_const_fields = 0;
8362 }
8363
8364 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8365 if (r)
8366 return r;
8367 cf = ctx->bc->cf_last;
8368 cf->barrier = 1;
8369 return 0;
8370 }
8371
8372 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8373 {
8374 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8375 /* have to work out the offset into the RAT immediate return buffer */
8376 struct r600_bytecode_vtx vtx;
8377 struct r600_bytecode_cf *cf;
8378 int r;
8379 int idx_gpr;
8380 unsigned format, num_format, format_comp, endian;
8381 const struct util_format_description *desc;
8382 unsigned rat_index_mode;
8383 unsigned immed_base;
8384
8385 r = load_thread_id_gpr(ctx);
8386 if (r)
8387 return r;
8388
8389 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8390
8391 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8392 r = load_index_src(ctx, 1, &idx_gpr);
8393 if (r)
8394 return r;
8395
8396 if (rat_index_mode)
8397 egcm_load_index_reg(ctx->bc, 1, false);
8398
8399 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8400 cf = ctx->bc->cf_last;
8401
8402 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8403 cf->rat.inst = V_RAT_INST_NOP_RTN;
8404 cf->rat.index_mode = rat_index_mode;
8405 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8406 cf->output.gpr = ctx->thread_id_gpr;
8407 cf->output.index_gpr = idx_gpr;
8408 cf->output.comp_mask = 0xf;
8409 cf->output.burst_count = 1;
8410 cf->vpm = 1;
8411 cf->barrier = 1;
8412 cf->mark = 1;
8413 cf->output.elem_size = 0;
8414
8415 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8416 cf = ctx->bc->cf_last;
8417 cf->barrier = 1;
8418
8419 desc = util_format_description(inst->Memory.Format);
8420 r600_vertex_data_type(inst->Memory.Format,
8421 &format, &num_format, &format_comp, &endian);
8422 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8423 vtx.op = FETCH_OP_VFETCH;
8424 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8425 vtx.buffer_index_mode = rat_index_mode;
8426 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8427 vtx.src_gpr = ctx->thread_id_gpr;
8428 vtx.src_sel_x = 1;
8429 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8430 vtx.dst_sel_x = desc->swizzle[0];
8431 vtx.dst_sel_y = desc->swizzle[1];
8432 vtx.dst_sel_z = desc->swizzle[2];
8433 vtx.dst_sel_w = desc->swizzle[3];
8434 vtx.srf_mode_all = 1;
8435 vtx.data_format = format;
8436 vtx.num_format_all = num_format;
8437 vtx.format_comp_all = format_comp;
8438 vtx.endian = endian;
8439 vtx.offset = 0;
8440 vtx.mega_fetch_count = 3;
8441 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8442 if (r)
8443 return r;
8444 cf = ctx->bc->cf_last;
8445 cf->barrier = 1;
8446 return 0;
8447 }
8448
8449 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8450 {
8451 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8452 struct r600_bytecode_alu alu;
8453 int r;
8454 int temp_reg = r600_get_temp(ctx);
8455
8456 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8457 alu.op = ALU_OP1_MOV;
8458 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8459 alu.dst.sel = temp_reg;
8460 alu.dst.write = 1;
8461 alu.last = 1;
8462 r = r600_bytecode_add_alu(ctx->bc, &alu);
8463 if (r)
8464 return r;
8465
8466 r = do_lds_fetch_values(ctx, temp_reg,
8467 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8468 if (r)
8469 return r;
8470 return 0;
8471 }
8472
8473 static int tgsi_load(struct r600_shader_ctx *ctx)
8474 {
8475 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8476 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8477 return tgsi_load_rat(ctx);
8478 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8479 return tgsi_load_gds(ctx);
8480 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8481 return tgsi_load_buffer(ctx);
8482 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8483 return tgsi_load_lds(ctx);
8484 return 0;
8485 }
8486
/* Handle a TGSI STORE to a buffer (SSBO) through the RAT interface.
 * Buffer RATs are placed after the image RATs, hence the
 * file_count[TGSI_FILE_IMAGE] offset on the RAT id.  Each enabled
 * destination channel is written with its own one-component
 * STORE_TYPED, using a per-channel element index of coord + i. */
static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	int r, i;
	unsigned rat_index_mode;
	int lasti;
	int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);

	/* Fetch the destination element coordinate into treg2. */
	r = load_buffer_coord(ctx, 0, treg2);
	if (r)
		return r;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* Zero all four channels of the index temp before use. */
	for (i = 0; i <= 3; i++) {
		struct r600_bytecode_alu alu;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp_reg;
		alu.dst.chan = i;
		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.last = (i == 3);
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		struct r600_bytecode_alu alu;
		if (!((1 << i) & inst->Dst[0].Register.WriteMask))
			continue;

		/* Per-channel index: coordinate + channel number. */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   treg2, 0,
				   V_SQ_ALU_SRC_LITERAL, i);
		if (r)
			return r;

		/* Stage this channel's value in ctx->temp_reg.x. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* Emit one MEM_RAT STORE_TYPED per channel. */
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
		cf = ctx->bc->cf_last;

		/* Buffer RATs live after the image RATs. */
		cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
		cf->rat.inst = V_RAT_INST_STORE_TYPED;
		cf->rat.index_mode = rat_index_mode;
		cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		cf->output.gpr = ctx->temp_reg;
		cf->output.index_gpr = temp_reg;
		cf->output.comp_mask = 1; /* single component per store */
		cf->output.burst_count = 1;
		cf->vpm = 1;
		cf->barrier = 1;
		cf->output.elem_size = 0;
	}
	return 0;
}
8560
/* Handle a TGSI STORE to an image via MEM_RAT STORE_TYPED.
 * The stored value must reside in a GPR; anything that is not already
 * a TGSI temporary is copied into ctx->temp_reg first. */
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	/* Compute the image coordinate into idx_gpr. */
	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		/* Copy all four channels of the value into ctx->temp_reg. */
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf; /* write all four components */
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	return 0;
}
8617
/* Handle a TGSI STORE to compute shared memory (LDS).
 * Builds a per-channel byte address (base + 4*i) into temp_reg.i,
 * then writes adjacent channel pairs with LDS_WRITE_REL when the
 * write mask covers both (xy together, zw together), falling back to
 * single-dword LDS_WRITE otherwise. */
static int tgsi_store_lds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i, lasti;
	int write_mask = inst->Dst[0].Register.WriteMask;
	int temp_reg = r600_get_temp(ctx);

	/* LDS write */
	/* Base address (src[0].x) goes to temp_reg.x. */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp_reg.i = base + 4*i for every other written channel. */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 1; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* If both halves of a pair are written, emit one
		 * two-dword LDS_WRITE_REL and skip the next channel. */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;

			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
			alu.last = 1;
			alu.is_lds_idx_op = true;
			alu.lds_idx = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1; /* pair consumed, advance past channel i+1 */
			continue;
		}
		/* Single-channel write. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;

		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

		alu.last = 1;
		alu.is_lds_idx_op = true;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
8686
8687 static int tgsi_store(struct r600_shader_ctx *ctx)
8688 {
8689 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8690 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
8691 return tgsi_store_buffer_rat(ctx);
8692 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
8693 return tgsi_store_lds(ctx);
8694 else
8695 return tgsi_store_rat(ctx);
8696 }
8697
/* Emit a RAT atomic (image or buffer) with return value.
 * Sequence: stage operand(s) in thread_id_gpr, issue MEM_RAT with the
 * atomic rat.inst, WAIT_ACK for completion, then VFETCH the returned
 * value out of the RAT immediate return buffer into the destination. */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;
	unsigned rat_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	rat_base = ctx->shader->rat_base;

	r = load_thread_id_gpr(ctx);
	if (r)
		return r;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
		/* Buffer resources/RATs are placed after the image ones. */
		immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];

		r = load_buffer_coord(ctx, 1, ctx->temp_reg);
		if (r)
			return r;
		idx_gpr = ctx->temp_reg;
	} else {
		r = load_index_src(ctx, 1, &idx_gpr);
		if (r)
			return r;
	}

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* cmpxchg takes two operands: src[3].x goes to
		 * thread_id_gpr.x, src[2].x to .w (.z on cayman). */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		if (ctx->bc->chip_class == CAYMAN)
			alu.dst.chan = 2;
		else
			alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* Single-operand atomics: src[2].x goes to thread_id_gpr.x. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;
	/* Wait for the atomic to complete before reading back. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
		/* Images read back using the declared memory format. */
		desc = util_format_description(inst->Memory.Format);
		r600_vertex_data_type(inst->Memory.Format,
				      &format, &num_format, &format_comp, &endian);
		vtx.dst_sel_x = desc->swizzle[0];
	} else {
		/* Buffers read back a raw 32-bit value. */
		format = FMT_32;
		num_format = 1;
		format_comp = 0;
		endian = 0;
		vtx.dst_sel_x = 0;
	}
	/* Fetch the atomic's return value into the destination GPR. */
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_y = 7; /* 7 = masked */
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}
8835
8836 static int get_gds_op(int opcode)
8837 {
8838 switch (opcode) {
8839 case TGSI_OPCODE_ATOMUADD:
8840 return FETCH_OP_GDS_ADD_RET;
8841 case TGSI_OPCODE_ATOMAND:
8842 return FETCH_OP_GDS_AND_RET;
8843 case TGSI_OPCODE_ATOMOR:
8844 return FETCH_OP_GDS_OR_RET;
8845 case TGSI_OPCODE_ATOMXOR:
8846 return FETCH_OP_GDS_XOR_RET;
8847 case TGSI_OPCODE_ATOMUMIN:
8848 return FETCH_OP_GDS_MIN_UINT_RET;
8849 case TGSI_OPCODE_ATOMUMAX:
8850 return FETCH_OP_GDS_MAX_UINT_RET;
8851 case TGSI_OPCODE_ATOMXCHG:
8852 return FETCH_OP_GDS_XCHG_RET;
8853 case TGSI_OPCODE_ATOMCAS:
8854 return FETCH_OP_GDS_CMP_XCHG_RET;
8855 default:
8856 return -1;
8857 }
8858 }
8859
/* Emit a GDS atomic for a HW atomic counter.
 * Cayman and evergreen lay the operands out in different source
 * channels (see the src_sel_* assignments below); cayman also ignores
 * the uav id/index-mode fields and the alloc_consume bit. */
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;
	bool is_cm = (ctx->bc->chip_class == CAYMAN);

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
	if (r)
		return r;

	/* cmpxchg needs its second operand staged in an extra channel
	 * (temp_reg.z on cayman, .y on evergreen). */
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
		if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
			int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = value;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = is_cm ? 2 : 1;
			r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		/* Adding a negative immediate is emitted as GDS_SUB of its
		 * absolute value. */
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = is_cm ? 1 : 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}


	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = is_cm ? 0 : uav_id;
	gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = 0;
	gds.src_sel_x = is_cm ? 0 : 4; /* 4 = unused on evergreen */
	gds.src_sel_y = is_cm ? 1 : 0;
	if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
		gds.src_sel_z = is_cm ? 2 : 1;
	else
		gds.src_sel_z = 7;
	/* Only .x of the result is written back; 7 = masked. */
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = !is_cm;

	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
8962
8963 static int get_lds_op(int opcode)
8964 {
8965 switch (opcode) {
8966 case TGSI_OPCODE_ATOMUADD:
8967 return LDS_OP2_LDS_ADD_RET;
8968 case TGSI_OPCODE_ATOMAND:
8969 return LDS_OP2_LDS_AND_RET;
8970 case TGSI_OPCODE_ATOMOR:
8971 return LDS_OP2_LDS_OR_RET;
8972 case TGSI_OPCODE_ATOMXOR:
8973 return LDS_OP2_LDS_XOR_RET;
8974 case TGSI_OPCODE_ATOMUMIN:
8975 return LDS_OP2_LDS_MIN_UINT_RET;
8976 case TGSI_OPCODE_ATOMUMAX:
8977 return LDS_OP2_LDS_MAX_UINT_RET;
8978 case TGSI_OPCODE_ATOMIMIN:
8979 return LDS_OP2_LDS_MIN_INT_RET;
8980 case TGSI_OPCODE_ATOMIMAX:
8981 return LDS_OP2_LDS_MAX_INT_RET;
8982 case TGSI_OPCODE_ATOMXCHG:
8983 return LDS_OP2_LDS_XCHG_RET;
8984 case TGSI_OPCODE_ATOMCAS:
8985 return LDS_OP3_LDS_CMP_XCHG_RET;
8986 default:
8987 return -1;
8988 }
8989 }
8990
8991 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
8992 {
8993 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8994 int lds_op = get_lds_op(inst->Instruction.Opcode);
8995 int r;
8996
8997 struct r600_bytecode_alu alu;
8998 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8999 alu.op = lds_op;
9000 alu.is_lds_idx_op = true;
9001 alu.last = 1;
9002 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
9003 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
9004 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
9005 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
9006 else
9007 alu.src[2].sel = V_SQ_ALU_SRC_0;
9008 r = r600_bytecode_add_alu(ctx->bc, &alu);
9009 if (r)
9010 return r;
9011
9012 /* then read from LDS_OQ_A_POP */
9013 memset(&alu, 0, sizeof(alu));
9014
9015 alu.op = ALU_OP1_MOV;
9016 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
9017 alu.src[0].chan = 0;
9018 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
9019 alu.dst.write = 1;
9020 alu.last = 1;
9021 r = r600_bytecode_add_alu(ctx->bc, &alu);
9022 if (r)
9023 return r;
9024
9025 return 0;
9026 }
9027
9028 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9029 {
9030 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9031 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9032 return tgsi_atomic_op_rat(ctx);
9033 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9034 return tgsi_atomic_op_gds(ctx);
9035 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9036 return tgsi_atomic_op_rat(ctx);
9037 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9038 return tgsi_atomic_op_lds(ctx);
9039 return 0;
9040 }
9041
/* Handle TGSI RESQ (resource size query).
 * Buffers (and buffer-textured images) are answered from the buffer
 * constants; cube arrays get their layer count (z component) from the
 * shader buffer-info constant buffer; everything else is a TXQ-style
 * texture instruction. */
static int tgsi_resq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned sampler_index_mode;
	struct r600_bytecode_tex tex;
	int r;
	boolean has_txq_cube_array_z = false;

	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
	    (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
		if (ctx->bc->chip_class < EVERGREEN)
			ctx->shader->uses_tex_buffers = true;
		unsigned eg_buffer_base = 0;
		eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
		/* Buffer resources follow the image resources. */
		if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
			eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* with eg each dword is either number of cubes */
		alu.src[0].sel += id / 4;
		alu.src[0].chan = id % 4;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	/* TXQ-style query; src sels of 4 feed zeros into the fetch. */
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	/* 7 = masked destination channel. */
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}
9114
/* Handle TGSI LRP: dst = src0 * src1 + (1 - src0) * src2.
 * Emitted in three passes over the written channels: 1 - src0, then
 * (1 - src0) * src2, then MULADD of src0 * src1 onto that.  A special
 * case handles src0 == 0.5 with a single ADD using the output
 * modifier to halve the sum. */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3; /* output modifier: dst = (src1 + src2) / 2 */
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]); /* 1 + (-src0) */
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */
	/* op3 sources can't carry an abs modifier directly; sources with
	 * abs are pre-copied into extra temps by tgsi_make_src_for_op3. */
	if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9226
/* Handle TGSI CMP: dst = (src0 < 0) ? src1 : src2, per channel.
 * Implemented with CNDGE and the true/false operands swapped.  When
 * src0 carries both abs and neg (-|x|), the value is non-negative only
 * at zero, so CNDE on the unmodified source is equivalent and the
 * modifiers are stripped. */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int temp_regs[3];
	unsigned op;

	if (ctx->src[0].abs && ctx->src[0].neg) {
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	/* op3 sources can't carry abs; sources with abs are copied to
	 * temps by tgsi_make_src_for_op3. */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		temp_regs[j] = 0;
		if (ctx->src[j].abs)
			temp_regs[j] = r600_get_temp(ctx);
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		/* note: src2 and src1 are swapped into the condition slots */
		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
		if (r)
			return r;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9277
9278 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9279 {
9280 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9281 struct r600_bytecode_alu alu;
9282 int i, r;
9283 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9284
9285 for (i = 0; i < lasti + 1; i++) {
9286 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9287 continue;
9288
9289 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9290 alu.op = ALU_OP3_CNDE_INT;
9291 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9292 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9293 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9294 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9295 alu.dst.chan = i;
9296 alu.dst.write = 1;
9297 alu.is_op3 = 1;
9298 if (i == lasti)
9299 alu.last = 1;
9300 r = r600_bytecode_add_alu(ctx->bc, &alu);
9301 if (r)
9302 return r;
9303 }
9304 return 0;
9305 }
9306
/* Handle TGSI EXP: x = 2^floor(src), y = src - floor(src),
 * z = 2^src, w = 1.0; results gathered into temp_reg and copied out by
 * tgsi_helper_copy.  On cayman the t-slot-only EXP_IEEE is replicated
 * across three vector slots (see the CAYMAN notes at the top of this
 * file). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* NOTE(review): alu is deliberately not re-memset below;
		 * only op/src/dst fields are overwritten on top of the
		 * FLOOR setup. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0; /* only .x is kept */
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) { /* only .z is kept */
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* copy the gathered temp channels to the real destination */
	return tgsi_helper_copy(ctx, inst);
}
9435
9436 static int tgsi_log(struct r600_shader_ctx *ctx)
9437 {
9438 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9439 struct r600_bytecode_alu alu;
9440 int r;
9441 unsigned i;
9442
9443 /* result.x = floor(log2(|src|)); */
9444 if (inst->Dst[0].Register.WriteMask & 1) {
9445 if (ctx->bc->chip_class == CAYMAN) {
9446 for (i = 0; i < 3; i++) {
9447 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9448
9449 alu.op = ALU_OP1_LOG_IEEE;
9450 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9451 r600_bytecode_src_set_abs(&alu.src[0]);
9452
9453 alu.dst.sel = ctx->temp_reg;
9454 alu.dst.chan = i;
9455 if (i == 0)
9456 alu.dst.write = 1;
9457 if (i == 2)
9458 alu.last = 1;
9459 r = r600_bytecode_add_alu(ctx->bc, &alu);
9460 if (r)
9461 return r;
9462 }
9463
9464 } else {
9465 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9466
9467 alu.op = ALU_OP1_LOG_IEEE;
9468 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9469 r600_bytecode_src_set_abs(&alu.src[0]);
9470
9471 alu.dst.sel = ctx->temp_reg;
9472 alu.dst.chan = 0;
9473 alu.dst.write = 1;
9474 alu.last = 1;
9475 r = r600_bytecode_add_alu(ctx->bc, &alu);
9476 if (r)
9477 return r;
9478 }
9479
9480 alu.op = ALU_OP1_FLOOR;
9481 alu.src[0].sel = ctx->temp_reg;
9482 alu.src[0].chan = 0;
9483
9484 alu.dst.sel = ctx->temp_reg;
9485 alu.dst.chan = 0;
9486 alu.dst.write = 1;
9487 alu.last = 1;
9488
9489 r = r600_bytecode_add_alu(ctx->bc, &alu);
9490 if (r)
9491 return r;
9492 }
9493
9494 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9495 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9496
9497 if (ctx->bc->chip_class == CAYMAN) {
9498 for (i = 0; i < 3; i++) {
9499 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9500
9501 alu.op = ALU_OP1_LOG_IEEE;
9502 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9503 r600_bytecode_src_set_abs(&alu.src[0]);
9504
9505 alu.dst.sel = ctx->temp_reg;
9506 alu.dst.chan = i;
9507 if (i == 1)
9508 alu.dst.write = 1;
9509 if (i == 2)
9510 alu.last = 1;
9511
9512 r = r600_bytecode_add_alu(ctx->bc, &alu);
9513 if (r)
9514 return r;
9515 }
9516 } else {
9517 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9518
9519 alu.op = ALU_OP1_LOG_IEEE;
9520 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9521 r600_bytecode_src_set_abs(&alu.src[0]);
9522
9523 alu.dst.sel = ctx->temp_reg;
9524 alu.dst.chan = 1;
9525 alu.dst.write = 1;
9526 alu.last = 1;
9527
9528 r = r600_bytecode_add_alu(ctx->bc, &alu);
9529 if (r)
9530 return r;
9531 }
9532
9533 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9534
9535 alu.op = ALU_OP1_FLOOR;
9536 alu.src[0].sel = ctx->temp_reg;
9537 alu.src[0].chan = 1;
9538
9539 alu.dst.sel = ctx->temp_reg;
9540 alu.dst.chan = 1;
9541 alu.dst.write = 1;
9542 alu.last = 1;
9543
9544 r = r600_bytecode_add_alu(ctx->bc, &alu);
9545 if (r)
9546 return r;
9547
9548 if (ctx->bc->chip_class == CAYMAN) {
9549 for (i = 0; i < 3; i++) {
9550 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9551 alu.op = ALU_OP1_EXP_IEEE;
9552 alu.src[0].sel = ctx->temp_reg;
9553 alu.src[0].chan = 1;
9554
9555 alu.dst.sel = ctx->temp_reg;
9556 alu.dst.chan = i;
9557 if (i == 1)
9558 alu.dst.write = 1;
9559 if (i == 2)
9560 alu.last = 1;
9561
9562 r = r600_bytecode_add_alu(ctx->bc, &alu);
9563 if (r)
9564 return r;
9565 }
9566 } else {
9567 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9568 alu.op = ALU_OP1_EXP_IEEE;
9569 alu.src[0].sel = ctx->temp_reg;
9570 alu.src[0].chan = 1;
9571
9572 alu.dst.sel = ctx->temp_reg;
9573 alu.dst.chan = 1;
9574 alu.dst.write = 1;
9575 alu.last = 1;
9576
9577 r = r600_bytecode_add_alu(ctx->bc, &alu);
9578 if (r)
9579 return r;
9580 }
9581
9582 if (ctx->bc->chip_class == CAYMAN) {
9583 for (i = 0; i < 3; i++) {
9584 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9585 alu.op = ALU_OP1_RECIP_IEEE;
9586 alu.src[0].sel = ctx->temp_reg;
9587 alu.src[0].chan = 1;
9588
9589 alu.dst.sel = ctx->temp_reg;
9590 alu.dst.chan = i;
9591 if (i == 1)
9592 alu.dst.write = 1;
9593 if (i == 2)
9594 alu.last = 1;
9595
9596 r = r600_bytecode_add_alu(ctx->bc, &alu);
9597 if (r)
9598 return r;
9599 }
9600 } else {
9601 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9602 alu.op = ALU_OP1_RECIP_IEEE;
9603 alu.src[0].sel = ctx->temp_reg;
9604 alu.src[0].chan = 1;
9605
9606 alu.dst.sel = ctx->temp_reg;
9607 alu.dst.chan = 1;
9608 alu.dst.write = 1;
9609 alu.last = 1;
9610
9611 r = r600_bytecode_add_alu(ctx->bc, &alu);
9612 if (r)
9613 return r;
9614 }
9615
9616 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9617
9618 alu.op = ALU_OP2_MUL;
9619
9620 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9621 r600_bytecode_src_set_abs(&alu.src[0]);
9622
9623 alu.src[1].sel = ctx->temp_reg;
9624 alu.src[1].chan = 1;
9625
9626 alu.dst.sel = ctx->temp_reg;
9627 alu.dst.chan = 1;
9628 alu.dst.write = 1;
9629 alu.last = 1;
9630
9631 r = r600_bytecode_add_alu(ctx->bc, &alu);
9632 if (r)
9633 return r;
9634 }
9635
9636 /* result.z = log2(|src|);*/
9637 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
9638 if (ctx->bc->chip_class == CAYMAN) {
9639 for (i = 0; i < 3; i++) {
9640 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9641
9642 alu.op = ALU_OP1_LOG_IEEE;
9643 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9644 r600_bytecode_src_set_abs(&alu.src[0]);
9645
9646 alu.dst.sel = ctx->temp_reg;
9647 if (i == 2)
9648 alu.dst.write = 1;
9649 alu.dst.chan = i;
9650 if (i == 2)
9651 alu.last = 1;
9652
9653 r = r600_bytecode_add_alu(ctx->bc, &alu);
9654 if (r)
9655 return r;
9656 }
9657 } else {
9658 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9659
9660 alu.op = ALU_OP1_LOG_IEEE;
9661 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9662 r600_bytecode_src_set_abs(&alu.src[0]);
9663
9664 alu.dst.sel = ctx->temp_reg;
9665 alu.dst.write = 1;
9666 alu.dst.chan = 2;
9667 alu.last = 1;
9668
9669 r = r600_bytecode_add_alu(ctx->bc, &alu);
9670 if (r)
9671 return r;
9672 }
9673 }
9674
9675 /* result.w = 1.0; */
9676 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
9677 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9678
9679 alu.op = ALU_OP1_MOV;
9680 alu.src[0].sel = V_SQ_ALU_SRC_1;
9681 alu.src[0].chan = 0;
9682
9683 alu.dst.sel = ctx->temp_reg;
9684 alu.dst.chan = 3;
9685 alu.dst.write = 1;
9686 alu.last = 1;
9687
9688 r = r600_bytecode_add_alu(ctx->bc, &alu);
9689 if (r)
9690 return r;
9691 }
9692
9693 return tgsi_helper_copy(ctx, inst);
9694 }
9695
9696 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
9697 {
9698 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9699 struct r600_bytecode_alu alu;
9700 int r;
9701 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9702 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
9703
9704 assert(inst->Dst[0].Register.Index < 3);
9705 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9706
9707 switch (inst->Instruction.Opcode) {
9708 case TGSI_OPCODE_ARL:
9709 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
9710 break;
9711 case TGSI_OPCODE_ARR:
9712 alu.op = ALU_OP1_FLT_TO_INT;
9713 break;
9714 case TGSI_OPCODE_UARL:
9715 alu.op = ALU_OP1_MOV;
9716 break;
9717 default:
9718 assert(0);
9719 return -1;
9720 }
9721
9722 for (i = 0; i <= lasti; ++i) {
9723 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9724 continue;
9725 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9726 alu.last = i == lasti;
9727 alu.dst.sel = reg;
9728 alu.dst.chan = i;
9729 alu.dst.write = 1;
9730 r = r600_bytecode_add_alu(ctx->bc, &alu);
9731 if (r)
9732 return r;
9733 }
9734
9735 if (inst->Dst[0].Register.Index > 0)
9736 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
9737 else
9738 ctx->bc->ar_loaded = 0;
9739
9740 return 0;
9741 }
/* Load the address register on r600/r700.
 *
 * The converted value is staged in ctx->bc->ar_reg; the actual MOVA into
 * AR is deferred until first use (ar_loaded is cleared at the end).
 * On these chips FLT_TO_INT executes only in the trans slot, so each such
 * instruction must terminate its ALU group (alu.last = TRUE).
 */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* ARL rounds toward minus infinity: FLOOR first, then an
		 * ordinary float-to-int conversion of the floored value. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		/* note: this second pass converts channels 0..lasti
		 * unconditionally, not just the write-masked ones */
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ARR rounds to nearest, which FLT_TO_INT provides directly */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer: plain copy */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force the deferred MOVA to be re-emitted on next AR use */
	ctx->bc->ar_loaded = 0;
	return 0;
}
9818
9819 static int tgsi_opdst(struct r600_shader_ctx *ctx)
9820 {
9821 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9822 struct r600_bytecode_alu alu;
9823 int i, r = 0;
9824
9825 for (i = 0; i < 4; i++) {
9826 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9827
9828 alu.op = ALU_OP2_MUL;
9829 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9830
9831 if (i == 0 || i == 3) {
9832 alu.src[0].sel = V_SQ_ALU_SRC_1;
9833 } else {
9834 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9835 }
9836
9837 if (i == 0 || i == 2) {
9838 alu.src[1].sel = V_SQ_ALU_SRC_1;
9839 } else {
9840 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9841 }
9842 if (i == 3)
9843 alu.last = 1;
9844 r = r600_bytecode_add_alu(ctx->bc, &alu);
9845 if (r)
9846 return r;
9847 }
9848 return 0;
9849 }
9850
9851 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
9852 struct r600_bytecode_alu_src *src)
9853 {
9854 struct r600_bytecode_alu alu;
9855 int r;
9856
9857 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9858 alu.op = opcode;
9859 alu.execute_mask = 1;
9860 alu.update_pred = 1;
9861
9862 alu.dst.sel = ctx->temp_reg;
9863 alu.dst.write = 1;
9864 alu.dst.chan = 0;
9865
9866 alu.src[0] = *src;
9867 alu.src[1].sel = V_SQ_ALU_SRC_0;
9868 alu.src[1].chan = 0;
9869
9870 alu.last = 1;
9871
9872 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
9873 if (r)
9874 return r;
9875 return 0;
9876 }
9877
/* Emit "pops" pops of the active-mask stack. When the preceding CF
 * instruction is an ALU clause, up to two pops can be folded into it by
 * rewriting it to ALU_POP_AFTER / ALU_POP2_AFTER, saving a CF slot;
 * otherwise an explicit CF_OP_POP is appended.
 */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop = pops already folded into cf_last; start from 3
		 * (an unfoldable sentinel) when cf_last is not an ALU clause */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* the clause is now "used up": later code must not
			 * append to it */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more than 2 total pops, or no ALU clause to fold
			 * into: need a standalone POP */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* POP's jump target: the instruction right after itself */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
9910
/* Recompute the worst-case hardware stack depth (in whole stack entries)
 * implied by the current push counters and record it in stack->max_entries.
 * Called after every callstack_push(); 'reason' identifies the kind of
 * push being accounted (FC_PUSH_VPM, FC_PUSH_WQM or FC_LOOP).
 */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM frames consume a full entry each; plain (VPM) pushes
	 * consume single sub-entry elements */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 * stack usage.
		 * (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 * PUSH instruction executed.
		 *
		 * NOTE: it seems we also need to reserve additional element in some
		 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 * then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole hardware entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
9975
9976 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
9977 {
9978 switch(reason) {
9979 case FC_PUSH_VPM:
9980 --ctx->bc->stack.push;
9981 assert(ctx->bc->stack.push >= 0);
9982 break;
9983 case FC_PUSH_WQM:
9984 --ctx->bc->stack.push_wqm;
9985 assert(ctx->bc->stack.push_wqm >= 0);
9986 break;
9987 case FC_LOOP:
9988 --ctx->bc->stack.loop;
9989 assert(ctx->bc->stack.loop >= 0);
9990 break;
9991 default:
9992 assert(0);
9993 break;
9994 }
9995 }
9996
9997 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
9998 {
9999 switch (reason) {
10000 case FC_PUSH_VPM:
10001 ++ctx->bc->stack.push;
10002 break;
10003 case FC_PUSH_WQM:
10004 ++ctx->bc->stack.push_wqm;
10005 case FC_LOOP:
10006 ++ctx->bc->stack.loop;
10007 break;
10008 default:
10009 assert(0);
10010 }
10011
10012 callstack_update_max_depth(ctx, reason);
10013 }
10014
/* Record the most recently emitted CF instruction as a "mid" entry (ELSE,
 * BREAK, CONTINUE, ...) of the flow-control frame at depth fc_sp, so its
 * branch target can be patched when the matching END instruction is seen.
 *
 * NOTE(review): the realloc() result is assigned straight back and not
 * checked; on allocation failure this dereferences NULL on the next line
 * (and the old array leaks). Confirm whether an OOM-abort policy covers
 * this before relying on it.
 */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	sp->mid = realloc((void *)sp->mid,
			sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}
10024
10025 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10026 {
10027 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10028 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10029 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10030 ctx->bc->fc_sp++;
10031 }
10032
10033 static void fc_poplevel(struct r600_shader_ctx *ctx)
10034 {
10035 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10036 free(sp->mid);
10037 sp->mid = NULL;
10038 sp->num_mid = 0;
10039 sp->start = NULL;
10040 sp->type = 0;
10041 ctx->bc->fc_sp--;
10042 }
10043
/* NOTE(review): dead scaffolding for subroutine RET/flag-based control flow,
 * disabled with #if 0. It has never compiled as written — emit_return() and
 * emit_jump_to_offset() contain stray closing parentheses — and the jump
 * offset computation is unfinished. It would need repair before enabling. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
10091
10092 static int emit_if(struct r600_shader_ctx *ctx, int opcode,
10093 struct r600_bytecode_alu_src *src)
10094 {
10095 int alu_type = CF_OP_ALU_PUSH_BEFORE;
10096
10097 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
10098 * LOOP_STARTxxx for nested loops may put the branch stack into a state
10099 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
10100 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
10101 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
10102 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
10103 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10104 alu_type = CF_OP_ALU;
10105 }
10106
10107 emit_logic_pred(ctx, opcode, alu_type, src);
10108
10109 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
10110
10111 fc_pushlevel(ctx, FC_IF);
10112
10113 callstack_push(ctx, FC_PUSH_VPM);
10114 return 0;
10115 }
10116
10117 static int tgsi_if(struct r600_shader_ctx *ctx)
10118 {
10119 struct r600_bytecode_alu_src alu_src;
10120 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10121
10122 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10123 }
10124
10125 static int tgsi_uif(struct r600_shader_ctx *ctx)
10126 {
10127 struct r600_bytecode_alu_src alu_src;
10128 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10129 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10130 }
10131
10132 static int tgsi_else(struct r600_shader_ctx *ctx)
10133 {
10134 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10135 ctx->bc->cf_last->pop_count = 1;
10136
10137 fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10138 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10139 return 0;
10140 }
10141
/* ENDIF: pop the active mask and patch the pending branch targets recorded
 * on the flow-control stack. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	/* With no ELSE, the IF's JUMP skips past the ENDIF; when an ELSE was
	 * emitted, tgsi_else() already retargeted the JUMP, so only the ELSE
	 * instruction (mid[0]) needs its target fixed up here. */
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
	} else {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
10161
10162 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10163 {
10164 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10165 * limited to 4096 iterations, like the other LOOP_* instructions. */
10166 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10167
10168 fc_pushlevel(ctx, FC_LOOP);
10169
10170 /* check stack depth */
10171 callstack_push(ctx, FC_LOOP);
10172 return 0;
10173 }
10174
10175 static int tgsi_endloop(struct r600_shader_ctx *ctx)
10176 {
10177 int i;
10178
10179 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10180
10181 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10182 R600_ERR("loop/endloop in shader code are not paired.\n");
10183 return -EINVAL;
10184 }
10185
10186 /* fixup loop pointers - from r600isa
10187 LOOP END points to CF after LOOP START,
10188 LOOP START point to CF after LOOP END
10189 BRK/CONT point to LOOP END CF
10190 */
10191 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10192
10193 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10194
10195 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10196 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10197 }
10198 /* XXX add LOOPRET support */
10199 fc_poplevel(ctx);
10200 callstack_pop(ctx, FC_LOOP);
10201 return 0;
10202 }
10203
10204 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10205 {
10206 unsigned int fscp;
10207
10208 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10209 {
10210 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10211 break;
10212 }
10213
10214 if (fscp == 0) {
10215 R600_ERR("Break not inside loop/endloop pair\n");
10216 return -EINVAL;
10217 }
10218
10219 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10220
10221 fc_set_mid(ctx, fscp - 1);
10222
10223 return 0;
10224 }
10225
/* Translate geometry-shader EMIT / ENDPRIM (CF_OP_EMIT_VERTEX /
 * CF_OP_CUT_VERTEX). Src[0] selects the output stream; it is read from
 * the literal pool, i.e. the operand is expected to be an immediate. */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	/* flush the vertex being assembled to the GS ring before EMIT */
	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}
10243
10244 static int tgsi_umad(struct r600_shader_ctx *ctx)
10245 {
10246 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10247 struct r600_bytecode_alu alu;
10248 int i, j, r;
10249 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10250
10251 /* src0 * src1 */
10252 for (i = 0; i < lasti + 1; i++) {
10253 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10254 continue;
10255
10256 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10257
10258 alu.dst.chan = i;
10259 alu.dst.sel = ctx->temp_reg;
10260 alu.dst.write = 1;
10261
10262 alu.op = ALU_OP2_MULLO_UINT;
10263 for (j = 0; j < 2; j++) {
10264 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10265 }
10266
10267 alu.last = 1;
10268 r = emit_mul_int_op(ctx->bc, &alu);
10269 if (r)
10270 return r;
10271 }
10272
10273
10274 for (i = 0; i < lasti + 1; i++) {
10275 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10276 continue;
10277
10278 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10279 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10280
10281 alu.op = ALU_OP2_ADD_INT;
10282
10283 alu.src[0].sel = ctx->temp_reg;
10284 alu.src[0].chan = i;
10285
10286 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10287 if (i == lasti) {
10288 alu.last = 1;
10289 }
10290 r = r600_bytecode_add_alu(ctx->bc, &alu);
10291 if (r)
10292 return r;
10293 }
10294 return 0;
10295 }
10296
/* PK2H: convert src.x and src.y to half floats and pack them into one
 * 32-bit word (y in the high 16 bits), replicated to each written
 * destination channel. */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* deliberately reuse the same alu setup: only the destination channel
	 * and source component change, and this second conversion closes the
	 * two-instruction ALU group (alu.last) */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst = temp.y * 0x10000 + temp.x — equivalent to (temp.y << 16) |
	 * temp.x since both halves fit in 16 bits, expressed as MULADD_UINT24 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10344
10345 static int tgsi_up2h(struct r600_shader_ctx *ctx)
10346 {
10347 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10348 struct r600_bytecode_alu alu;
10349 int r, i;
10350 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10351
10352 /* temp.x = src.x */
10353 /* note: no need to mask out the high bits */
10354 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10355 alu.op = ALU_OP1_MOV;
10356 alu.dst.chan = 0;
10357 alu.dst.sel = ctx->temp_reg;
10358 alu.dst.write = 1;
10359 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10360 r = r600_bytecode_add_alu(ctx->bc, &alu);
10361 if (r)
10362 return r;
10363
10364 /* temp.y = src.x >> 16 */
10365 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10366 alu.op = ALU_OP2_LSHR_INT;
10367 alu.dst.chan = 1;
10368 alu.dst.sel = ctx->temp_reg;
10369 alu.dst.write = 1;
10370 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10371 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10372 alu.src[1].value = 16;
10373 alu.last = 1;
10374 r = r600_bytecode_add_alu(ctx->bc, &alu);
10375 if (r)
10376 return r;
10377
10378 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10379 for (i = 0; i < lasti + 1; i++) {
10380 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10381 continue;
10382 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10383 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10384 alu.op = ALU_OP1_FLT16_TO_FLT32;
10385 alu.src[0].sel = ctx->temp_reg;
10386 alu.src[0].chan = i % 2;
10387 alu.last = i == lasti;
10388 r = r600_bytecode_add_alu(ctx->bc, &alu);
10389 if (r)
10390 return r;
10391 }
10392
10393 return 0;
10394 }
10395
/* BFE (bit field extract): dst = (src0 >> src1) & ((1 << src2) - 1), with
 * the GLSL requirement that a width (src2) of 32 or more returns src0
 * unchanged. The hardware op3 BFE result is patched afterwards with a
 * SETGE/CNDE pair to honour that special case. */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	/* When the destination aliases src0 or src2, route the raw BFE result
	 * through a scratch register so the fix-up pass below still reads the
	 * original, unclobbered source values. */
	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	/* emit the plain hardware BFE (into dst, or the scratch reg) */
	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp = (width >= 32) */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = temp ? src0 : bfe_result
	 * (CNDE_INT selects src[1] when src[0] == 0, src[2] otherwise) */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
10454
10455 static int tgsi_clock(struct r600_shader_ctx *ctx)
10456 {
10457 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10458 struct r600_bytecode_alu alu;
10459 int r;
10460
10461 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10462 alu.op = ALU_OP1_MOV;
10463 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10464 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10465 r = r600_bytecode_add_alu(ctx->bc, &alu);
10466 if (r)
10467 return r;
10468 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10469 alu.op = ALU_OP1_MOV;
10470 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10471 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10472 r = r600_bytecode_add_alu(ctx->bc, &alu);
10473 if (r)
10474 return r;
10475 return 0;
10476 }
10477
/* Emit a 64-bit add or subtract. Each operand is a low/high pair of 32-bit
 * channels: (src0_chan, src0_chan+1) of src0_sel and likewise for src1.
 * The result lands in treg channels 0 (low) and 1 (high); channel 2 is
 * scratch for the carry/borrow.
 *
 * op is ALU_OP2_ADD_INT or ALU_OP2_SUB_INT; the matching carry opcode
 * (ADDC_UINT / SUBB_UINT) is derived from it.
 */
static int emit_u64add(struct r600_shader_ctx *ctx, int op,
		       int treg,
		       int src0_sel, int src0_chan,
		       int src1_sel, int src1_chan)
{
	struct r600_bytecode_alu alu;
	int r;
	int opc;

	if (op == ALU_OP2_ADD_INT)
		opc = ALU_OP2_ADDC_UINT;
	else
		opc = ALU_OP2_SUBB_UINT;

	/* treg.x = lo(src0) op lo(src1) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op; ;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = hi(src0) op hi(src1), carry folded in below */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 1;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 1;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.z = carry/borrow out of the low-dword operation
	 * (closes the ALU group) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	alu.src[0].sel = src0_sel;
	alu.src[0].chan = src0_chan + 0;
	alu.src[1].sel = src1_sel;
	alu.src[1].chan = src1_chan + 0;
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* treg.y = treg.y op carry — fold the carry/borrow into the high dword */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
10550
/* U64ADD (evergreen/cayman): 64-bit add of two operands held as
 * (x = low dword, y = high dword) pairs. A negated second source is
 * lowered to a subtract — the neg flag is consumed here by switching to
 * SUB_INT/SUBB_UINT and cleared on the per-channel sources so it is not
 * applied twice. */
static int egcm_u64add(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int treg = ctx->temp_reg;
	int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;

	if (ctx->src[1].neg) {
		op = ALU_OP2_SUB_INT;
		opc = ALU_OP2_SUBB_UINT;
	}
	/* temp.x = src0.lo op src1.lo */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op; ;
	alu.dst.sel = treg;
	alu.dst.chan = 0;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src0.hi op src1.hi (carry applied below) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.dst.sel = treg;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = carry/borrow out of the low dword (closes the group) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = opc ;
	alu.dst.sel = treg;
	alu.dst.chan = 2;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
	alu.src[1].neg = 0;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = temp.y op carry */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 1;
	alu.src[1].sel = treg;
	alu.src[1].chan = 2;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* dst.x = temp.x */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = treg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
10622
/* 64-bit multiply, low 64 bits of the product:
 *   result.x = low32(a.x * b.x)
 *   result.y = high32(a.x * b.x) + low32(a.x * b.y) + low32(a.y * b.x)
 */
10627 static int egcm_u64mul(struct r600_shader_ctx *ctx)
10628 {
10629 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10630 struct r600_bytecode_alu alu;
10631 int r;
10632 int treg = ctx->temp_reg;
10633
10634 /* temp.x = mul_lo a.x, b.x */
10635 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10636 alu.op = ALU_OP2_MULLO_UINT;
10637 alu.dst.sel = treg;
10638 alu.dst.chan = 0;
10639 alu.dst.write = 1;
10640 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10641 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10642 r = emit_mul_int_op(ctx->bc, &alu);
10643 if (r)
10644 return r;
10645
10646 /* temp.y = mul_hi a.x, b.x */
10647 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10648 alu.op = ALU_OP2_MULHI_UINT;
10649 alu.dst.sel = treg;
10650 alu.dst.chan = 1;
10651 alu.dst.write = 1;
10652 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10653 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10654 r = emit_mul_int_op(ctx->bc, &alu);
10655 if (r)
10656 return r;
10657
10658 /* temp.z = mul a.x, b.y */
10659 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10660 alu.op = ALU_OP2_MULLO_UINT;
10661 alu.dst.sel = treg;
10662 alu.dst.chan = 2;
10663 alu.dst.write = 1;
10664 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10665 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
10666 r = emit_mul_int_op(ctx->bc, &alu);
10667 if (r)
10668 return r;
10669
10670 /* temp.w = mul a.y, b.x */
10671 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10672 alu.op = ALU_OP2_MULLO_UINT;
10673 alu.dst.sel = treg;
10674 alu.dst.chan = 3;
10675 alu.dst.write = 1;
10676 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10677 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10678 r = emit_mul_int_op(ctx->bc, &alu);
10679 if (r)
10680 return r;
10681
10682 /* temp.z = temp.z + temp.w */
10683 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10684 alu.op = ALU_OP2_ADD_INT;
10685 alu.dst.sel = treg;
10686 alu.dst.chan = 2;
10687 alu.dst.write = 1;
10688 alu.src[0].sel = treg;
10689 alu.src[0].chan = 2;
10690 alu.src[1].sel = treg;
10691 alu.src[1].chan = 3;
10692 alu.last = 1;
10693 r = r600_bytecode_add_alu(ctx->bc, &alu);
10694 if (r)
10695 return r;
10696
10697 /* temp.y = temp.y + temp.z */
10698 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10699 alu.op = ALU_OP2_ADD_INT;
10700 alu.dst.sel = treg;
10701 alu.dst.chan = 1;
10702 alu.dst.write = 1;
10703 alu.src[0].sel = treg;
10704 alu.src[0].chan = 1;
10705 alu.src[1].sel = treg;
10706 alu.src[1].chan = 2;
10707 alu.last = 1;
10708 r = r600_bytecode_add_alu(ctx->bc, &alu);
10709 if (r)
10710 return r;
10711
10712 /* dst.x = temp.x */
10713 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10714 alu.op = ALU_OP1_MOV;
10715 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10716 alu.src[0].sel = treg;
10717 alu.src[0].chan = 0;
10718 r = r600_bytecode_add_alu(ctx->bc, &alu);
10719 if (r)
10720 return r;
10721
10722 /* dst.y = temp.y */
10723 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10724 alu.op = ALU_OP1_MOV;
10725 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10726 alu.src[0].sel = treg;
10727 alu.src[0].chan = 1;
10728 alu.last = 1;
10729 r = r600_bytecode_add_alu(ctx->bc, &alu);
10730 if (r)
10731 return r;
10732
10733 return 0;
10734 }
10735
10736 static int emit_u64sge(struct r600_shader_ctx *ctx,
10737 int treg,
10738 int src0_sel, int src0_base_chan,
10739 int src1_sel, int src1_base_chan)
10740 {
10741 int r;
10742 /* for 64-bit sge */
10743 /* result = (src0.y > src1.y) || ((src0.y == src1.y) && src0.x >= src1.x)) */
10744 r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
10745 treg, 1,
10746 src0_sel, src0_base_chan + 1,
10747 src1_sel, src1_base_chan + 1);
10748 if (r)
10749 return r;
10750
10751 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
10752 treg, 0,
10753 src0_sel, src0_base_chan,
10754 src1_sel, src1_base_chan);
10755 if (r)
10756 return r;
10757
10758 r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
10759 treg, 2,
10760 src0_sel, src0_base_chan + 1,
10761 src1_sel, src1_base_chan + 1);
10762 if (r)
10763 return r;
10764
10765 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
10766 treg, 0,
10767 treg, 0,
10768 treg, 2);
10769 if (r)
10770 return r;
10771
10772 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
10773 treg, 0,
10774 treg, 0,
10775 treg, 1);
10776 if (r)
10777 return r;
10778 return 0;
10779 }
10780
/* 64-bit unsigned division via classic bitwise long division.
 * This isn't a complete div - the divisor must be an immediate literal
 * whose high 32 bits are zero and the destination writemask must be
 * exactly .xy - but that's just enough for the query-buffer-object
 * (qbo) shaders to work.  Returns -1 if the restrictions aren't met
 * (caller must not emit), 0 on success, or an emit error.
 */
static int egcm_u64div(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
	int r, i;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;

	/* make sure we are dividing by a literal constant with 0 in the high bits */
	if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
		return -1;
	if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
		return -1;
	/* make sure we are doing one division (dst writes exactly .xy) */
	if (inst->Dst[0].Register.WriteMask != 0x3)
		return -1;

	/* emit_if uses ctx->temp_reg so we can't */
	int treg = r600_get_temp(ctx);
	int tmp_num = r600_get_temp(ctx);
	int sub_tmp = r600_get_temp(ctx);

	/* tmp_num.xy is the shrinking remainder, tmp_num.zw the quotient */
	r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
	r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
	r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
	/* denom hi is fetched but unused - it is known to be 0 (checked above) */
	r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);

	/* MOV tmp_num.xy, numerator */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 0,
			   alu_num_lo.sel, alu_num_lo.chan,
			   0, 0);
	if (r)
		return r;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   0, 0);
	if (r)
		return r;

	/* zero the quotient: tmp_num.zw = 0 */
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 0,
			   0, 0);
	if (r)
		return r;

	/* treg.x is log2_denom */
	/* normally this gets the MSB for the denom high value
	   - however we know this will always be 0 here. */
	r = single_alu_op2(ctx,
			   ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, 32,
			   0, 0);
	if (r)
		return r;

	/* normally check denom hi for 0, but we know it is already */
	/* treg.y = (num_hi >= denom_lo): only then does the high-word
	   reduction loop below have any work to do */
	r = single_alu_op2(ctx,
			   ALU_OP2_SETGE_UINT,
			   treg, 1,
			   alu_num_hi.sel, alu_num_hi.chan,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	/* for loops in here */
	/* get msb treg.x = msb(src[1].x) first - computed on the CPU since
	   the denominator is a literal */
	int msb_lo = util_last_bit(alu_denom_lo.value);
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 0,
			   V_SQ_ALU_SRC_LITERAL, msb_lo,
			   0, 0);
	if (r)
		return r;

	/* unroll the asm here: reduce the remainder high word, setting the
	   quotient high bits (tmp_num.w) */
	for (i = 0; i < 31; i++) {
		/* treg.z = (i >= log2_denom): skip shifts that would drop
		   significant denominator bits */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, i,
				   treg, 0);
		if (r)
			return r;

		/* we can do this on the CPU */
		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
		/* treg.y = (tmp_num.y >= denom_lo_shl) */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		/* treg.y &= treg.z: subtract only when both conditions hold */
		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   treg, 2);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;

		/* remainder hi -= shifted denom; set quotient bit */
		r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
				   tmp_num, 1,
				   tmp_num, 1,
				   V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 3,
				   tmp_num, 3,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 31, so manually peel the last loop
	 * iteration (shift by 0 - no guard on treg.z needed).
	 */
	r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
			   treg, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = treg;
	alu_src.chan = 1;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
			   tmp_num, 1,
			   tmp_num, 1,
			   V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 3,
			   tmp_num, 3,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* onto the second loop to unroll: full 64-bit subtract steps that
	   build the quotient low word (tmp_num.z) */
	for (i = 0; i < 31; i++) {
		/* treg.y = (63 - (31 - i)) >= log2_denom */
		r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
				   treg, 1,
				   V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
				   treg, 0);
		if (r)
			return r;

		/* treg.zw = denom << (31 - i), split into lo/hi words on the CPU */
		uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 2,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg, 3,
				   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
				   0, 0);
		if (r)
			return r;

		/* sub_tmp.x = (remainder >= shifted denom), 64-bit compare */
		r = emit_u64sge(ctx, sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP2_AND_INT,
				   treg, 1,
				   treg, 1,
				   sub_tmp, 0);
		if (r)
			return r;

		memset(&alu_src, 0, sizeof(alu_src));
		alu_src.sel = treg;
		alu_src.chan = 1;
		r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
		if (r)
			return r;


		/* remainder -= shifted denom (64-bit subtract into sub_tmp) */
		r = emit_u64add(ctx, ALU_OP2_SUB_INT,
				sub_tmp,
				tmp_num, 0,
				treg, 2);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 0,
				   sub_tmp, 0,
				   0, 0);
		if (r)
			return r;

		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   tmp_num, 1,
				   sub_tmp, 1,
				   0, 0);
		if (r)
			return r;

		/* set the corresponding quotient-low bit */
		r = single_alu_op2(ctx, ALU_OP2_OR_INT,
				   tmp_num, 2,
				   tmp_num, 2,
				   V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
		if (r)
			return r;

		r = tgsi_endif(ctx);
		if (r)
			return r;
	}

	/* log2_denom is always <= 63, so manually peel the last loop
	 * iteration.
	 */
	uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 2,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
			   0, 0);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP1_MOV,
			   treg, 3,
			   V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
			   0, 0);
	if (r)
		return r;

	r = emit_u64sge(ctx, sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	memset(&alu_src, 0, sizeof(alu_src));
	alu_src.sel = sub_tmp;
	alu_src.chan = 0;
	r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
	if (r)
		return r;

	r = emit_u64add(ctx, ALU_OP2_SUB_INT,
			sub_tmp,
			tmp_num, 0,
			treg, 2);
	if (r)
		return r;

	r = single_alu_op2(ctx, ALU_OP2_OR_INT,
			   tmp_num, 2,
			   tmp_num, 2,
			   V_SQ_ALU_SRC_LITERAL, 1U);
	if (r)
		return r;
	r = tgsi_endif(ctx);
	if (r)
		return r;

	/* dst.x = quotient low word (tmp_num.z) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 2;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = quotient high word (tmp_num.w) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.src[0].sel = tmp_num;
	alu.src[0].chan = 3;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
11111
11112 static int egcm_u64sne(struct r600_shader_ctx *ctx)
11113 {
11114 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11115 struct r600_bytecode_alu alu;
11116 int r;
11117 int treg = ctx->temp_reg;
11118
11119 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11120 alu.op = ALU_OP2_SETNE_INT;
11121 alu.dst.sel = treg;
11122 alu.dst.chan = 0;
11123 alu.dst.write = 1;
11124 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11125 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11126 r = r600_bytecode_add_alu(ctx->bc, &alu);
11127 if (r)
11128 return r;
11129
11130 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11131 alu.op = ALU_OP2_SETNE_INT;
11132 alu.dst.sel = treg;
11133 alu.dst.chan = 1;
11134 alu.dst.write = 1;
11135 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11136 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11137 alu.last = 1;
11138 r = r600_bytecode_add_alu(ctx->bc, &alu);
11139 if (r)
11140 return r;
11141
11142 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11143 alu.op = ALU_OP2_OR_INT;
11144 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11145 alu.src[0].sel = treg;
11146 alu.src[0].chan = 0;
11147 alu.src[1].sel = treg;
11148 alu.src[1].chan = 1;
11149 alu.last = 1;
11150 r = r600_bytecode_add_alu(ctx->bc, &alu);
11151 if (r)
11152 return r;
11153 return 0;
11154 }
11155
/* TGSI -> r600 dispatch table for the original R600/R700 families.
 * Indexed by TGSI opcode; each entry pairs a hardware ALU/fetch/CF
 * opcode with the emit callback that lowers the instruction.  Bare
 * numeric indices ([21], [46], ...) are opcode slots removed from
 * TGSI; they are kept as explicit unsupported placeholders so the
 * designated-initializer table has no silent gaps.
 */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]	= { ALU_OP0_NOP, tgsi_unsupported},
	[23]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]	= { ALU_OP0_NOP, tgsi_unsupported},
	[32]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_unsupported},
	[34]	= { ALU_OP0_NOP, tgsi_unsupported},
	[35]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]	= { ALU_OP0_NOP, tgsi_unsupported},
	[60]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]	= { ALU_OP0_NOP, tgsi_unsupported},
	[82]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]	= { ALU_OP0_NOP, tgsi_unsupported},
	[115]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]	= { ALU_OP0_NOP, tgsi_unsupported},
	[164]	= { ALU_OP0_NOP, tgsi_unsupported},
	[165]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
11356
11357 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
11358 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
11359 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
11360 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
11361 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
11362 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
11363 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
11364 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
11365 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
11366 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
11367 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11368 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11369 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
11370 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
11371 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
11372 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
11373 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
11374 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
11375 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
11376 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
11377 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
11378 [21] = { ALU_OP0_NOP, tgsi_unsupported},
11379 [22] = { ALU_OP0_NOP, tgsi_unsupported},
11380 [23] = { ALU_OP0_NOP, tgsi_unsupported},
11381 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
11382 [25] = { ALU_OP0_NOP, tgsi_unsupported},
11383 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
11384 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
11385 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
11386 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
11387 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
11388 [31] = { ALU_OP0_NOP, tgsi_unsupported},
11389 [32] = { ALU_OP0_NOP, tgsi_unsupported},
11390 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
11391 [34] = { ALU_OP0_NOP, tgsi_unsupported},
11392 [35] = { ALU_OP0_NOP, tgsi_unsupported},
11393 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
11394 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11395 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11396 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
11397 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
11398 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
11399 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
11400 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11401 [44] = { ALU_OP0_NOP, tgsi_unsupported},
11402 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
11403 [46] = { ALU_OP0_NOP, tgsi_unsupported},
11404 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
11405 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
11406 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
11407 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
11408 [51] = { ALU_OP0_NOP, tgsi_unsupported},
11409 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
11410 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
11411 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
11412 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
11413 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
11414 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
11415 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11416 [59] = { ALU_OP0_NOP, tgsi_unsupported},
11417 [60] = { ALU_OP0_NOP, tgsi_unsupported},
11418 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
11419 [62] = { ALU_OP0_NOP, tgsi_unsupported},
11420 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
11421 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
11422 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
11423 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
11424 [67] = { ALU_OP0_NOP, tgsi_unsupported},
11425 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11426 [69] = { ALU_OP0_NOP, tgsi_unsupported},
11427 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
11428 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11429 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11430 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
11431 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
11432 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
11433 [76] = { ALU_OP0_NOP, tgsi_unsupported},
11434 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
11435 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
11436 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11437 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11438 [82] = { ALU_OP0_NOP, tgsi_unsupported},
11439 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
11440 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
11441 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
11442 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
11443 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
11444 [88] = { ALU_OP0_NOP, tgsi_unsupported},
11445 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
11446 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
11447 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
11448 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
11449 [93] = { ALU_OP0_NOP, tgsi_unsupported},
11450 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
11451 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11452 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
11453 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
11454 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
11455 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
11456 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11457 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
11458 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11459 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11460 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
11461 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
11462 [106] = { ALU_OP0_NOP, tgsi_unsupported},
11463 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
11464 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
11465 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
11466 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
11467 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
11468 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
11469 [113] = { ALU_OP0_NOP, tgsi_unsupported},
11470 [114] = { ALU_OP0_NOP, tgsi_unsupported},
11471 [115] = { ALU_OP0_NOP, tgsi_unsupported},
11472 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
11473 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
11474 /* Refer below for TGSI_OPCODE_DFMA */
11475 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
11476 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
11477 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
11478 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
11479 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
11480 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
11481 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
11482 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
11483 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
11484 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
11485 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
11486 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
11487 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
11488 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
11489 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
11490 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
11491 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
11492 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
11493 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
11494 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
11495 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
11496 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
11497 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
11498 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
11499 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
11500 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
11501 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
11502 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
11503 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
11504 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
11505 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
11506 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
11507 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
11508 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
11509 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
11510 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
11511 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
11512 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
11513 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
11514 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
11515 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
11516 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
11517 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
11518 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
11519 [163] = { ALU_OP0_NOP, tgsi_unsupported},
11520 [164] = { ALU_OP0_NOP, tgsi_unsupported},
11521 [165] = { ALU_OP0_NOP, tgsi_unsupported},
11522 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
11523 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
11524 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
11525 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
11526 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
11527 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
11528 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
11529 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
11530 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
11531 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
11532 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
11533 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
11534 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11535 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11536 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
11537 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
11538 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
11539 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
11540 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
11541 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
11542 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
11543 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
11544 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
11545 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
11546 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
11547 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
11548 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
11549 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
11550 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
11551 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
11552 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
11553 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
11554 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
11555 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
11556 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
11557 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
11558 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
11559 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
11560 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
11561 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
11562 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
11563 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
11564 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
11565 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
11566 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
11567 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
11568 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
11569 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
11570 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
11571 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
11572 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
11573 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
11574 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
11575 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
11576 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
11577 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
11578 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
11579 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
11580 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
11581 };
11582
/*
 * TGSI -> r600 opcode dispatch table for Cayman (cm) GPUs.
 *
 * Indexed by TGSI_OPCODE_*; each entry pairs the hardware opcode
 * (ALU_OP* / FETCH_OP* / CF_OP* / V_RAT_INST_*) with the callback that
 * emits code for it.  Entries using cayman_* callbacks differ from the
 * eg table: per the CAYMAN notes at the top of this file, the 8xx
 * t-slot-only ops (transcendentals, integer multiplies, doubles) are
 * implemented across all vector slots on Cayman and need looped emission.
 *
 * NOTE(review): each line below starts with a stray integer (e.g.
 * "11583") that looks like a line number leaked in from the code viewer
 * this chunk was extracted from, not part of the original source.
 */
11583 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
11584 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
11585 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
11586 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
/* Transcendentals use cayman_emit_float_instr (vector-slot replication). */
11587 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
11588 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
11589 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
11590 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
11591 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
11592 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
11593 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11594 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11595 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
11596 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
11597 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
11598 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
11599 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
11600 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
11601 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
11602 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
11603 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
/*
 * NOTE(review): bare numeric indices ([21], [22], ...) appear to be
 * retired/unassigned TGSI opcode slots; all map to tgsi_unsupported --
 * confirm against p_shader_tokens.h.
 */
11604 [21] = { ALU_OP0_NOP, tgsi_unsupported},
11605 [22] = { ALU_OP0_NOP, tgsi_unsupported},
11606 [23] = { ALU_OP0_NOP, tgsi_unsupported},
11607 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
11608 [25] = { ALU_OP0_NOP, tgsi_unsupported},
11609 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
11610 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
11611 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
11612 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
11613 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
11614 [31] = { ALU_OP0_NOP, tgsi_unsupported},
11615 [32] = { ALU_OP0_NOP, tgsi_unsupported},
11616 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
11617 [34] = { ALU_OP0_NOP, tgsi_unsupported},
11618 [35] = { ALU_OP0_NOP, tgsi_unsupported},
/* SIN/COS need Cayman-specific trig emission (argument reduction loop). */
11619 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
11620 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11621 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11622 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
11623 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
11624 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
11625 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
11626 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11627 [44] = { ALU_OP0_NOP, tgsi_unsupported},
11628 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
11629 [46] = { ALU_OP0_NOP, tgsi_unsupported},
11630 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
11631 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
/* SLE/SLT have no direct hw op: emit the inverse compare with operands swapped. */
11632 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
11633 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
11634 [51] = { ALU_OP0_NOP, tgsi_unsupported},
11635 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
11636 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
11637 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
11638 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
11639 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
11640 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
11641 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
11642 [59] = { ALU_OP0_NOP, tgsi_unsupported},
11643 [60] = { ALU_OP0_NOP, tgsi_unsupported},
11644 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
11645 [62] = { ALU_OP0_NOP, tgsi_unsupported},
/* Subroutine call/return are not implemented by this backend. */
11646 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
11647 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
11648 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
11649 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
11650 [67] = { ALU_OP0_NOP, tgsi_unsupported},
11651 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11652 [69] = { ALU_OP0_NOP, tgsi_unsupported},
11653 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
11654 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
11655 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
/* Structured control flow: loops, conditionals, break/continue. */
11656 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
11657 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
11658 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
11659 [76] = { ALU_OP0_NOP, tgsi_unsupported},
11660 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
11661 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
11662 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
11663 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
11664 [82] = { ALU_OP0_NOP, tgsi_unsupported},
11665 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
11666 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
11667 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
11668 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
11669 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
11670 [88] = { ALU_OP0_NOP, tgsi_unsupported},
11671 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
11672 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
11673 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
11674 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
11675 [93] = { ALU_OP0_NOP, tgsi_unsupported},
11676 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
11677 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11678 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
/* Geometry shader vertex/primitive emission. */
11679 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
11680 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
11681 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
11682 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
11683 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
11684 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
/* NOTE(review): [103] maps to resinfo+tgsi_tex -- presumably a legacy
 * TXQ-style opcode slot; verify against p_shader_tokens.h. */
11685 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
11686 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
11687 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
11688 [106] = { ALU_OP0_NOP, tgsi_unsupported},
11689 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
/* DX10-style float compares produce 0 / 0xffffffff results. */
11690 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
11691 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
11692 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
11693 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
11694 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
11695 [113] = { ALU_OP0_NOP, tgsi_unsupported},
11696 [114] = { ALU_OP0_NOP, tgsi_unsupported},
11697 [115] = { ALU_OP0_NOP, tgsi_unsupported},
11698 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
11699 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
11700 /* Refer below for TGSI_OPCODE_DFMA */
11701 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
11702 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
11703 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
11704 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
11705 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
11706 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
11707 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
11708 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
11709 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
11710 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
11711 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
11712 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
11713 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
11714 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
11715 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
11716 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
/* Integer multiplies use the Cayman looped-emission path. */
11717 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
11718 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
11719 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
11720 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
11721 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
11722 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
11723 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
11724 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
11725 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
11726 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
/* DX10/SM4-style SAMPLE_* opcodes are not wired up in this backend. */
11727 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
11728 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
11729 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
11730 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
11731 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
11732 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
11733 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
11734 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
11735 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
11736 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
11737 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
11738 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
11739 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
11740 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
11741 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
11742 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
11743 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
11744 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
11745 [163] = { ALU_OP0_NOP, tgsi_unsupported},
11746 [164] = { ALU_OP0_NOP, tgsi_unsupported},
11747 [165] = { ALU_OP0_NOP, tgsi_unsupported},
11748 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
/* Image/buffer atomics map to RAT atomic instructions (V_RAT_INST_*). */
11749 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
11750 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
11751 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
11752 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
11753 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
11754 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
11755 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
11756 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
11757 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
11758 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
11759 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
11760 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
11761 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
11762 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
11763 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
11764 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
11765 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
11766 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
11767 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
11768 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
11769 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
11770 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
11771 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
11772 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
11773 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
11774 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
11775 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
11776 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
/* Double-precision (FP64) section; pair-register ops via *_64 opcodes. */
11777 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
11778 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
11779 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
11780 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
11781 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
11782 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
11783 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
11784 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
11785 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
11786 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
11787 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
11788 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
11789 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
11790 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
11791 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
11792 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
11793 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
11794 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
11795 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
11796 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
11797 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
11798 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
11799 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
11800 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
11801 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
/* 64-bit unsigned integer ops, lowered in the egcm_u64* helpers. */
11802 [TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
11803 [TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
11804 [TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
11805 [TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
/* TGSI_OPCODE_LAST sized entry guarantees the array covers every opcode. */
11806 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
11807 };